// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fred.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>

#include <trace/events/ipi.h>

#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"
#include "vmx_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, 0444);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, 0444);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, 0444);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, 0444);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);

module_param(enable_apicv, bool, 0444);

bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
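 *
 * A minimal usage sketch, assuming the module is loaded as kvm_intel: the
 * parameter is read-only at runtime (0444), so it must be set at load time,
 * e.g. "modprobe kvm_intel nested=0", or as "kvm-intel.nested=0" on the
 * kernel command line.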
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these, x2APIC, PT and LBR MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_FLUSH_CMD,
	MSR_IA32_TSC,
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
	MSR_IA32_XFD,
	MSR_IA32_XFD_ERR,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

/*
 * These two parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to tests, this time is usually smaller than
 *             128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window.
*/ 210 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 211 module_param(ple_window_shrink, uint, 0444); 212 213 /* Default is to compute the maximum so we can never overflow. */ 214 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 215 module_param(ple_window_max, uint, 0444); 216 217 /* Default is SYSTEM mode, 1 for host-guest mode */ 218 int __read_mostly pt_mode = PT_MODE_SYSTEM; 219 module_param(pt_mode, int, S_IRUGO); 220 221 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 222 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 223 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 224 225 /* Storage for pre module init parameter parsing */ 226 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 227 228 static const struct { 229 const char *option; 230 bool for_parse; 231 } vmentry_l1d_param[] = { 232 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 233 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 234 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 235 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 236 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 237 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 238 }; 239 240 #define L1D_CACHE_ORDER 4 241 static void *vmx_l1d_flush_pages; 242 243 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 244 { 245 struct page *page; 246 unsigned int i; 247 248 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 249 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 250 return 0; 251 } 252 253 if (!enable_ept) { 254 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 255 return 0; 256 } 257 258 if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 259 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 260 return 0; 261 } 262 263 /* If set to auto use the default l1tf mitigation method */ 264 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 265 switch (l1tf_mitigation) { 266 case L1TF_MITIGATION_OFF: 267 l1tf = VMENTER_L1D_FLUSH_NEVER; 268 break; 269 case L1TF_MITIGATION_FLUSH_NOWARN: 270 case L1TF_MITIGATION_FLUSH: 271 case L1TF_MITIGATION_FLUSH_NOSMT: 272 l1tf = VMENTER_L1D_FLUSH_COND; 273 break; 274 case L1TF_MITIGATION_FULL: 275 case L1TF_MITIGATION_FULL_FORCE: 276 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 277 break; 278 } 279 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 280 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 281 } 282 283 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 284 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 285 /* 286 * This allocation for vmx_l1d_flush_pages is not tied to a VM 287 * lifetime and so should not be charged to a memcg. 288 */ 289 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 290 if (!page) 291 return -ENOMEM; 292 vmx_l1d_flush_pages = page_address(page); 293 294 /* 295 * Initialize each page with a different pattern in 296 * order to protect against KSM in the nested 297 * virtualization case. 
298 */ 299 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 300 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 301 PAGE_SIZE); 302 } 303 } 304 305 l1tf_vmx_mitigation = l1tf; 306 307 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 308 static_branch_enable(&vmx_l1d_should_flush); 309 else 310 static_branch_disable(&vmx_l1d_should_flush); 311 312 if (l1tf == VMENTER_L1D_FLUSH_COND) 313 static_branch_enable(&vmx_l1d_flush_cond); 314 else 315 static_branch_disable(&vmx_l1d_flush_cond); 316 return 0; 317 } 318 319 static int vmentry_l1d_flush_parse(const char *s) 320 { 321 unsigned int i; 322 323 if (s) { 324 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 325 if (vmentry_l1d_param[i].for_parse && 326 sysfs_streq(s, vmentry_l1d_param[i].option)) 327 return i; 328 } 329 } 330 return -EINVAL; 331 } 332 333 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 334 { 335 int l1tf, ret; 336 337 l1tf = vmentry_l1d_flush_parse(s); 338 if (l1tf < 0) 339 return l1tf; 340 341 if (!boot_cpu_has(X86_BUG_L1TF)) 342 return 0; 343 344 /* 345 * Has vmx_init() run already? If not then this is the pre init 346 * parameter parsing. In that case just store the value and let 347 * vmx_init() do the proper setup after enable_ept has been 348 * established. 349 */ 350 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 351 vmentry_l1d_flush_param = l1tf; 352 return 0; 353 } 354 355 mutex_lock(&vmx_l1d_flush_mutex); 356 ret = vmx_setup_l1d_flush(l1tf); 357 mutex_unlock(&vmx_l1d_flush_mutex); 358 return ret; 359 } 360 361 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 362 { 363 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 364 return sysfs_emit(s, "???\n"); 365 366 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 367 } 368 369 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 370 { 371 u64 msr; 372 373 if (!vmx->disable_fb_clear) 374 return; 375 376 msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); 377 msr |= FB_CLEAR_DIS; 378 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); 379 /* Cache the MSR value to avoid reading it later */ 380 vmx->msr_ia32_mcu_opt_ctrl = msr; 381 } 382 383 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 384 { 385 if (!vmx->disable_fb_clear) 386 return; 387 388 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 389 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 390 } 391 392 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 393 { 394 /* 395 * Disable VERW's behavior of clearing CPU buffers for the guest if the 396 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 397 * the mitigation. Disabling the clearing behavior provides a 398 * performance boost for guests that aren't aware that manually clearing 399 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 400 * and VM-Exit. 401 */ 402 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 403 (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 404 !boot_cpu_has_bug(X86_BUG_MDS) && 405 !boot_cpu_has_bug(X86_BUG_TAA); 406 407 /* 408 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 409 * at VMEntry. Skip the MSR read/write when a guest has no use case to 410 * execute VERW. 
411 */ 412 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 413 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 414 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 415 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 416 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 417 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 418 vmx->disable_fb_clear = false; 419 } 420 421 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 422 .set = vmentry_l1d_flush_set, 423 .get = vmentry_l1d_flush_get, 424 }; 425 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 426 427 static u32 vmx_segment_access_rights(struct kvm_segment *var); 428 429 void vmx_vmexit(void); 430 431 #define vmx_insn_failed(fmt...) \ 432 do { \ 433 WARN_ONCE(1, fmt); \ 434 pr_warn_ratelimited(fmt); \ 435 } while (0) 436 437 noinline void vmread_error(unsigned long field) 438 { 439 vmx_insn_failed("vmread failed: field=%lx\n", field); 440 } 441 442 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 443 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 444 { 445 if (fault) { 446 kvm_spurious_fault(); 447 } else { 448 instrumentation_begin(); 449 vmread_error(field); 450 instrumentation_end(); 451 } 452 } 453 #endif 454 455 noinline void vmwrite_error(unsigned long field, unsigned long value) 456 { 457 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 458 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 459 } 460 461 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 462 { 463 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 464 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 465 } 466 467 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 468 { 469 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 470 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 471 } 472 473 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 474 { 475 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 476 ext, vpid, gva); 477 } 478 479 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) 480 { 481 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", 482 ext, eptp, gpa); 483 } 484 485 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 486 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 487 /* 488 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 489 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
490 */ 491 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 492 493 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 494 static DEFINE_SPINLOCK(vmx_vpid_lock); 495 496 struct vmcs_config vmcs_config __ro_after_init; 497 struct vmx_capability vmx_capability __ro_after_init; 498 499 #define VMX_SEGMENT_FIELD(seg) \ 500 [VCPU_SREG_##seg] = { \ 501 .selector = GUEST_##seg##_SELECTOR, \ 502 .base = GUEST_##seg##_BASE, \ 503 .limit = GUEST_##seg##_LIMIT, \ 504 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 505 } 506 507 static const struct kvm_vmx_segment_field { 508 unsigned selector; 509 unsigned base; 510 unsigned limit; 511 unsigned ar_bytes; 512 } kvm_vmx_segment_fields[] = { 513 VMX_SEGMENT_FIELD(CS), 514 VMX_SEGMENT_FIELD(DS), 515 VMX_SEGMENT_FIELD(ES), 516 VMX_SEGMENT_FIELD(FS), 517 VMX_SEGMENT_FIELD(GS), 518 VMX_SEGMENT_FIELD(SS), 519 VMX_SEGMENT_FIELD(TR), 520 VMX_SEGMENT_FIELD(LDTR), 521 }; 522 523 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 524 { 525 vmx->segment_cache.bitmask = 0; 526 } 527 528 static unsigned long host_idt_base; 529 530 #if IS_ENABLED(CONFIG_HYPERV) 531 static struct kvm_x86_ops vmx_x86_ops __initdata; 532 533 static bool __read_mostly enlightened_vmcs = true; 534 module_param(enlightened_vmcs, bool, 0444); 535 536 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 537 { 538 struct hv_enlightened_vmcs *evmcs; 539 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 540 541 if (partition_assist_page == INVALID_PAGE) 542 return -ENOMEM; 543 544 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 545 546 evmcs->partition_assist_page = partition_assist_page; 547 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 548 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 549 550 return 0; 551 } 552 553 static __init void hv_init_evmcs(void) 554 { 555 int cpu; 556 557 if (!enlightened_vmcs) 558 return; 559 560 /* 561 * Enlightened VMCS usage should be recommended and the host needs 562 * to support eVMCS v1 or above. 563 */ 564 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 565 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 566 KVM_EVMCS_VERSION) { 567 568 /* Check that we have assist pages on all online CPUs */ 569 for_each_online_cpu(cpu) { 570 if (!hv_get_vp_assist_page(cpu)) { 571 enlightened_vmcs = false; 572 break; 573 } 574 } 575 576 if (enlightened_vmcs) { 577 pr_info("Using Hyper-V Enlightened VMCS\n"); 578 static_branch_enable(&__kvm_is_using_evmcs); 579 } 580 581 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 582 vmx_x86_ops.enable_l2_tlb_flush 583 = hv_enable_l2_tlb_flush; 584 585 } else { 586 enlightened_vmcs = false; 587 } 588 } 589 590 static void hv_reset_evmcs(void) 591 { 592 struct hv_vp_assist_page *vp_ap; 593 594 if (!kvm_is_using_evmcs()) 595 return; 596 597 /* 598 * KVM should enable eVMCS if and only if all CPUs have a VP assist 599 * page, and should reject CPU onlining if eVMCS is enabled the CPU 600 * doesn't have a VP assist page allocated. 601 */ 602 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 603 if (WARN_ON_ONCE(!vp_ap)) 604 return; 605 606 /* 607 * Reset everything to support using non-enlightened VMCS access later 608 * (e.g. 
when we reload the module with enlightened_vmcs=0) 609 */ 610 vp_ap->nested_control.features.directhypercall = 0; 611 vp_ap->current_nested_vmcs = 0; 612 vp_ap->enlighten_vmentry = 0; 613 } 614 615 #else /* IS_ENABLED(CONFIG_HYPERV) */ 616 static void hv_init_evmcs(void) {} 617 static void hv_reset_evmcs(void) {} 618 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 619 620 /* 621 * Comment's format: document - errata name - stepping - processor name. 622 * Refer from 623 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 624 */ 625 static u32 vmx_preemption_cpu_tfms[] = { 626 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 627 0x000206E6, 628 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 629 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 630 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 631 0x00020652, 632 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 633 0x00020655, 634 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 635 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 636 /* 637 * 320767.pdf - AAP86 - B1 - 638 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 639 */ 640 0x000106E5, 641 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 642 0x000106A0, 643 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 644 0x000106A1, 645 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 646 0x000106A4, 647 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 648 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 649 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 650 0x000106A5, 651 /* Xeon E3-1220 V2 */ 652 0x000306A8, 653 }; 654 655 static inline bool cpu_has_broken_vmx_preemption_timer(void) 656 { 657 u32 eax = cpuid_eax(0x00000001), i; 658 659 /* Clear the reserved bits */ 660 eax &= ~(0x3U << 14 | 0xfU << 28); 661 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 662 if (eax == vmx_preemption_cpu_tfms[i]) 663 return true; 664 665 return false; 666 } 667 668 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 669 { 670 return flexpriority_enabled && lapic_in_kernel(vcpu); 671 } 672 673 static int vmx_get_passthrough_msr_slot(u32 msr) 674 { 675 int i; 676 677 switch (msr) { 678 case 0x800 ... 0x8ff: 679 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ 680 return -ENOENT; 681 case MSR_IA32_RTIT_STATUS: 682 case MSR_IA32_RTIT_OUTPUT_BASE: 683 case MSR_IA32_RTIT_OUTPUT_MASK: 684 case MSR_IA32_RTIT_CR3_MATCH: 685 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 686 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 687 case MSR_LBR_SELECT: 688 case MSR_LBR_TOS: 689 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 690 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 691 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 692 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 693 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 694 /* LBR MSRs. 
These are handled in vmx_update_intercept_for_lbr_msrs() */ 695 return -ENOENT; 696 } 697 698 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 699 if (vmx_possible_passthrough_msrs[i] == msr) 700 return i; 701 } 702 703 WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 704 return -ENOENT; 705 } 706 707 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 708 { 709 int i; 710 711 i = kvm_find_user_return_msr(msr); 712 if (i >= 0) 713 return &vmx->guest_uret_msrs[i]; 714 return NULL; 715 } 716 717 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 718 struct vmx_uret_msr *msr, u64 data) 719 { 720 unsigned int slot = msr - vmx->guest_uret_msrs; 721 int ret = 0; 722 723 if (msr->load_into_hardware) { 724 preempt_disable(); 725 ret = kvm_set_user_return_msr(slot, data, msr->mask); 726 preempt_enable(); 727 } 728 if (!ret) 729 msr->data = data; 730 return ret; 731 } 732 733 /* 734 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 735 * 736 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 737 * atomically track post-VMXON state, e.g. this may be called in NMI context. 738 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 739 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 740 * magically in RM, VM86, compat mode, or at CPL>0. 741 */ 742 static int kvm_cpu_vmxoff(void) 743 { 744 asm goto("1: vmxoff\n\t" 745 _ASM_EXTABLE(1b, %l[fault]) 746 ::: "cc", "memory" : fault); 747 748 cr4_clear_bits(X86_CR4_VMXE); 749 return 0; 750 751 fault: 752 cr4_clear_bits(X86_CR4_VMXE); 753 return -EIO; 754 } 755 756 static void vmx_emergency_disable(void) 757 { 758 int cpu = raw_smp_processor_id(); 759 struct loaded_vmcs *v; 760 761 kvm_rebooting = true; 762 763 /* 764 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 765 * set in task context. If this races with VMX is disabled by an NMI, 766 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 767 * kvm_rebooting set. 768 */ 769 if (!(__read_cr4() & X86_CR4_VMXE)) 770 return; 771 772 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 773 loaded_vmcss_on_cpu_link) 774 vmcs_clear(v->vmcs); 775 776 kvm_cpu_vmxoff(); 777 } 778 779 static void __loaded_vmcs_clear(void *arg) 780 { 781 struct loaded_vmcs *loaded_vmcs = arg; 782 int cpu = raw_smp_processor_id(); 783 784 if (loaded_vmcs->cpu != cpu) 785 return; /* vcpu migration can race with cpu offline */ 786 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 787 per_cpu(current_vmcs, cpu) = NULL; 788 789 vmcs_clear(loaded_vmcs->vmcs); 790 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 791 vmcs_clear(loaded_vmcs->shadow_vmcs); 792 793 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 794 795 /* 796 * Ensure all writes to loaded_vmcs, including deleting it from its 797 * current percpu list, complete before setting loaded_vmcs->cpu to 798 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 799 * and add loaded_vmcs to its percpu list before it's deleted from this 800 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
801 */ 802 smp_wmb(); 803 804 loaded_vmcs->cpu = -1; 805 loaded_vmcs->launched = 0; 806 } 807 808 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 809 { 810 int cpu = loaded_vmcs->cpu; 811 812 if (cpu != -1) 813 smp_call_function_single(cpu, 814 __loaded_vmcs_clear, loaded_vmcs, 1); 815 } 816 817 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 818 unsigned field) 819 { 820 bool ret; 821 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 822 823 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 824 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 825 vmx->segment_cache.bitmask = 0; 826 } 827 ret = vmx->segment_cache.bitmask & mask; 828 vmx->segment_cache.bitmask |= mask; 829 return ret; 830 } 831 832 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 833 { 834 u16 *p = &vmx->segment_cache.seg[seg].selector; 835 836 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 837 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 838 return *p; 839 } 840 841 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 842 { 843 ulong *p = &vmx->segment_cache.seg[seg].base; 844 845 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 846 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 847 return *p; 848 } 849 850 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 851 { 852 u32 *p = &vmx->segment_cache.seg[seg].limit; 853 854 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 855 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 856 return *p; 857 } 858 859 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 860 { 861 u32 *p = &vmx->segment_cache.seg[seg].ar; 862 863 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 864 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 865 return *p; 866 } 867 868 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 869 { 870 u32 eb; 871 872 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 873 (1u << DB_VECTOR) | (1u << AC_VECTOR); 874 /* 875 * Guest access to VMware backdoor ports could legitimately 876 * trigger #GP because of TSS I/O permission bitmap. 877 * We intercept those #GP and allow access to them anyway 878 * as VMware does. 879 */ 880 if (enable_vmware_backdoor) 881 eb |= (1u << GP_VECTOR); 882 if ((vcpu->guest_debug & 883 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 884 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 885 eb |= 1u << BP_VECTOR; 886 if (to_vmx(vcpu)->rmode.vm86_active) 887 eb = ~0; 888 if (!vmx_need_pf_intercept(vcpu)) 889 eb &= ~(1u << PF_VECTOR); 890 891 /* When we are running a nested L2 guest and L1 specified for it a 892 * certain exception bitmap, we must trap the same exceptions and pass 893 * them to L1. When running L2, we will only handle the exceptions 894 * specified above if L1 did not want them. 895 */ 896 if (is_guest_mode(vcpu)) 897 eb |= get_vmcs12(vcpu)->exception_bitmap; 898 else { 899 int mask = 0, match = 0; 900 901 if (enable_ept && (eb & (1u << PF_VECTOR))) { 902 /* 903 * If EPT is enabled, #PF is currently only intercepted 904 * if MAXPHYADDR is smaller on the guest than on the 905 * host. In that case we only care about present, 906 * non-reserved faults. For vmcs02, however, PFEC_MASK 907 * and PFEC_MATCH are set in prepare_vmcs02_rare. 
908 */ 909 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 910 match = PFERR_PRESENT_MASK; 911 } 912 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 913 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 914 } 915 916 /* 917 * Disabling xfd interception indicates that dynamic xfeatures 918 * might be used in the guest. Always trap #NM in this case 919 * to save guest xfd_err timely. 920 */ 921 if (vcpu->arch.xfd_no_write_intercept) 922 eb |= (1u << NM_VECTOR); 923 924 vmcs_write32(EXCEPTION_BITMAP, eb); 925 } 926 927 /* 928 * Check if MSR is intercepted for currently loaded MSR bitmap. 929 */ 930 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 931 { 932 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 933 return true; 934 935 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 936 } 937 938 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 939 { 940 unsigned int flags = 0; 941 942 if (vmx->loaded_vmcs->launched) 943 flags |= VMX_RUN_VMRESUME; 944 945 /* 946 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 947 * to change it directly without causing a vmexit. In that case read 948 * it after vmexit and store it in vmx->spec_ctrl. 949 */ 950 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 951 flags |= VMX_RUN_SAVE_SPEC_CTRL; 952 953 return flags; 954 } 955 956 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 957 unsigned long entry, unsigned long exit) 958 { 959 vm_entry_controls_clearbit(vmx, entry); 960 vm_exit_controls_clearbit(vmx, exit); 961 } 962 963 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 964 { 965 unsigned int i; 966 967 for (i = 0; i < m->nr; ++i) { 968 if (m->val[i].index == msr) 969 return i; 970 } 971 return -ENOENT; 972 } 973 974 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 975 { 976 int i; 977 struct msr_autoload *m = &vmx->msr_autoload; 978 979 switch (msr) { 980 case MSR_EFER: 981 if (cpu_has_load_ia32_efer()) { 982 clear_atomic_switch_msr_special(vmx, 983 VM_ENTRY_LOAD_IA32_EFER, 984 VM_EXIT_LOAD_IA32_EFER); 985 return; 986 } 987 break; 988 case MSR_CORE_PERF_GLOBAL_CTRL: 989 if (cpu_has_load_perf_global_ctrl()) { 990 clear_atomic_switch_msr_special(vmx, 991 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 992 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 993 return; 994 } 995 break; 996 } 997 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 998 if (i < 0) 999 goto skip_guest; 1000 --m->guest.nr; 1001 m->guest.val[i] = m->guest.val[m->guest.nr]; 1002 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1003 1004 skip_guest: 1005 i = vmx_find_loadstore_msr_slot(&m->host, msr); 1006 if (i < 0) 1007 return; 1008 1009 --m->host.nr; 1010 m->host.val[i] = m->host.val[m->host.nr]; 1011 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1012 } 1013 1014 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1015 unsigned long entry, unsigned long exit, 1016 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1017 u64 guest_val, u64 host_val) 1018 { 1019 vmcs_write64(guest_val_vmcs, guest_val); 1020 if (host_val_vmcs != HOST_IA32_EFER) 1021 vmcs_write64(host_val_vmcs, host_val); 1022 vm_entry_controls_setbit(vmx, entry); 1023 vm_exit_controls_setbit(vmx, exit); 1024 } 1025 1026 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1027 u64 guest_val, u64 host_val, bool entry_only) 1028 { 1029 int i, j = 0; 1030 struct msr_autoload *m = &vmx->msr_autoload; 1031 1032 switch (msr) { 1033 case MSR_EFER: 1034 if 
(cpu_has_load_ia32_efer()) { 1035 add_atomic_switch_msr_special(vmx, 1036 VM_ENTRY_LOAD_IA32_EFER, 1037 VM_EXIT_LOAD_IA32_EFER, 1038 GUEST_IA32_EFER, 1039 HOST_IA32_EFER, 1040 guest_val, host_val); 1041 return; 1042 } 1043 break; 1044 case MSR_CORE_PERF_GLOBAL_CTRL: 1045 if (cpu_has_load_perf_global_ctrl()) { 1046 add_atomic_switch_msr_special(vmx, 1047 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1048 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1049 GUEST_IA32_PERF_GLOBAL_CTRL, 1050 HOST_IA32_PERF_GLOBAL_CTRL, 1051 guest_val, host_val); 1052 return; 1053 } 1054 break; 1055 case MSR_IA32_PEBS_ENABLE: 1056 /* PEBS needs a quiescent period after being disabled (to write 1057 * a record). Disabling PEBS through VMX MSR swapping doesn't 1058 * provide that period, so a CPU could write host's record into 1059 * guest's memory. 1060 */ 1061 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 1062 } 1063 1064 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1065 if (!entry_only) 1066 j = vmx_find_loadstore_msr_slot(&m->host, msr); 1067 1068 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1069 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1070 printk_once(KERN_WARNING "Not enough msr switch entries. " 1071 "Can't add msr %x\n", msr); 1072 return; 1073 } 1074 if (i < 0) { 1075 i = m->guest.nr++; 1076 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1077 } 1078 m->guest.val[i].index = msr; 1079 m->guest.val[i].value = guest_val; 1080 1081 if (entry_only) 1082 return; 1083 1084 if (j < 0) { 1085 j = m->host.nr++; 1086 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1087 } 1088 m->host.val[j].index = msr; 1089 m->host.val[j].value = host_val; 1090 } 1091 1092 static bool update_transition_efer(struct vcpu_vmx *vmx) 1093 { 1094 u64 guest_efer = vmx->vcpu.arch.efer; 1095 u64 ignore_bits = 0; 1096 int i; 1097 1098 /* Shadow paging assumes NX to be available. */ 1099 if (!enable_ept) 1100 guest_efer |= EFER_NX; 1101 1102 /* 1103 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1104 */ 1105 ignore_bits |= EFER_SCE; 1106 #ifdef CONFIG_X86_64 1107 ignore_bits |= EFER_LMA | EFER_LME; 1108 /* SCE is meaningful only in long mode on Intel */ 1109 if (guest_efer & EFER_LMA) 1110 ignore_bits &= ~(u64)EFER_SCE; 1111 #endif 1112 1113 /* 1114 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1115 * On CPUs that support "load IA32_EFER", always switch EFER 1116 * atomically, since it's faster than switching it manually. 1117 */ 1118 if (cpu_has_load_ia32_efer() || 1119 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { 1120 if (!(guest_efer & EFER_LMA)) 1121 guest_efer &= ~EFER_LME; 1122 if (guest_efer != host_efer) 1123 add_atomic_switch_msr(vmx, MSR_EFER, 1124 guest_efer, host_efer, false); 1125 else 1126 clear_atomic_switch_msr(vmx, MSR_EFER); 1127 return false; 1128 } 1129 1130 i = kvm_find_user_return_msr(MSR_EFER); 1131 if (i < 0) 1132 return false; 1133 1134 clear_atomic_switch_msr(vmx, MSR_EFER); 1135 1136 guest_efer &= ~ignore_bits; 1137 guest_efer |= host_efer & ignore_bits; 1138 1139 vmx->guest_uret_msrs[i].data = guest_efer; 1140 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1141 1142 return true; 1143 } 1144 1145 #ifdef CONFIG_X86_32 1146 /* 1147 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1148 * VMCS rather than the segment table. KVM uses this helper to figure 1149 * out the current bases to poke them into the VMCS before entry. 
1150 */ 1151 static unsigned long segment_base(u16 selector) 1152 { 1153 struct desc_struct *table; 1154 unsigned long v; 1155 1156 if (!(selector & ~SEGMENT_RPL_MASK)) 1157 return 0; 1158 1159 table = get_current_gdt_ro(); 1160 1161 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1162 u16 ldt_selector = kvm_read_ldt(); 1163 1164 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1165 return 0; 1166 1167 table = (struct desc_struct *)segment_base(ldt_selector); 1168 } 1169 v = get_desc_base(&table[selector >> 3]); 1170 return v; 1171 } 1172 #endif 1173 1174 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1175 { 1176 return vmx_pt_mode_is_host_guest() && 1177 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1178 } 1179 1180 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1181 { 1182 /* The base must be 128-byte aligned and a legal physical address. */ 1183 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1184 } 1185 1186 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1187 { 1188 u32 i; 1189 1190 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1191 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1192 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1193 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1194 for (i = 0; i < addr_range; i++) { 1195 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1196 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1197 } 1198 } 1199 1200 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1201 { 1202 u32 i; 1203 1204 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1205 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1206 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1207 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1208 for (i = 0; i < addr_range; i++) { 1209 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1210 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1211 } 1212 } 1213 1214 static void pt_guest_enter(struct vcpu_vmx *vmx) 1215 { 1216 if (vmx_pt_mode_is_system()) 1217 return; 1218 1219 /* 1220 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1221 * Save host state before VM entry. 1222 */ 1223 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1224 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1225 wrmsrl(MSR_IA32_RTIT_CTL, 0); 1226 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1227 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1228 } 1229 } 1230 1231 static void pt_guest_exit(struct vcpu_vmx *vmx) 1232 { 1233 if (vmx_pt_mode_is_system()) 1234 return; 1235 1236 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1237 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1238 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1239 } 1240 1241 /* 1242 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1243 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 
1244 */ 1245 if (vmx->pt_desc.host.ctl) 1246 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1247 } 1248 1249 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1250 unsigned long fs_base, unsigned long gs_base) 1251 { 1252 if (unlikely(fs_sel != host->fs_sel)) { 1253 if (!(fs_sel & 7)) 1254 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1255 else 1256 vmcs_write16(HOST_FS_SELECTOR, 0); 1257 host->fs_sel = fs_sel; 1258 } 1259 if (unlikely(gs_sel != host->gs_sel)) { 1260 if (!(gs_sel & 7)) 1261 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1262 else 1263 vmcs_write16(HOST_GS_SELECTOR, 0); 1264 host->gs_sel = gs_sel; 1265 } 1266 if (unlikely(fs_base != host->fs_base)) { 1267 vmcs_writel(HOST_FS_BASE, fs_base); 1268 host->fs_base = fs_base; 1269 } 1270 if (unlikely(gs_base != host->gs_base)) { 1271 vmcs_writel(HOST_GS_BASE, gs_base); 1272 host->gs_base = gs_base; 1273 } 1274 } 1275 1276 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1277 { 1278 struct vcpu_vmx *vmx = to_vmx(vcpu); 1279 struct vmcs_host_state *host_state; 1280 #ifdef CONFIG_X86_64 1281 int cpu = raw_smp_processor_id(); 1282 #endif 1283 unsigned long fs_base, gs_base; 1284 u16 fs_sel, gs_sel; 1285 int i; 1286 1287 /* 1288 * Note that guest MSRs to be saved/restored can also be changed 1289 * when guest state is loaded. This happens when guest transitions 1290 * to/from long-mode by setting MSR_EFER.LMA. 1291 */ 1292 if (!vmx->guest_uret_msrs_loaded) { 1293 vmx->guest_uret_msrs_loaded = true; 1294 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1295 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1296 continue; 1297 1298 kvm_set_user_return_msr(i, 1299 vmx->guest_uret_msrs[i].data, 1300 vmx->guest_uret_msrs[i].mask); 1301 } 1302 } 1303 1304 if (vmx->nested.need_vmcs12_to_shadow_sync) 1305 nested_sync_vmcs12_to_shadow(vcpu); 1306 1307 if (vmx->guest_state_loaded) 1308 return; 1309 1310 host_state = &vmx->loaded_vmcs->host_state; 1311 1312 /* 1313 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1314 * allow segment selectors with cpl > 0 or ti == 1. 
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrl(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
			struct loaded_vmcs *buddy)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list. Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
1423 */ 1424 smp_rmb(); 1425 1426 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1427 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1428 local_irq_enable(); 1429 } 1430 1431 prev = per_cpu(current_vmcs, cpu); 1432 if (prev != vmx->loaded_vmcs->vmcs) { 1433 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1434 vmcs_load(vmx->loaded_vmcs->vmcs); 1435 1436 /* 1437 * No indirect branch prediction barrier needed when switching 1438 * the active VMCS within a vCPU, unless IBRS is advertised to 1439 * the vCPU. To minimize the number of IBPBs executed, KVM 1440 * performs IBPB on nested VM-Exit (a single nested transition 1441 * may switch the active VMCS multiple times). 1442 */ 1443 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) 1444 indirect_branch_prediction_barrier(); 1445 } 1446 1447 if (!already_loaded) { 1448 void *gdt = get_current_gdt_ro(); 1449 1450 /* 1451 * Flush all EPTP/VPID contexts, the new pCPU may have stale 1452 * TLB entries from its previous association with the vCPU. 1453 */ 1454 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1455 1456 /* 1457 * Linux uses per-cpu TSS and GDT, so set these when switching 1458 * processors. See 22.2.4. 1459 */ 1460 vmcs_writel(HOST_TR_BASE, 1461 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1462 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1463 1464 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { 1465 /* 22.2.3 */ 1466 vmcs_writel(HOST_IA32_SYSENTER_ESP, 1467 (unsigned long)(cpu_entry_stack(cpu) + 1)); 1468 } 1469 1470 vmx->loaded_vmcs->cpu = cpu; 1471 } 1472 } 1473 1474 /* 1475 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1476 * vcpu mutex is already taken. 1477 */ 1478 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1479 { 1480 struct vcpu_vmx *vmx = to_vmx(vcpu); 1481 1482 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1483 1484 vmx_vcpu_pi_load(vcpu, cpu); 1485 1486 vmx->host_debugctlmsr = get_debugctlmsr(); 1487 } 1488 1489 static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1490 { 1491 vmx_vcpu_pi_put(vcpu); 1492 1493 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1494 } 1495 1496 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1497 { 1498 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1499 } 1500 1501 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1502 { 1503 struct vcpu_vmx *vmx = to_vmx(vcpu); 1504 unsigned long rflags, save_rflags; 1505 1506 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1507 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1508 rflags = vmcs_readl(GUEST_RFLAGS); 1509 if (vmx->rmode.vm86_active) { 1510 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1511 save_rflags = vmx->rmode.save_rflags; 1512 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1513 } 1514 vmx->rflags = rflags; 1515 } 1516 return vmx->rflags; 1517 } 1518 1519 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1520 { 1521 struct vcpu_vmx *vmx = to_vmx(vcpu); 1522 unsigned long old_rflags; 1523 1524 /* 1525 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1526 * is an unrestricted guest in order to mark L2 as needing emulation 1527 * if L1 runs L2 as a restricted guest. 
	 */
	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = vmx_emulation_required(vcpu);
}

static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
	    ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
		return 1;

	/*
	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
	 * and FabricEn would cause #GP, if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
	    !(data & RTIT_CTL_FABRIC_EN) &&
	    !intel_pt_validate_cap(vmx->pt_desc.caps,
				   PT_CAP_single_range_output))
		return 1;

	/*
	 * Check the MTCFreq, CycThresh and PSBFreq encodings: any MSR write
	 * that uses encodings marked reserved will cause a #GP fault.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
	    !test_bit((data & RTIT_CTL_MTC_RANGE) >>
		      RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
				      PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_CYC_THRESH) >>
		      RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_PSB_FREQ) >>
		      RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * If ADDRx_CFG is reserved or the encoding is greater than 2, the
	 * write will cause a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
		return 1;

	return 0;
}

static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					 void *insn, int insn_len)
{
	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
	 * not point at the failing instruction, and even if it did, the code
	 * stream is inaccessible. Inject #UD instead of exiting to userspace
	 * so that guest userspace can't DoS the guest simply by triggering
	 * emulation (enclaves are CPL3 only).
	 */
	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}
	return X86EMUL_CONTINUE;
}

static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
	unsigned long rip, orig_rip;
	u32 instr_len;

	/*
	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
	 * set when EPT misconfig occurs. In practice, real hardware updates
	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
	 * (namely Hyper-V) don't set it due to it being undefined behavior,
	 * i.e. we end up advancing IP with some random value.
	 */
	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

		/*
		 * Emulating an enclave's instructions isn't supported as KVM
		 * cannot access the enclave's memory or its true RIP, e.g. the
		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
		 * the RIP that actually triggered the VM-Exit. But, because
		 * most instructions that cause VM-Exit will #UD in an enclave,
		 * most instruction-based VM-Exits simply do not occur.
1696 * 1697 * There are a few exceptions, notably the debug instructions 1698 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1699 * and generate #DB/#BP as expected, which KVM might intercept. 1700 * But again, the CPU does the dirty work and saves an instr 1701 * length of zero so VMMs don't shoot themselves in the foot. 1702 * WARN if KVM tries to skip a non-zero length instruction on 1703 * a VM-Exit from an enclave. 1704 */ 1705 if (!instr_len) 1706 goto rip_updated; 1707 1708 WARN_ONCE(exit_reason.enclave_mode, 1709 "skipping instruction after SGX enclave VM-Exit"); 1710 1711 orig_rip = kvm_rip_read(vcpu); 1712 rip = orig_rip + instr_len; 1713 #ifdef CONFIG_X86_64 1714 /* 1715 * We need to mask out the high 32 bits of RIP if not in 64-bit 1716 * mode, but just finding out that we are in 64-bit mode is 1717 * quite expensive. Only do it if there was a carry. 1718 */ 1719 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1720 rip = (u32)rip; 1721 #endif 1722 kvm_rip_write(vcpu, rip); 1723 } else { 1724 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1725 return 0; 1726 } 1727 1728 rip_updated: 1729 /* skipping an emulated instruction also counts */ 1730 vmx_set_interrupt_shadow(vcpu, 0); 1731 1732 return 1; 1733 } 1734 1735 /* 1736 * Recognizes a pending MTF VM-exit and records the nested state for later 1737 * delivery. 1738 */ 1739 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1740 { 1741 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1742 struct vcpu_vmx *vmx = to_vmx(vcpu); 1743 1744 if (!is_guest_mode(vcpu)) 1745 return; 1746 1747 /* 1748 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1749 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1750 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1751 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1752 * as ICEBP is higher priority than both. As instruction emulation is 1753 * completed at this point (i.e. KVM is at the instruction boundary), 1754 * any #DB exception pending delivery must be a debug-trap of lower 1755 * priority than MTF. Record the pending MTF state to be delivered in 1756 * vmx_check_nested_events(). 1757 */ 1758 if (nested_cpu_has_mtf(vmcs12) && 1759 (!vcpu->arch.exception.pending || 1760 vcpu->arch.exception.vector == DB_VECTOR) && 1761 (!vcpu->arch.exception_vmexit.pending || 1762 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1763 vmx->nested.mtf_pending = true; 1764 kvm_make_request(KVM_REQ_EVENT, vcpu); 1765 } else { 1766 vmx->nested.mtf_pending = false; 1767 } 1768 } 1769 1770 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1771 { 1772 vmx_update_emulated_instruction(vcpu); 1773 return skip_emulated_instruction(vcpu); 1774 } 1775 1776 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1777 { 1778 /* 1779 * Ensure that we clear the HLT state in the VMCS. We don't need to 1780 * explicitly skip the instruction because if the HLT state is set, 1781 * then the instruction is already executing and RIP has already been 1782 * advanced. 
	 */
	if (kvm_hlt_in_guest(vcpu->kvm) &&
	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

static void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_deliver_exception_payload(vcpu, ex);

	if (ex->has_error_code) {
		/*
		 * Despite the error code being architecturally defined as 32
		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
		 * VMX don't actually support setting bits 31:16. Hardware
		 * will (should) never provide a bogus error code, but AMD CPUs
		 * do generate error codes with bits 31:16 set, and so KVM's
		 * ABI lets userspace shove in arbitrary 32-bit values. Drop
		 * the upper bits to avoid VM-Fail; losing information that
		 * doesn't really exist is preferable to killing the VM.
		 */
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (kvm_exception_is_soft(ex->vector))
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->emulation_required);

	if (kvm_exception_is_soft(ex->vector)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	vmx_clear_hlt(vcpu);
}

static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
			       bool load_into_hardware)
{
	struct vmx_uret_msr *uret_msr;

	uret_msr = vmx_find_uret_msr(vmx, msr);
	if (!uret_msr)
		return;

	uret_msr->load_into_hardware = load_into_hardware;
}

/*
 * Configure user return MSRs to automatically save, load, and restore MSRs
 * that need to be shoved into hardware when running the guest. Note, omitting
 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
 * loaded into hardware when running the guest.
 */
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
	bool load_syscall_msrs;

	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
			    (vmx->vcpu.arch.efer & EFER_SCE);

	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
			   guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
			   guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));

	/*
	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
	 * kernel and old userspace. If those guests run on a tsx=off host, do
	 * allow guests to use TSX_CTRL, but don't change the value in hardware
	 * so that TSX remains always disabled.
1879 */ 1880 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1881 1882 /* 1883 * The set of MSRs to load may have changed, reload MSRs before the 1884 * next VM-Enter. 1885 */ 1886 vmx->guest_uret_msrs_loaded = false; 1887 } 1888 1889 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1890 { 1891 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1892 1893 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1894 return vmcs12->tsc_offset; 1895 1896 return 0; 1897 } 1898 1899 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1900 { 1901 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1902 1903 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1904 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1905 return vmcs12->tsc_multiplier; 1906 1907 return kvm_caps.default_tsc_scaling_ratio; 1908 } 1909 1910 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1911 { 1912 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1913 } 1914 1915 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1916 { 1917 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1918 } 1919 1920 /* 1921 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1922 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1923 * backwards compatibility even though KVM doesn't support emulating SMX. And 1924 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1925 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1926 */ 1927 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1928 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1929 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1930 FEAT_CTL_SGX_LC_ENABLED | \ 1931 FEAT_CTL_SGX_ENABLED | \ 1932 FEAT_CTL_LMCE_ENABLED) 1933 1934 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1935 struct msr_data *msr) 1936 { 1937 uint64_t valid_bits; 1938 1939 /* 1940 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1941 * exposed to the guest. 1942 */ 1943 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1944 ~KVM_SUPPORTED_FEATURE_CONTROL); 1945 1946 if (!msr->host_initiated && 1947 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1948 return false; 1949 1950 if (msr->host_initiated) 1951 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 1952 else 1953 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 1954 1955 return !(msr->data & ~valid_bits); 1956 } 1957 1958 static int vmx_get_msr_feature(struct kvm_msr_entry *msr) 1959 { 1960 switch (msr->index) { 1961 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 1962 if (!nested) 1963 return 1; 1964 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); 1965 default: 1966 return KVM_MSR_RET_INVALID; 1967 } 1968 } 1969 1970 /* 1971 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 1972 * Returns 0 on success, non-0 otherwise. 1973 * Assumes vcpu_load() was already called. 
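 * Note that many cases below gate guest accesses on CPUID/feature checks but
 * skip those checks when msr_info->host_initiated is set, so userspace can
 * always save and restore the full MSR state regardless of guest CPUID.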
1974 */ 1975 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1976 { 1977 struct vcpu_vmx *vmx = to_vmx(vcpu); 1978 struct vmx_uret_msr *msr; 1979 u32 index; 1980 1981 switch (msr_info->index) { 1982 #ifdef CONFIG_X86_64 1983 case MSR_FS_BASE: 1984 msr_info->data = vmcs_readl(GUEST_FS_BASE); 1985 break; 1986 case MSR_GS_BASE: 1987 msr_info->data = vmcs_readl(GUEST_GS_BASE); 1988 break; 1989 case MSR_KERNEL_GS_BASE: 1990 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 1991 break; 1992 #endif 1993 case MSR_EFER: 1994 return kvm_get_msr_common(vcpu, msr_info); 1995 case MSR_IA32_TSX_CTRL: 1996 if (!msr_info->host_initiated && 1997 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 1998 return 1; 1999 goto find_uret_msr; 2000 case MSR_IA32_UMWAIT_CONTROL: 2001 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2002 return 1; 2003 2004 msr_info->data = vmx->msr_ia32_umwait_control; 2005 break; 2006 case MSR_IA32_SPEC_CTRL: 2007 if (!msr_info->host_initiated && 2008 !guest_has_spec_ctrl_msr(vcpu)) 2009 return 1; 2010 2011 msr_info->data = to_vmx(vcpu)->spec_ctrl; 2012 break; 2013 case MSR_IA32_SYSENTER_CS: 2014 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2015 break; 2016 case MSR_IA32_SYSENTER_EIP: 2017 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2018 break; 2019 case MSR_IA32_SYSENTER_ESP: 2020 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2021 break; 2022 case MSR_IA32_BNDCFGS: 2023 if (!kvm_mpx_supported() || 2024 (!msr_info->host_initiated && 2025 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2026 return 1; 2027 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2028 break; 2029 case MSR_IA32_MCG_EXT_CTL: 2030 if (!msr_info->host_initiated && 2031 !(vmx->msr_ia32_feature_control & 2032 FEAT_CTL_LMCE_ENABLED)) 2033 return 1; 2034 msr_info->data = vcpu->arch.mcg_ext_ctl; 2035 break; 2036 case MSR_IA32_FEAT_CTL: 2037 msr_info->data = vmx->msr_ia32_feature_control; 2038 break; 2039 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2040 if (!msr_info->host_initiated && 2041 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 2042 return 1; 2043 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2044 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2045 break; 2046 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2047 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 2048 return 1; 2049 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2050 &msr_info->data)) 2051 return 1; 2052 #ifdef CONFIG_KVM_HYPERV 2053 /* 2054 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2055 * instead of just ignoring the features, different Hyper-V 2056 * versions are either trying to use them and fail or do some 2057 * sanity checking and refuse to boot. Filter all unsupported 2058 * features out. 
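 * (e.g. controls without a backing eVMCS v1 field, such as posted-interrupt
 * processing, are masked out of the reported control MSRs so an eVMCS-aware
 * L1 never tries to enable them.)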
2059 */ 2060 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) 2061 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2062 &msr_info->data); 2063 #endif 2064 break; 2065 case MSR_IA32_RTIT_CTL: 2066 if (!vmx_pt_mode_is_host_guest()) 2067 return 1; 2068 msr_info->data = vmx->pt_desc.guest.ctl; 2069 break; 2070 case MSR_IA32_RTIT_STATUS: 2071 if (!vmx_pt_mode_is_host_guest()) 2072 return 1; 2073 msr_info->data = vmx->pt_desc.guest.status; 2074 break; 2075 case MSR_IA32_RTIT_CR3_MATCH: 2076 if (!vmx_pt_mode_is_host_guest() || 2077 !intel_pt_validate_cap(vmx->pt_desc.caps, 2078 PT_CAP_cr3_filtering)) 2079 return 1; 2080 msr_info->data = vmx->pt_desc.guest.cr3_match; 2081 break; 2082 case MSR_IA32_RTIT_OUTPUT_BASE: 2083 if (!vmx_pt_mode_is_host_guest() || 2084 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2085 PT_CAP_topa_output) && 2086 !intel_pt_validate_cap(vmx->pt_desc.caps, 2087 PT_CAP_single_range_output))) 2088 return 1; 2089 msr_info->data = vmx->pt_desc.guest.output_base; 2090 break; 2091 case MSR_IA32_RTIT_OUTPUT_MASK: 2092 if (!vmx_pt_mode_is_host_guest() || 2093 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2094 PT_CAP_topa_output) && 2095 !intel_pt_validate_cap(vmx->pt_desc.caps, 2096 PT_CAP_single_range_output))) 2097 return 1; 2098 msr_info->data = vmx->pt_desc.guest.output_mask; 2099 break; 2100 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2101 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2102 if (!vmx_pt_mode_is_host_guest() || 2103 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2104 return 1; 2105 if (index % 2) 2106 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2107 else 2108 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2109 break; 2110 case MSR_IA32_DEBUGCTLMSR: 2111 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2112 break; 2113 default: 2114 find_uret_msr: 2115 msr = vmx_find_uret_msr(vmx, msr_info->index); 2116 if (msr) { 2117 msr_info->data = msr->data; 2118 break; 2119 } 2120 return kvm_get_msr_common(vcpu, msr_info); 2121 } 2122 2123 return 0; 2124 } 2125 2126 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2127 u64 data) 2128 { 2129 #ifdef CONFIG_X86_64 2130 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) 2131 return (u32)data; 2132 #endif 2133 return (unsigned long)data; 2134 } 2135 2136 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2137 { 2138 u64 debugctl = 0; 2139 2140 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2141 (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2142 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2143 2144 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2145 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2146 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2147 2148 return debugctl; 2149 } 2150 2151 /* 2152 * Writes msr value into the appropriate "register". 2153 * Returns 0 on success, non-0 otherwise. 2154 * Assumes vcpu_load() was already called. 
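 * For a guest-initiated WRMSR, a non-zero return is normally converted into
 * an injected #GP(0) by the common emulation code (or deferred to userspace
 * if userspace MSR handling is enabled); host-initiated writes just report
 * the failure back to userspace.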
2155 */ 2156 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2157 { 2158 struct vcpu_vmx *vmx = to_vmx(vcpu); 2159 struct vmx_uret_msr *msr; 2160 int ret = 0; 2161 u32 msr_index = msr_info->index; 2162 u64 data = msr_info->data; 2163 u32 index; 2164 2165 switch (msr_index) { 2166 case MSR_EFER: 2167 ret = kvm_set_msr_common(vcpu, msr_info); 2168 break; 2169 #ifdef CONFIG_X86_64 2170 case MSR_FS_BASE: 2171 vmx_segment_cache_clear(vmx); 2172 vmcs_writel(GUEST_FS_BASE, data); 2173 break; 2174 case MSR_GS_BASE: 2175 vmx_segment_cache_clear(vmx); 2176 vmcs_writel(GUEST_GS_BASE, data); 2177 break; 2178 case MSR_KERNEL_GS_BASE: 2179 vmx_write_guest_kernel_gs_base(vmx, data); 2180 break; 2181 case MSR_IA32_XFD: 2182 ret = kvm_set_msr_common(vcpu, msr_info); 2183 /* 2184 * Always intercepting WRMSR could incur non-negligible 2185 * overhead given xfd might be changed frequently in 2186 * guest context switch. Disable write interception 2187 * upon the first write with a non-zero value (indicating 2188 * potential usage on dynamic xfeatures). Also update 2189 * exception bitmap to trap #NM for proper virtualization 2190 * of guest xfd_err. 2191 */ 2192 if (!ret && data) { 2193 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2194 MSR_TYPE_RW); 2195 vcpu->arch.xfd_no_write_intercept = true; 2196 vmx_update_exception_bitmap(vcpu); 2197 } 2198 break; 2199 #endif 2200 case MSR_IA32_SYSENTER_CS: 2201 if (is_guest_mode(vcpu)) 2202 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2203 vmcs_write32(GUEST_SYSENTER_CS, data); 2204 break; 2205 case MSR_IA32_SYSENTER_EIP: 2206 if (is_guest_mode(vcpu)) { 2207 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2208 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2209 } 2210 vmcs_writel(GUEST_SYSENTER_EIP, data); 2211 break; 2212 case MSR_IA32_SYSENTER_ESP: 2213 if (is_guest_mode(vcpu)) { 2214 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2215 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2216 } 2217 vmcs_writel(GUEST_SYSENTER_ESP, data); 2218 break; 2219 case MSR_IA32_DEBUGCTLMSR: { 2220 u64 invalid; 2221 2222 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2223 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2224 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2225 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2226 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2227 } 2228 2229 if (invalid) 2230 return 1; 2231 2232 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2233 VM_EXIT_SAVE_DEBUG_CONTROLS) 2234 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2235 2236 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2237 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2238 (data & DEBUGCTLMSR_LBR)) 2239 intel_pmu_create_guest_lbr_event(vcpu); 2240 return 0; 2241 } 2242 case MSR_IA32_BNDCFGS: 2243 if (!kvm_mpx_supported() || 2244 (!msr_info->host_initiated && 2245 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2246 return 1; 2247 if (is_noncanonical_address(data & PAGE_MASK, vcpu) || 2248 (data & MSR_IA32_BNDCFGS_RSVD)) 2249 return 1; 2250 2251 if (is_guest_mode(vcpu) && 2252 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2253 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2254 get_vmcs12(vcpu)->guest_bndcfgs = data; 2255 2256 vmcs_write64(GUEST_BNDCFGS, data); 2257 break; 2258 case MSR_IA32_UMWAIT_CONTROL: 2259 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2260 return 1; 2261 2262 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2263 if (data 
& (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2264 return 1; 2265 2266 vmx->msr_ia32_umwait_control = data; 2267 break; 2268 case MSR_IA32_SPEC_CTRL: 2269 if (!msr_info->host_initiated && 2270 !guest_has_spec_ctrl_msr(vcpu)) 2271 return 1; 2272 2273 if (kvm_spec_ctrl_test_value(data)) 2274 return 1; 2275 2276 vmx->spec_ctrl = data; 2277 if (!data) 2278 break; 2279 2280 /* 2281 * For non-nested: 2282 * When it's written (to non-zero) for the first time, pass 2283 * it through. 2284 * 2285 * For nested: 2286 * The handling of the MSR bitmap for L2 guests is done in 2287 * nested_vmx_prepare_msr_bitmap. We should not touch the 2288 * vmcs02.msr_bitmap here since it gets completely overwritten 2289 * in the merging. We update the vmcs01 here for L1 as well 2290 * since it will end up touching the MSR anyway now. 2291 */ 2292 vmx_disable_intercept_for_msr(vcpu, 2293 MSR_IA32_SPEC_CTRL, 2294 MSR_TYPE_RW); 2295 break; 2296 case MSR_IA32_TSX_CTRL: 2297 if (!msr_info->host_initiated && 2298 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2299 return 1; 2300 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2301 return 1; 2302 goto find_uret_msr; 2303 case MSR_IA32_CR_PAT: 2304 ret = kvm_set_msr_common(vcpu, msr_info); 2305 if (ret) 2306 break; 2307 2308 if (is_guest_mode(vcpu) && 2309 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2310 get_vmcs12(vcpu)->guest_ia32_pat = data; 2311 2312 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2313 vmcs_write64(GUEST_IA32_PAT, data); 2314 break; 2315 case MSR_IA32_MCG_EXT_CTL: 2316 if ((!msr_info->host_initiated && 2317 !(to_vmx(vcpu)->msr_ia32_feature_control & 2318 FEAT_CTL_LMCE_ENABLED)) || 2319 (data & ~MCG_EXT_CTL_LMCE_EN)) 2320 return 1; 2321 vcpu->arch.mcg_ext_ctl = data; 2322 break; 2323 case MSR_IA32_FEAT_CTL: 2324 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2325 return 1; 2326 2327 vmx->msr_ia32_feature_control = data; 2328 if (msr_info->host_initiated && data == 0) 2329 vmx_leave_nested(vcpu); 2330 2331 /* SGX may be enabled/disabled by guest's firmware */ 2332 vmx_write_encls_bitmap(vcpu, NULL); 2333 break; 2334 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2335 /* 2336 * On real hardware, the LE hash MSRs are writable before 2337 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2338 * at which point SGX related bits in IA32_FEATURE_CONTROL 2339 * become writable. 2340 * 2341 * KVM does not emulate SGX activation for simplicity, so 2342 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2343 * is unlocked. This is technically not architectural 2344 * behavior, but it's close enough. 2345 */ 2346 if (!msr_info->host_initiated && 2347 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || 2348 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2349 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2350 return 1; 2351 vmx->msr_ia32_sgxlepubkeyhash 2352 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2353 break; 2354 case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: 2355 if (!msr_info->host_initiated) 2356 return 1; /* they are read-only */ 2357 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 2358 return 1; 2359 return vmx_set_vmx_msr(vcpu, msr_index, data); 2360 case MSR_IA32_RTIT_CTL: 2361 if (!vmx_pt_mode_is_host_guest() || 2362 vmx_rtit_ctl_check(vcpu, data) || 2363 vmx->nested.vmxon) 2364 return 1; 2365 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2366 vmx->pt_desc.guest.ctl = data; 2367 pt_update_intercept_for_msr(vcpu); 2368 break; 2369 case MSR_IA32_RTIT_STATUS: 2370 if (!pt_can_write_msr(vmx)) 2371 return 1; 2372 if (data & MSR_IA32_RTIT_STATUS_MASK) 2373 return 1; 2374 vmx->pt_desc.guest.status = data; 2375 break; 2376 case MSR_IA32_RTIT_CR3_MATCH: 2377 if (!pt_can_write_msr(vmx)) 2378 return 1; 2379 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2380 PT_CAP_cr3_filtering)) 2381 return 1; 2382 vmx->pt_desc.guest.cr3_match = data; 2383 break; 2384 case MSR_IA32_RTIT_OUTPUT_BASE: 2385 if (!pt_can_write_msr(vmx)) 2386 return 1; 2387 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2388 PT_CAP_topa_output) && 2389 !intel_pt_validate_cap(vmx->pt_desc.caps, 2390 PT_CAP_single_range_output)) 2391 return 1; 2392 if (!pt_output_base_valid(vcpu, data)) 2393 return 1; 2394 vmx->pt_desc.guest.output_base = data; 2395 break; 2396 case MSR_IA32_RTIT_OUTPUT_MASK: 2397 if (!pt_can_write_msr(vmx)) 2398 return 1; 2399 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2400 PT_CAP_topa_output) && 2401 !intel_pt_validate_cap(vmx->pt_desc.caps, 2402 PT_CAP_single_range_output)) 2403 return 1; 2404 vmx->pt_desc.guest.output_mask = data; 2405 break; 2406 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2407 if (!pt_can_write_msr(vmx)) 2408 return 1; 2409 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2410 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2411 return 1; 2412 if (is_noncanonical_address(data, vcpu)) 2413 return 1; 2414 if (index % 2) 2415 vmx->pt_desc.guest.addr_b[index / 2] = data; 2416 else 2417 vmx->pt_desc.guest.addr_a[index / 2] = data; 2418 break; 2419 case MSR_IA32_PERF_CAPABILITIES: 2420 if (data && !vcpu_to_pmu(vcpu)->version) 2421 return 1; 2422 if (data & PMU_CAP_LBR_FMT) { 2423 if ((data & PMU_CAP_LBR_FMT) != 2424 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2425 return 1; 2426 if (!cpuid_model_is_consistent(vcpu)) 2427 return 1; 2428 } 2429 if (data & PERF_CAP_PEBS_FORMAT) { 2430 if ((data & PERF_CAP_PEBS_MASK) != 2431 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2432 return 1; 2433 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) 2434 return 1; 2435 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) 2436 return 1; 2437 if (!cpuid_model_is_consistent(vcpu)) 2438 return 1; 2439 } 2440 ret = kvm_set_msr_common(vcpu, msr_info); 2441 break; 2442 2443 default: 2444 find_uret_msr: 2445 msr = vmx_find_uret_msr(vmx, msr_index); 2446 if (msr) 2447 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2448 else 2449 ret = kvm_set_msr_common(vcpu, msr_info); 2450 } 2451 2452 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2453 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2454 vmx_update_fb_clear_dis(vcpu, vmx); 2455 2456 return ret; 2457 } 2458 2459 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2460 { 2461 unsigned long guest_owned_bits; 2462 2463 kvm_register_mark_available(vcpu, reg); 2464 2465 switch (reg) { 2466 case VCPU_REGS_RSP: 2467 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2468 break; 2469 case VCPU_REGS_RIP: 2470 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 
2471 break; 2472 case VCPU_EXREG_PDPTR: 2473 if (enable_ept) 2474 ept_save_pdptrs(vcpu); 2475 break; 2476 case VCPU_EXREG_CR0: 2477 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2478 2479 vcpu->arch.cr0 &= ~guest_owned_bits; 2480 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2481 break; 2482 case VCPU_EXREG_CR3: 2483 /* 2484 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2485 * CR3 is loaded into hardware, not the guest's CR3. 2486 */ 2487 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2488 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2489 break; 2490 case VCPU_EXREG_CR4: 2491 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2492 2493 vcpu->arch.cr4 &= ~guest_owned_bits; 2494 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2495 break; 2496 default: 2497 KVM_BUG_ON(1, vcpu->kvm); 2498 break; 2499 } 2500 } 2501 2502 /* 2503 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2504 * directly instead of going through cpu_has(), to ensure KVM is trapping 2505 * ENCLS whenever it's supported in hardware. It does not matter whether 2506 * the host OS supports or has enabled SGX. 2507 */ 2508 static bool cpu_has_sgx(void) 2509 { 2510 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2511 } 2512 2513 /* 2514 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2515 * can't be used due to errata where VM Exit may incorrectly clear 2516 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the 2517 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2518 */ 2519 static bool cpu_has_perf_global_ctrl_bug(void) 2520 { 2521 if (boot_cpu_data.x86 == 0x6) { 2522 switch (boot_cpu_data.x86_model) { 2523 case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ 2524 case INTEL_FAM6_NEHALEM: /* AAP115 */ 2525 case INTEL_FAM6_WESTMERE: /* AAT100 */ 2526 case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2527 case INTEL_FAM6_NEHALEM_EX: /* BA97 */ 2528 return true; 2529 default: 2530 break; 2531 } 2532 } 2533 2534 return false; 2535 } 2536 2537 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2538 { 2539 u32 vmx_msr_low, vmx_msr_high; 2540 u32 ctl = ctl_min | ctl_opt; 2541 2542 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2543 2544 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2545 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2546 2547 /* Ensure minimum (required) set of control bits are supported. */ 2548 if (ctl_min & ~ctl) 2549 return -EIO; 2550 2551 *result = ctl; 2552 return 0; 2553 } 2554 2555 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2556 { 2557 u64 allowed; 2558 2559 rdmsrl(msr, allowed); 2560 2561 return ctl_opt & allowed; 2562 } 2563 2564 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2565 struct vmx_capability *vmx_cap) 2566 { 2567 u32 vmx_msr_low, vmx_msr_high; 2568 u32 _pin_based_exec_control = 0; 2569 u32 _cpu_based_exec_control = 0; 2570 u32 _cpu_based_2nd_exec_control = 0; 2571 u64 _cpu_based_3rd_exec_control = 0; 2572 u32 _vmexit_control = 0; 2573 u32 _vmentry_control = 0; 2574 u64 misc_msr; 2575 int i; 2576 2577 /* 2578 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2579 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2580 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
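 * Each remaining entry pairs an optional VM-Entry control with its VM-Exit
 * counterpart; the pairs loop further down keeps a feature only if both
 * directions are supported, otherwise both bits are cleared.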
2581 */ 2582 struct { 2583 u32 entry_control; 2584 u32 exit_control; 2585 } const vmcs_entry_exit_pairs[] = { 2586 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2587 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2588 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2589 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2590 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2591 }; 2592 2593 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2594 2595 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2596 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2597 MSR_IA32_VMX_PROCBASED_CTLS, 2598 &_cpu_based_exec_control)) 2599 return -EIO; 2600 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2601 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2602 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2603 MSR_IA32_VMX_PROCBASED_CTLS2, 2604 &_cpu_based_2nd_exec_control)) 2605 return -EIO; 2606 } 2607 #ifndef CONFIG_X86_64 2608 if (!(_cpu_based_2nd_exec_control & 2609 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2610 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2611 #endif 2612 2613 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2614 _cpu_based_2nd_exec_control &= ~( 2615 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2616 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2617 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2618 2619 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2620 &vmx_cap->ept, &vmx_cap->vpid); 2621 2622 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2623 vmx_cap->ept) { 2624 pr_warn_once("EPT CAP should not exist if not support " 2625 "1-setting enable EPT VM-execution control\n"); 2626 2627 if (error_on_inconsistent_vmcs_config) 2628 return -EIO; 2629 2630 vmx_cap->ept = 0; 2631 } 2632 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2633 vmx_cap->vpid) { 2634 pr_warn_once("VPID CAP should not exist if not support " 2635 "1-setting enable VPID VM-execution control\n"); 2636 2637 if (error_on_inconsistent_vmcs_config) 2638 return -EIO; 2639 2640 vmx_cap->vpid = 0; 2641 } 2642 2643 if (!cpu_has_sgx()) 2644 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2645 2646 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2647 _cpu_based_3rd_exec_control = 2648 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2649 MSR_IA32_VMX_PROCBASED_CTLS3); 2650 2651 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2652 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2653 MSR_IA32_VMX_EXIT_CTLS, 2654 &_vmexit_control)) 2655 return -EIO; 2656 2657 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2658 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2659 MSR_IA32_VMX_PINBASED_CTLS, 2660 &_pin_based_exec_control)) 2661 return -EIO; 2662 2663 if (cpu_has_broken_vmx_preemption_timer()) 2664 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2665 if (!(_cpu_based_2nd_exec_control & 2666 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2667 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2668 2669 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2670 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2671 MSR_IA32_VMX_ENTRY_CTLS, 2672 &_vmentry_control)) 2673 return -EIO; 2674 2675 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { 2676 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; 2677 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; 2678 2679 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) 2680 continue; 2681 2682 
pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", 2683 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); 2684 2685 if (error_on_inconsistent_vmcs_config) 2686 return -EIO; 2687 2688 _vmentry_control &= ~n_ctrl; 2689 _vmexit_control &= ~x_ctrl; 2690 } 2691 2692 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2693 2694 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2695 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2696 return -EIO; 2697 2698 #ifdef CONFIG_X86_64 2699 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2700 if (vmx_msr_high & (1u<<16)) 2701 return -EIO; 2702 #endif 2703 2704 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2705 if (((vmx_msr_high >> 18) & 15) != 6) 2706 return -EIO; 2707 2708 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2709 2710 vmcs_conf->size = vmx_msr_high & 0x1fff; 2711 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2712 2713 vmcs_conf->revision_id = vmx_msr_low; 2714 2715 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2716 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2717 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2718 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2719 vmcs_conf->vmexit_ctrl = _vmexit_control; 2720 vmcs_conf->vmentry_ctrl = _vmentry_control; 2721 vmcs_conf->misc = misc_msr; 2722 2723 #if IS_ENABLED(CONFIG_HYPERV) 2724 if (enlightened_vmcs) 2725 evmcs_sanitize_exec_ctrls(vmcs_conf); 2726 #endif 2727 2728 return 0; 2729 } 2730 2731 static bool __kvm_is_vmx_supported(void) 2732 { 2733 int cpu = smp_processor_id(); 2734 2735 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2736 pr_err("VMX not supported by CPU %d\n", cpu); 2737 return false; 2738 } 2739 2740 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2741 !this_cpu_has(X86_FEATURE_VMX)) { 2742 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2743 return false; 2744 } 2745 2746 return true; 2747 } 2748 2749 static bool kvm_is_vmx_supported(void) 2750 { 2751 bool supported; 2752 2753 migrate_disable(); 2754 supported = __kvm_is_vmx_supported(); 2755 migrate_enable(); 2756 2757 return supported; 2758 } 2759 2760 static int vmx_check_processor_compat(void) 2761 { 2762 int cpu = raw_smp_processor_id(); 2763 struct vmcs_config vmcs_conf; 2764 struct vmx_capability vmx_cap; 2765 2766 if (!__kvm_is_vmx_supported()) 2767 return -EIO; 2768 2769 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2770 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2771 return -EIO; 2772 } 2773 if (nested) 2774 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2775 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2776 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2777 return -EIO; 2778 } 2779 return 0; 2780 } 2781 2782 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2783 { 2784 u64 msr; 2785 2786 cr4_set_bits(X86_CR4_VMXE); 2787 2788 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2789 _ASM_EXTABLE(1b, %l[fault]) 2790 : : [vmxon_pointer] "m"(vmxon_pointer) 2791 : : fault); 2792 return 0; 2793 2794 fault: 2795 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2796 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); 2797 cr4_clear_bits(X86_CR4_VMXE); 2798 2799 return -EFAULT; 2800 } 2801 2802 static int vmx_hardware_enable(void) 2803 { 2804 int cpu = raw_smp_processor_id(); 2805 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2806 int r; 2807 2808 if (cr4_read_shadow() & X86_CR4_VMXE) 2809 return -EBUSY; 2810 2811 /* 2812 * This can happen if we hot-added a CPU but failed to allocate 2813 * VP assist page for it. 2814 */ 2815 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2816 return -EFAULT; 2817 2818 intel_pt_handle_vmx(1); 2819 2820 r = kvm_cpu_vmxon(phys_addr); 2821 if (r) { 2822 intel_pt_handle_vmx(0); 2823 return r; 2824 } 2825 2826 if (enable_ept) 2827 ept_sync_global(); 2828 2829 return 0; 2830 } 2831 2832 static void vmclear_local_loaded_vmcss(void) 2833 { 2834 int cpu = raw_smp_processor_id(); 2835 struct loaded_vmcs *v, *n; 2836 2837 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2838 loaded_vmcss_on_cpu_link) 2839 __loaded_vmcs_clear(v); 2840 } 2841 2842 static void vmx_hardware_disable(void) 2843 { 2844 vmclear_local_loaded_vmcss(); 2845 2846 if (kvm_cpu_vmxoff()) 2847 kvm_spurious_fault(); 2848 2849 hv_reset_evmcs(); 2850 2851 intel_pt_handle_vmx(0); 2852 } 2853 2854 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2855 { 2856 int node = cpu_to_node(cpu); 2857 struct page *pages; 2858 struct vmcs *vmcs; 2859 2860 pages = __alloc_pages_node(node, flags, 0); 2861 if (!pages) 2862 return NULL; 2863 vmcs = page_address(pages); 2864 memset(vmcs, 0, vmcs_config.size); 2865 2866 /* KVM supports Enlightened VMCS v1 only */ 2867 if (kvm_is_using_evmcs()) 2868 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2869 else 2870 vmcs->hdr.revision_id = vmcs_config.revision_id; 2871 2872 if (shadow) 2873 vmcs->hdr.shadow_vmcs = 1; 2874 return vmcs; 2875 } 2876 2877 void free_vmcs(struct vmcs *vmcs) 2878 { 2879 free_page((unsigned long)vmcs); 2880 } 2881 2882 /* 2883 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2884 */ 2885 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2886 { 2887 if (!loaded_vmcs->vmcs) 2888 return; 2889 loaded_vmcs_clear(loaded_vmcs); 2890 free_vmcs(loaded_vmcs->vmcs); 2891 loaded_vmcs->vmcs = NULL; 2892 if (loaded_vmcs->msr_bitmap) 2893 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2894 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2895 } 2896 2897 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2898 { 2899 loaded_vmcs->vmcs = alloc_vmcs(false); 2900 if (!loaded_vmcs->vmcs) 2901 return -ENOMEM; 2902 2903 vmcs_clear(loaded_vmcs->vmcs); 2904 2905 loaded_vmcs->shadow_vmcs = NULL; 2906 loaded_vmcs->hv_timer_soft_disabled = false; 2907 loaded_vmcs->cpu = -1; 2908 loaded_vmcs->launched = 0; 2909 2910 if (cpu_has_vmx_msr_bitmap()) { 2911 loaded_vmcs->msr_bitmap = (unsigned long *) 2912 __get_free_page(GFP_KERNEL_ACCOUNT); 2913 if (!loaded_vmcs->msr_bitmap) 2914 goto out_vmcs; 2915 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2916 } 2917 2918 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2919 memset(&loaded_vmcs->controls_shadow, 0, 2920 sizeof(struct vmcs_controls_shadow)); 2921 2922 return 0; 2923 2924 out_vmcs: 2925 free_loaded_vmcs(loaded_vmcs); 2926 return -ENOMEM; 2927 } 2928 2929 static void free_kvm_area(void) 2930 { 2931 int cpu; 2932 2933 for_each_possible_cpu(cpu) { 2934 free_vmcs(per_cpu(vmxarea, cpu)); 2935 per_cpu(vmxarea, cpu) = NULL; 2936 } 2937 } 2938 2939 static __init int alloc_kvm_area(void) 2940 { 2941 int cpu; 2942 2943 
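	/*
	 * Allocate one VMXON region per possible CPU; vmx_hardware_enable()
	 * hands the physical address of this region to VMXON when bringing
	 * the CPU into VMX operation.
	 */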
for_each_possible_cpu(cpu) { 2944 struct vmcs *vmcs; 2945 2946 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2947 if (!vmcs) { 2948 free_kvm_area(); 2949 return -ENOMEM; 2950 } 2951 2952 /* 2953 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2954 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2955 * revision_id reported by MSR_IA32_VMX_BASIC. 2956 * 2957 * However, even though not explicitly documented by 2958 * TLFS, VMXArea passed as VMXON argument should 2959 * still be marked with revision_id reported by 2960 * physical CPU. 2961 */ 2962 if (kvm_is_using_evmcs()) 2963 vmcs->hdr.revision_id = vmcs_config.revision_id; 2964 2965 per_cpu(vmxarea, cpu) = vmcs; 2966 } 2967 return 0; 2968 } 2969 2970 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 2971 struct kvm_segment *save) 2972 { 2973 if (!emulate_invalid_guest_state) { 2974 /* 2975 * CS and SS RPL should be equal during guest entry according 2976 * to VMX spec, but in reality it is not always so. Since vcpu 2977 * is in the middle of the transition from real mode to 2978 * protected mode it is safe to assume that RPL 0 is a good 2979 * default value. 2980 */ 2981 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 2982 save->selector &= ~SEGMENT_RPL_MASK; 2983 save->dpl = save->selector & SEGMENT_RPL_MASK; 2984 save->s = 1; 2985 } 2986 __vmx_set_segment(vcpu, save, seg); 2987 } 2988 2989 static void enter_pmode(struct kvm_vcpu *vcpu) 2990 { 2991 unsigned long flags; 2992 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 2994 /* 2995 * Update real mode segment cache. It may be not up-to-date if segment 2996 * register was written while vcpu was in a guest mode. 2997 */ 2998 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2999 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3000 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3001 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3002 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3003 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3004 3005 vmx->rmode.vm86_active = 0; 3006 3007 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3008 3009 flags = vmcs_readl(GUEST_RFLAGS); 3010 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3011 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3012 vmcs_writel(GUEST_RFLAGS, flags); 3013 3014 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3015 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3016 3017 vmx_update_exception_bitmap(vcpu); 3018 3019 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3020 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3021 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3022 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3023 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3024 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3025 } 3026 3027 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3028 { 3029 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3030 struct kvm_segment var = *save; 3031 3032 var.dpl = 0x3; 3033 if (seg == VCPU_SREG_CS) 3034 var.type = 0x3; 3035 3036 if (!emulate_invalid_guest_state) { 3037 var.selector = var.base >> 4; 3038 var.base = var.base & 0xffff0; 3039 var.limit = 0xffff; 3040 var.g = 0; 3041 var.db = 0; 3042 var.present = 1; 3043 var.s = 1; 3044 var.l = 0; 3045 var.unusable = 0; 3046 
var.type = 0x3; 3047 var.avl = 0; 3048 if (save->base & 0xf) 3049 pr_warn_once("segment base is not paragraph aligned " 3050 "when entering protected mode (seg=%d)", seg); 3051 } 3052 3053 vmcs_write16(sf->selector, var.selector); 3054 vmcs_writel(sf->base, var.base); 3055 vmcs_write32(sf->limit, var.limit); 3056 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3057 } 3058 3059 static void enter_rmode(struct kvm_vcpu *vcpu) 3060 { 3061 unsigned long flags; 3062 struct vcpu_vmx *vmx = to_vmx(vcpu); 3063 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3064 3065 /* 3066 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3067 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3068 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3069 * should VM-Fail and KVM should reject userspace attempts to stuff 3070 * CR0.PG=0 when L2 is active. 3071 */ 3072 WARN_ON_ONCE(is_guest_mode(vcpu)); 3073 3074 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3075 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3076 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3077 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3078 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3079 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3080 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3081 3082 vmx->rmode.vm86_active = 1; 3083 3084 vmx_segment_cache_clear(vmx); 3085 3086 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3087 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3088 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3089 3090 flags = vmcs_readl(GUEST_RFLAGS); 3091 vmx->rmode.save_rflags = flags; 3092 3093 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3094 3095 vmcs_writel(GUEST_RFLAGS, flags); 3096 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3097 vmx_update_exception_bitmap(vcpu); 3098 3099 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3100 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3101 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3102 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3103 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3104 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3105 } 3106 3107 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3108 { 3109 struct vcpu_vmx *vmx = to_vmx(vcpu); 3110 3111 /* Nothing to do if hardware doesn't support EFER. */ 3112 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3113 return 0; 3114 3115 vcpu->arch.efer = efer; 3116 #ifdef CONFIG_X86_64 3117 if (efer & EFER_LMA) 3118 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3119 else 3120 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3121 #else 3122 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3123 return 1; 3124 #endif 3125 3126 vmx_setup_uret_msrs(vmx); 3127 return 0; 3128 } 3129 3130 #ifdef CONFIG_X86_64 3131 3132 static void enter_lmode(struct kvm_vcpu *vcpu) 3133 { 3134 u32 guest_tr_ar; 3135 3136 vmx_segment_cache_clear(to_vmx(vcpu)); 3137 3138 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3139 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3140 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 3141 __func__); 3142 vmcs_write32(GUEST_TR_AR_BYTES, 3143 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3144 | VMX_AR_TYPE_BUSY_64_TSS); 3145 } 3146 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3147 } 3148 3149 static void exit_lmode(struct kvm_vcpu *vcpu) 3150 { 3151 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3152 } 3153 3154 #endif 3155 3156 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3157 { 3158 struct vcpu_vmx *vmx = to_vmx(vcpu); 3159 3160 /* 3161 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3162 * the CPU is not required to invalidate guest-physical mappings on 3163 * VM-Entry, even if VPID is disabled. Guest-physical mappings are 3164 * associated with the root EPT structure and not any particular VPID 3165 * (INVVPID also isn't required to invalidate guest-physical mappings). 3166 */ 3167 if (enable_ept) { 3168 ept_sync_global(); 3169 } else if (enable_vpid) { 3170 if (cpu_has_vmx_invvpid_global()) { 3171 vpid_sync_vcpu_global(); 3172 } else { 3173 vpid_sync_vcpu_single(vmx->vpid); 3174 vpid_sync_vcpu_single(vmx->nested.vpid02); 3175 } 3176 } 3177 } 3178 3179 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3180 { 3181 if (is_guest_mode(vcpu)) 3182 return nested_get_vpid02(vcpu); 3183 return to_vmx(vcpu)->vpid; 3184 } 3185 3186 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3187 { 3188 struct kvm_mmu *mmu = vcpu->arch.mmu; 3189 u64 root_hpa = mmu->root.hpa; 3190 3191 /* No flush required if the current context is invalid. */ 3192 if (!VALID_PAGE(root_hpa)) 3193 return; 3194 3195 if (enable_ept) 3196 ept_sync_context(construct_eptp(vcpu, root_hpa, 3197 mmu->root_role.level)); 3198 else 3199 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3200 } 3201 3202 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3203 { 3204 /* 3205 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3206 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3207 */ 3208 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3209 } 3210 3211 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3212 { 3213 /* 3214 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3215 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3216 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3217 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3218 * i.e. no explicit INVVPID is necessary. 
3219 */ 3220 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3221 } 3222 3223 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3224 { 3225 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3226 3227 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3228 return; 3229 3230 if (is_pae_paging(vcpu)) { 3231 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3232 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3233 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3234 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3235 } 3236 } 3237 3238 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3239 { 3240 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3241 3242 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3243 return; 3244 3245 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3246 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3247 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3248 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3249 3250 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3251 } 3252 3253 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3254 CPU_BASED_CR3_STORE_EXITING) 3255 3256 static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3257 { 3258 if (is_guest_mode(vcpu)) 3259 return nested_guest_cr0_valid(vcpu, cr0); 3260 3261 if (to_vmx(vcpu)->nested.vmxon) 3262 return nested_host_cr0_valid(vcpu, cr0); 3263 3264 return true; 3265 } 3266 3267 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3268 { 3269 struct vcpu_vmx *vmx = to_vmx(vcpu); 3270 unsigned long hw_cr0, old_cr0_pg; 3271 u32 tmp; 3272 3273 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3274 3275 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3276 if (enable_unrestricted_guest) 3277 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3278 else { 3279 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3280 if (!enable_ept) 3281 hw_cr0 |= X86_CR0_WP; 3282 3283 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3284 enter_pmode(vcpu); 3285 3286 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3287 enter_rmode(vcpu); 3288 } 3289 3290 vmcs_writel(CR0_READ_SHADOW, cr0); 3291 vmcs_writel(GUEST_CR0, hw_cr0); 3292 vcpu->arch.cr0 = cr0; 3293 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3294 3295 #ifdef CONFIG_X86_64 3296 if (vcpu->arch.efer & EFER_LME) { 3297 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3298 enter_lmode(vcpu); 3299 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3300 exit_lmode(vcpu); 3301 } 3302 #endif 3303 3304 if (enable_ept && !enable_unrestricted_guest) { 3305 /* 3306 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3307 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3308 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3309 * KVM's CR3 is installed. 3310 */ 3311 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3312 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3313 3314 /* 3315 * When running with EPT but not unrestricted guest, KVM must 3316 * intercept CR3 accesses when paging is _disabled_. This is 3317 * necessary because restricted guests can't actually run with 3318 * paging disabled, and so KVM stuffs its own CR3 in order to 3319 * run the guest when identity mapped page tables. 3320 * 3321 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3322 * update, it may be stale with respect to CR3 interception, 3323 * e.g. after nested VM-Enter. 3324 * 3325 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3326 * stores to forward them to L1, even if KVM does not need to 3327 * intercept them to preserve its identity mapped page tables. 
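 * In short: paging disabled => set both CR3-exiting bits; paging enabled
 * outside guest mode => clear both; paging enabled with L2 active => mirror
 * whatever vmcs12 requested.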
3328 */ 3329 if (!(cr0 & X86_CR0_PG)) { 3330 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3331 } else if (!is_guest_mode(vcpu)) { 3332 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3333 } else { 3334 tmp = exec_controls_get(vmx); 3335 tmp &= ~CR3_EXITING_BITS; 3336 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3337 exec_controls_set(vmx, tmp); 3338 } 3339 3340 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ 3341 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3342 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3343 3344 /* 3345 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3346 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3347 */ 3348 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3349 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3350 } 3351 3352 /* depends on vcpu->arch.cr0 to be set to a new value */ 3353 vmx->emulation_required = vmx_emulation_required(vcpu); 3354 } 3355 3356 static int vmx_get_max_ept_level(void) 3357 { 3358 if (cpu_has_vmx_ept_5levels()) 3359 return 5; 3360 return 4; 3361 } 3362 3363 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3364 { 3365 u64 eptp = VMX_EPTP_MT_WB; 3366 3367 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3368 3369 if (enable_ept_ad_bits && 3370 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3371 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3372 eptp |= root_hpa; 3373 3374 return eptp; 3375 } 3376 3377 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 3378 int root_level) 3379 { 3380 struct kvm *kvm = vcpu->kvm; 3381 bool update_guest_cr3 = true; 3382 unsigned long guest_cr3; 3383 u64 eptp; 3384 3385 if (enable_ept) { 3386 eptp = construct_eptp(vcpu, root_hpa, root_level); 3387 vmcs_write64(EPT_POINTER, eptp); 3388 3389 hv_track_root_tdp(vcpu, root_hpa); 3390 3391 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3392 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3393 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3394 guest_cr3 = vcpu->arch.cr3; 3395 else /* vmcs.GUEST_CR3 is already up-to-date. */ 3396 update_guest_cr3 = false; 3397 vmx_ept_load_pdptrs(vcpu); 3398 } else { 3399 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3400 kvm_get_active_cr3_lam_bits(vcpu); 3401 } 3402 3403 if (update_guest_cr3) 3404 vmcs_writel(GUEST_CR3, guest_cr3); 3405 } 3406 3407 3408 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3409 { 3410 /* 3411 * We operate under the default treatment of SMM, so VMX cannot be 3412 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3413 * i.e. is a reserved bit, is handled by common x86 code. 3414 */ 3415 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3416 return false; 3417 3418 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3419 return false; 3420 3421 return true; 3422 } 3423 3424 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3425 { 3426 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3427 struct vcpu_vmx *vmx = to_vmx(vcpu); 3428 unsigned long hw_cr4; 3429 3430 /* 3431 * Pass through host's Machine Check Enable value to hw_cr4, which 3432 * is in force while we are in guest mode. Do not let guests control 3433 * this bit, even if host CR4.MCE == 0. 
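 * (A machine check that arrives while CR4.MCE=0 can shut the processor down,
 * so this bit has to follow host policy rather than whatever the guest
 * happens to write.)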
3434 */ 3435 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3436 if (enable_unrestricted_guest) 3437 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3438 else if (vmx->rmode.vm86_active) 3439 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3440 else 3441 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3442 3443 if (vmx_umip_emulated()) { 3444 if (cr4 & X86_CR4_UMIP) { 3445 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3446 hw_cr4 &= ~X86_CR4_UMIP; 3447 } else if (!is_guest_mode(vcpu) || 3448 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3449 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3450 } 3451 } 3452 3453 vcpu->arch.cr4 = cr4; 3454 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3455 3456 if (!enable_unrestricted_guest) { 3457 if (enable_ept) { 3458 if (!is_paging(vcpu)) { 3459 hw_cr4 &= ~X86_CR4_PAE; 3460 hw_cr4 |= X86_CR4_PSE; 3461 } else if (!(cr4 & X86_CR4_PAE)) { 3462 hw_cr4 &= ~X86_CR4_PAE; 3463 } 3464 } 3465 3466 /* 3467 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3468 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3469 * to be manually disabled when guest switches to non-paging 3470 * mode. 3471 * 3472 * If !enable_unrestricted_guest, the CPU is always running 3473 * with CR0.PG=1 and CR4 needs to be modified. 3474 * If enable_unrestricted_guest, the CPU automatically 3475 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 3476 */ 3477 if (!is_paging(vcpu)) 3478 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3479 } 3480 3481 vmcs_writel(CR4_READ_SHADOW, cr4); 3482 vmcs_writel(GUEST_CR4, hw_cr4); 3483 3484 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3485 kvm_update_cpuid_runtime(vcpu); 3486 } 3487 3488 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3489 { 3490 struct vcpu_vmx *vmx = to_vmx(vcpu); 3491 u32 ar; 3492 3493 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3494 *var = vmx->rmode.segs[seg]; 3495 if (seg == VCPU_SREG_TR 3496 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3497 return; 3498 var->base = vmx_read_guest_seg_base(vmx, seg); 3499 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3500 return; 3501 } 3502 var->base = vmx_read_guest_seg_base(vmx, seg); 3503 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3504 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3505 ar = vmx_read_guest_seg_ar(vmx, seg); 3506 var->unusable = (ar >> 16) & 1; 3507 var->type = ar & 15; 3508 var->s = (ar >> 4) & 1; 3509 var->dpl = (ar >> 5) & 3; 3510 /* 3511 * Some userspaces do not preserve unusable property. Since usable 3512 * segment has to be present according to VMX spec we can use present 3513 * property to amend userspace bug by making unusable segment always 3514 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3515 * segment as unusable. 
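 * Hence 'present' is derived from 'unusable' below instead of being read
 * back from bit 7 of the access-rights field.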
3516 */ 3517 var->present = !var->unusable; 3518 var->avl = (ar >> 12) & 1; 3519 var->l = (ar >> 13) & 1; 3520 var->db = (ar >> 14) & 1; 3521 var->g = (ar >> 15) & 1; 3522 } 3523 3524 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3525 { 3526 struct kvm_segment s; 3527 3528 if (to_vmx(vcpu)->rmode.vm86_active) { 3529 vmx_get_segment(vcpu, &s, seg); 3530 return s.base; 3531 } 3532 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3533 } 3534 3535 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3536 { 3537 struct vcpu_vmx *vmx = to_vmx(vcpu); 3538 3539 if (unlikely(vmx->rmode.vm86_active)) 3540 return 0; 3541 else { 3542 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3543 return VMX_AR_DPL(ar); 3544 } 3545 } 3546 3547 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3548 { 3549 u32 ar; 3550 3551 ar = var->type & 15; 3552 ar |= (var->s & 1) << 4; 3553 ar |= (var->dpl & 3) << 5; 3554 ar |= (var->present & 1) << 7; 3555 ar |= (var->avl & 1) << 12; 3556 ar |= (var->l & 1) << 13; 3557 ar |= (var->db & 1) << 14; 3558 ar |= (var->g & 1) << 15; 3559 ar |= (var->unusable || !var->present) << 16; 3560 3561 return ar; 3562 } 3563 3564 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3565 { 3566 struct vcpu_vmx *vmx = to_vmx(vcpu); 3567 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3568 3569 vmx_segment_cache_clear(vmx); 3570 3571 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3572 vmx->rmode.segs[seg] = *var; 3573 if (seg == VCPU_SREG_TR) 3574 vmcs_write16(sf->selector, var->selector); 3575 else if (var->s) 3576 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3577 return; 3578 } 3579 3580 vmcs_writel(sf->base, var->base); 3581 vmcs_write32(sf->limit, var->limit); 3582 vmcs_write16(sf->selector, var->selector); 3583 3584 /* 3585 * Fix the "Accessed" bit in AR field of segment registers for older 3586 * qemu binaries. 3587 * IA32 arch specifies that at the time of processor reset the 3588 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3589 * is setting it to 0 in the userland code. This causes invalid guest 3590 * state vmexit when "unrestricted guest" mode is turned on. 3591 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3592 * tree. Newer qemu binaries with that qemu fix would not need this 3593 * kvm hack. 
3594 */ 3595 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3596 var->type |= 0x1; /* Accessed */ 3597 3598 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3599 } 3600 3601 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3602 { 3603 __vmx_set_segment(vcpu, var, seg); 3604 3605 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3606 } 3607 3608 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3609 { 3610 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3611 3612 *db = (ar >> 14) & 1; 3613 *l = (ar >> 13) & 1; 3614 } 3615 3616 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3617 { 3618 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3619 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3620 } 3621 3622 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3623 { 3624 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3625 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3626 } 3627 3628 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3629 { 3630 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3631 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3632 } 3633 3634 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3635 { 3636 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3637 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3638 } 3639 3640 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3641 { 3642 struct kvm_segment var; 3643 u32 ar; 3644 3645 vmx_get_segment(vcpu, &var, seg); 3646 var.dpl = 0x3; 3647 if (seg == VCPU_SREG_CS) 3648 var.type = 0x3; 3649 ar = vmx_segment_access_rights(&var); 3650 3651 if (var.base != (var.selector << 4)) 3652 return false; 3653 if (var.limit != 0xffff) 3654 return false; 3655 if (ar != 0xf3) 3656 return false; 3657 3658 return true; 3659 } 3660 3661 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3662 { 3663 struct kvm_segment cs; 3664 unsigned int cs_rpl; 3665 3666 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3667 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3668 3669 if (cs.unusable) 3670 return false; 3671 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3672 return false; 3673 if (!cs.s) 3674 return false; 3675 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3676 if (cs.dpl > cs_rpl) 3677 return false; 3678 } else { 3679 if (cs.dpl != cs_rpl) 3680 return false; 3681 } 3682 if (!cs.present) 3683 return false; 3684 3685 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3686 return true; 3687 } 3688 3689 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3690 { 3691 struct kvm_segment ss; 3692 unsigned int ss_rpl; 3693 3694 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3695 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3696 3697 if (ss.unusable) 3698 return true; 3699 if (ss.type != 3 && ss.type != 7) 3700 return false; 3701 if (!ss.s) 3702 return false; 3703 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3704 return false; 3705 if (!ss.present) 3706 return false; 3707 3708 return true; 3709 } 3710 3711 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3712 { 3713 struct kvm_segment var; 3714 unsigned int rpl; 3715 3716 vmx_get_segment(vcpu, &var, seg); 3717 rpl = var.selector & SEGMENT_RPL_MASK; 3718 3719 if (var.unusable) 3720 return true; 3721 if (!var.s) 3722 return false; 3723 if (!var.present) 3724 return false; 3725 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3726 if (var.dpl < rpl) /* DPL < RPL */ 3727 return false; 
3728 } 3729 3730 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3731 * rights flags 3732 */ 3733 return true; 3734 } 3735 3736 static bool tr_valid(struct kvm_vcpu *vcpu) 3737 { 3738 struct kvm_segment tr; 3739 3740 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3741 3742 if (tr.unusable) 3743 return false; 3744 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3745 return false; 3746 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3747 return false; 3748 if (!tr.present) 3749 return false; 3750 3751 return true; 3752 } 3753 3754 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3755 { 3756 struct kvm_segment ldtr; 3757 3758 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3759 3760 if (ldtr.unusable) 3761 return true; 3762 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3763 return false; 3764 if (ldtr.type != 2) 3765 return false; 3766 if (!ldtr.present) 3767 return false; 3768 3769 return true; 3770 } 3771 3772 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3773 { 3774 struct kvm_segment cs, ss; 3775 3776 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3777 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3778 3779 return ((cs.selector & SEGMENT_RPL_MASK) == 3780 (ss.selector & SEGMENT_RPL_MASK)); 3781 } 3782 3783 /* 3784 * Check if guest state is valid. Returns true if valid, false if 3785 * not. 3786 * We assume that registers are always usable 3787 */ 3788 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3789 { 3790 /* real mode guest state checks */ 3791 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3792 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3793 return false; 3794 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3795 return false; 3796 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3797 return false; 3798 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3799 return false; 3800 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3801 return false; 3802 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3803 return false; 3804 } else { 3805 /* protected mode guest state checks */ 3806 if (!cs_ss_rpl_check(vcpu)) 3807 return false; 3808 if (!code_segment_valid(vcpu)) 3809 return false; 3810 if (!stack_segment_valid(vcpu)) 3811 return false; 3812 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3813 return false; 3814 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3815 return false; 3816 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3817 return false; 3818 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3819 return false; 3820 if (!tr_valid(vcpu)) 3821 return false; 3822 if (!ldtr_valid(vcpu)) 3823 return false; 3824 } 3825 /* TODO: 3826 * - Add checks on RIP 3827 * - Add checks on RFLAGS 3828 */ 3829 3830 return true; 3831 } 3832 3833 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3834 { 3835 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3836 u16 data; 3837 int i; 3838 3839 for (i = 0; i < 3; i++) { 3840 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3841 return -EFAULT; 3842 } 3843 3844 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3845 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3846 return -EFAULT; 3847 3848 data = ~0; 3849 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3850 return -EFAULT; 3851 3852 return 0; 3853 } 3854 3855 static int init_rmode_identity_map(struct kvm *kvm) 3856 { 3857 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3858 int i, r = 0; 3859 void __user *uaddr; 3860 u32 tmp; 3861 3862 /* Protect 
kvm_vmx->ept_identity_pagetable_done. */ 3863 mutex_lock(&kvm->slots_lock); 3864 3865 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3866 goto out; 3867 3868 if (!kvm_vmx->ept_identity_map_addr) 3869 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3870 3871 uaddr = __x86_set_memory_region(kvm, 3872 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3873 kvm_vmx->ept_identity_map_addr, 3874 PAGE_SIZE); 3875 if (IS_ERR(uaddr)) { 3876 r = PTR_ERR(uaddr); 3877 goto out; 3878 } 3879 3880 /* Set up identity-mapping pagetable for EPT in real mode */ 3881 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3882 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3883 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3884 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3885 r = -EFAULT; 3886 goto out; 3887 } 3888 } 3889 kvm_vmx->ept_identity_pagetable_done = true; 3890 3891 out: 3892 mutex_unlock(&kvm->slots_lock); 3893 return r; 3894 } 3895 3896 static void seg_setup(int seg) 3897 { 3898 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3899 unsigned int ar; 3900 3901 vmcs_write16(sf->selector, 0); 3902 vmcs_writel(sf->base, 0); 3903 vmcs_write32(sf->limit, 0xffff); 3904 ar = 0x93; 3905 if (seg == VCPU_SREG_CS) 3906 ar |= 0x08; /* code segment */ 3907 3908 vmcs_write32(sf->ar_bytes, ar); 3909 } 3910 3911 int allocate_vpid(void) 3912 { 3913 int vpid; 3914 3915 if (!enable_vpid) 3916 return 0; 3917 spin_lock(&vmx_vpid_lock); 3918 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3919 if (vpid < VMX_NR_VPIDS) 3920 __set_bit(vpid, vmx_vpid_bitmap); 3921 else 3922 vpid = 0; 3923 spin_unlock(&vmx_vpid_lock); 3924 return vpid; 3925 } 3926 3927 void free_vpid(int vpid) 3928 { 3929 if (!enable_vpid || vpid == 0) 3930 return; 3931 spin_lock(&vmx_vpid_lock); 3932 __clear_bit(vpid, vmx_vpid_bitmap); 3933 spin_unlock(&vmx_vpid_lock); 3934 } 3935 3936 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 3937 { 3938 /* 3939 * When KVM is a nested hypervisor on top of Hyper-V and uses 3940 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 3941 * bitmap has changed. 3942 */ 3943 if (kvm_is_using_evmcs()) { 3944 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 3945 3946 if (evmcs->hv_enlightenments_control.msr_bitmap) 3947 evmcs->hv_clean_fields &= 3948 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 3949 } 3950 3951 vmx->nested.force_msr_bitmap_recalc = true; 3952 } 3953 3954 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 3955 { 3956 struct vcpu_vmx *vmx = to_vmx(vcpu); 3957 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3958 int idx; 3959 3960 if (!cpu_has_vmx_msr_bitmap()) 3961 return; 3962 3963 vmx_msr_bitmap_l01_changed(vmx); 3964 3965 /* 3966 * Mark the desired intercept state in shadow bitmap, this is needed 3967 * for resync when the MSR filters change. 
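* The shadow state is consumed by vmx_msr_filter_changed() to redo the intercepts when userspace changes its MSR filter.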
3968 */ 3969 idx = vmx_get_passthrough_msr_slot(msr); 3970 if (idx >= 0) { 3971 if (type & MSR_TYPE_R) 3972 clear_bit(idx, vmx->shadow_msr_intercept.read); 3973 if (type & MSR_TYPE_W) 3974 clear_bit(idx, vmx->shadow_msr_intercept.write); 3975 } 3976 3977 if ((type & MSR_TYPE_R) && 3978 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 3979 vmx_set_msr_bitmap_read(msr_bitmap, msr); 3980 type &= ~MSR_TYPE_R; 3981 } 3982 3983 if ((type & MSR_TYPE_W) && 3984 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 3985 vmx_set_msr_bitmap_write(msr_bitmap, msr); 3986 type &= ~MSR_TYPE_W; 3987 } 3988 3989 if (type & MSR_TYPE_R) 3990 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 3991 3992 if (type & MSR_TYPE_W) 3993 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 3994 } 3995 3996 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 3997 { 3998 struct vcpu_vmx *vmx = to_vmx(vcpu); 3999 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4000 int idx; 4001 4002 if (!cpu_has_vmx_msr_bitmap()) 4003 return; 4004 4005 vmx_msr_bitmap_l01_changed(vmx); 4006 4007 /* 4008 * Mark the desired intercept state in shadow bitmap, this is needed 4009 * for resync when the MSR filter changes. 4010 */ 4011 idx = vmx_get_passthrough_msr_slot(msr); 4012 if (idx >= 0) { 4013 if (type & MSR_TYPE_R) 4014 set_bit(idx, vmx->shadow_msr_intercept.read); 4015 if (type & MSR_TYPE_W) 4016 set_bit(idx, vmx->shadow_msr_intercept.write); 4017 } 4018 4019 if (type & MSR_TYPE_R) 4020 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4021 4022 if (type & MSR_TYPE_W) 4023 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4024 } 4025 4026 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4027 { 4028 /* 4029 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4030 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4031 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 4032 */ 4033 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4034 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4035 struct vcpu_vmx *vmx = to_vmx(vcpu); 4036 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4037 u8 mode; 4038 4039 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4040 return; 4041 4042 if (cpu_has_secondary_exec_ctrls() && 4043 (secondary_exec_controls_get(vmx) & 4044 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4045 mode = MSR_BITMAP_MODE_X2APIC; 4046 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4047 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4048 } else { 4049 mode = 0; 4050 } 4051 4052 if (mode == vmx->x2apic_msr_bitmap_mode) 4053 return; 4054 4055 vmx->x2apic_msr_bitmap_mode = mode; 4056 4057 /* 4058 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4059 * registers (0x840 and above) intercepted, KVM doesn't support them. 4060 * Intercept all writes by default and poke holes as needed. Pass 4061 * through reads for all valid registers by default in x2APIC+APICv 4062 * mode, only the current timer count needs on-demand emulation by KVM. 4063 */ 4064 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4065 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4066 else 4067 msr_bitmap[read_idx] = ~0ull; 4068 msr_bitmap[write_idx] = ~0ull; 4069 4070 /* 4071 * TPR reads and writes can be virtualized even if virtual interrupt 4072 * delivery is not in use. 
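* Hence TASKPRI is passed through below whenever the x2APIC bitmap mode is active, with or without APICv.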
4073 */ 4074 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4075 !(mode & MSR_BITMAP_MODE_X2APIC)); 4076 4077 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4078 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4079 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4080 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4081 if (enable_ipiv) 4082 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4083 } 4084 } 4085 4086 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4087 { 4088 struct vcpu_vmx *vmx = to_vmx(vcpu); 4089 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4090 u32 i; 4091 4092 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4093 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4094 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4095 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4096 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4097 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4098 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4099 } 4100 } 4101 4102 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 4103 { 4104 struct vcpu_vmx *vmx = to_vmx(vcpu); 4105 void *vapic_page; 4106 u32 vppr; 4107 int rvi; 4108 4109 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 4110 !nested_cpu_has_vid(get_vmcs12(vcpu)) || 4111 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) 4112 return false; 4113 4114 rvi = vmx_get_rvi(); 4115 4116 vapic_page = vmx->nested.virtual_apic_map.hva; 4117 vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 4118 4119 return ((rvi & 0xf0) > (vppr & 0xf0)); 4120 } 4121 4122 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4123 { 4124 struct vcpu_vmx *vmx = to_vmx(vcpu); 4125 u32 i; 4126 4127 if (!cpu_has_vmx_msr_bitmap()) 4128 return; 4129 4130 /* 4131 * Redo intercept permissions for MSRs that KVM is passing through to 4132 * the guest. Disabling interception will check the new MSR filter and 4133 * ensure that KVM enables interception if userspace wants to filter 4134 * the MSR. MSRs that KVM is already intercepting don't need to be 4135 * refreshed since KVM is going to intercept them regardless of what 4136 * userspace wants. 4137 */ 4138 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4139 u32 msr = vmx_possible_passthrough_msrs[i]; 4140 4141 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4142 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4143 4144 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4145 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4146 } 4147 4148 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4149 if (vmx_pt_mode_is_host_guest()) 4150 pt_update_intercept_for_msr(vcpu); 4151 } 4152 4153 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 4154 int pi_vec) 4155 { 4156 #ifdef CONFIG_SMP 4157 if (vcpu->mode == IN_GUEST_MODE) { 4158 /* 4159 * The vector of the virtual interrupt has already been set in the PIR. 4160 * Send a notification event to deliver the virtual interrupt 4161 * unless the vCPU is the currently running vCPU, i.e. the 4162 * event is being sent from a fastpath VM-Exit handler, in 4163 * which case the PIR will be synced to the vIRR before 4164 * re-entering the guest.
4165 * 4166 * When the target is not the running vCPU, the following 4167 * possibilities emerge: 4168 * 4169 * Case 1: vCPU stays in non-root mode. Sending a notification 4170 * event posts the interrupt to the vCPU. 4171 * 4172 * Case 2: vCPU exits to root mode and is still runnable. The 4173 * PIR will be synced to the vIRR before re-entering the guest. 4174 * Sending a notification event is ok as the host IRQ handler 4175 * will ignore the spurious event. 4176 * 4177 * Case 3: vCPU exits to root mode and is blocked. vcpu_block() 4178 * has already synced PIR to vIRR and never blocks the vCPU if 4179 * the vIRR is not empty. Therefore, a blocked vCPU here does 4180 * not wait for any requested interrupts in PIR, and sending a 4181 * notification event also results in a benign, spurious event. 4182 */ 4183 4184 if (vcpu != kvm_get_running_vcpu()) 4185 __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 4186 return; 4187 } 4188 #endif 4189 /* 4190 * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 4191 * otherwise do nothing as KVM will grab the highest priority pending 4192 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 4193 */ 4194 kvm_vcpu_wake_up(vcpu); 4195 } 4196 4197 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4198 int vector) 4199 { 4200 struct vcpu_vmx *vmx = to_vmx(vcpu); 4201 4202 if (is_guest_mode(vcpu) && 4203 vector == vmx->nested.posted_intr_nv) { 4204 /* 4205 * If the posted interrupt is not recognized by hardware, 4206 * it will be delivered on the next VM-Entry. 4207 */ 4208 vmx->nested.pi_pending = true; 4209 kvm_make_request(KVM_REQ_EVENT, vcpu); 4210 4211 /* 4212 * This pairs with the smp_mb_*() after setting vcpu->mode in 4213 * vcpu_enter_guest() to guarantee the vCPU sees the event 4214 * request if triggering a posted interrupt "fails" because 4215 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4216 * the smp_wmb() in kvm_make_request() only ensures everything 4217 * done before making the request is visible when the request 4218 * is visible; it doesn't ensure ordering between the store to 4219 * vcpu->requests and the load from vcpu->mode. 4220 */ 4221 smp_mb__after_atomic(); 4222 4223 /* the PIR and ON have been set by L1. */ 4224 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4225 return 0; 4226 } 4227 return -1; 4228 } 4229 /* 4230 * Send an interrupt to the vCPU via posted interrupts. 4231 * 1. If the target vCPU is running (non-root mode), send a posted interrupt 4232 * notification and hardware will sync the PIR to the vIRR atomically. 4233 * 2. If the target vCPU isn't running (root mode), kick it to pick up the 4234 * interrupt from the PIR on the next VM-Entry. 4235 */ 4236 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4237 { 4238 struct vcpu_vmx *vmx = to_vmx(vcpu); 4239 int r; 4240 4241 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4242 if (!r) 4243 return 0; 4244 4245 /* Note, this is called iff the local APIC is in-kernel. */ 4246 if (!vcpu->arch.apic->apicv_active) 4247 return -1; 4248 4249 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4250 return 0; 4251 4252 /* If a previous notification has sent the IPI, nothing to do.
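* The new vector will be picked up along with the rest of the PIR the next time the PIR is synced to the vIRR.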
*/ 4253 if (pi_test_and_set_on(&vmx->pi_desc)) 4254 return 0; 4255 4256 /* 4257 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() 4258 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is 4259 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4260 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 4261 */ 4262 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4263 return 0; 4264 } 4265 4266 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4267 int trig_mode, int vector) 4268 { 4269 struct kvm_vcpu *vcpu = apic->vcpu; 4270 4271 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4272 kvm_lapic_set_irr(vector, apic); 4273 kvm_make_request(KVM_REQ_EVENT, vcpu); 4274 kvm_vcpu_kick(vcpu); 4275 } else { 4276 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4277 trig_mode, vector); 4278 } 4279 } 4280 4281 /* 4282 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4283 * will not change in the lifetime of the guest. 4284 * Note that host-state that does change is set elsewhere. E.g., host-state 4285 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4286 */ 4287 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4288 { 4289 u32 low32, high32; 4290 unsigned long tmpl; 4291 unsigned long cr0, cr3, cr4; 4292 4293 cr0 = read_cr0(); 4294 WARN_ON(cr0 & X86_CR0_TS); 4295 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4296 4297 /* 4298 * Save the most likely value for this task's CR3 in the VMCS. 4299 * We can't use __get_current_cr3_fast() because we're not atomic. 4300 */ 4301 cr3 = __read_cr3(); 4302 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4303 vmx->loaded_vmcs->host_state.cr3 = cr3; 4304 4305 /* Save the most likely value for this task's CR4 in the VMCS. */ 4306 cr4 = cr4_read_shadow(); 4307 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4308 vmx->loaded_vmcs->host_state.cr4 = cr4; 4309 4310 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4311 #ifdef CONFIG_X86_64 4312 /* 4313 * Load null selectors, so we can avoid reloading them in 4314 * vmx_prepare_switch_to_host(), in case userspace uses 4315 * the null selectors too (the expected case). 4316 */ 4317 vmcs_write16(HOST_DS_SELECTOR, 0); 4318 vmcs_write16(HOST_ES_SELECTOR, 0); 4319 #else 4320 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4321 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4322 #endif 4323 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4324 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4325 4326 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4327 4328 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4329 4330 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4331 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4332 4333 /* 4334 * SYSENTER is used for 32-bit system calls on either 32-bit or 4335 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4336 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4337 * have already done so!). 
4338 */ 4339 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4340 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4341 4342 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4343 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4344 4345 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4346 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4347 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4348 } 4349 4350 if (cpu_has_load_ia32_efer()) 4351 vmcs_write64(HOST_IA32_EFER, host_efer); 4352 } 4353 4354 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4355 { 4356 struct kvm_vcpu *vcpu = &vmx->vcpu; 4357 4358 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4359 ~vcpu->arch.cr4_guest_rsvd_bits; 4360 if (!enable_ept) { 4361 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4362 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4363 } 4364 if (is_guest_mode(&vmx->vcpu)) 4365 vcpu->arch.cr4_guest_owned_bits &= 4366 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4367 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4368 } 4369 4370 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4371 { 4372 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4373 4374 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4375 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4376 4377 if (!enable_vnmi) 4378 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4379 4380 if (!enable_preemption_timer) 4381 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4382 4383 return pin_based_exec_ctrl; 4384 } 4385 4386 static u32 vmx_vmentry_ctrl(void) 4387 { 4388 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4389 4390 if (vmx_pt_mode_is_system()) 4391 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4392 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4393 /* 4394 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 4395 */ 4396 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4397 VM_ENTRY_LOAD_IA32_EFER | 4398 VM_ENTRY_IA32E_MODE); 4399 4400 if (cpu_has_perf_global_ctrl_bug()) 4401 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4402 4403 return vmentry_ctrl; 4404 } 4405 4406 static u32 vmx_vmexit_ctrl(void) 4407 { 4408 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4409 4410 /* 4411 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4412 * nested virtualization and thus allowed to be set in vmcs12. 
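* Strip them from KVM's own VM-Exit controls below.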
4413 */ 4414 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4415 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4416 4417 if (vmx_pt_mode_is_system()) 4418 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4419 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4420 4421 if (cpu_has_perf_global_ctrl_bug()) 4422 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4423 4424 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4425 return vmexit_ctrl & 4426 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4427 } 4428 4429 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4430 { 4431 struct vcpu_vmx *vmx = to_vmx(vcpu); 4432 4433 if (is_guest_mode(vcpu)) { 4434 vmx->nested.update_vmcs01_apicv_status = true; 4435 return; 4436 } 4437 4438 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4439 4440 if (kvm_vcpu_apicv_active(vcpu)) { 4441 secondary_exec_controls_setbit(vmx, 4442 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4443 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4444 if (enable_ipiv) 4445 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4446 } else { 4447 secondary_exec_controls_clearbit(vmx, 4448 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4449 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4450 if (enable_ipiv) 4451 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4452 } 4453 4454 vmx_update_msr_bitmap_x2apic(vcpu); 4455 } 4456 4457 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4458 { 4459 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4460 4461 /* 4462 * Not used by KVM, but fully supported for nesting, i.e. they are allowed in 4463 * vmcs12 and propagated to vmcs02 when set in vmcs12. 4464 */ 4465 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4466 CPU_BASED_USE_IO_BITMAPS | 4467 CPU_BASED_MONITOR_TRAP_FLAG | 4468 CPU_BASED_PAUSE_EXITING); 4469 4470 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4471 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4472 CPU_BASED_NMI_WINDOW_EXITING); 4473 4474 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4475 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4476 4477 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4478 exec_control &= ~CPU_BASED_TPR_SHADOW; 4479 4480 #ifdef CONFIG_X86_64 4481 if (exec_control & CPU_BASED_TPR_SHADOW) 4482 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4483 CPU_BASED_CR8_STORE_EXITING); 4484 else 4485 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4486 CPU_BASED_CR8_LOAD_EXITING; 4487 #endif 4488 /* No need to intercept CR3 access or INVLPG when using EPT. */ 4489 if (enable_ept) 4490 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4491 CPU_BASED_CR3_STORE_EXITING | 4492 CPU_BASED_INVLPG_EXITING); 4493 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4494 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4495 CPU_BASED_MONITOR_EXITING); 4496 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4497 exec_control &= ~CPU_BASED_HLT_EXITING; 4498 return exec_control; 4499 } 4500 4501 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4502 { 4503 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4504 4505 /* 4506 * IPI virtualization relies on APICv. Disable IPI virtualization if 4507 * APICv is inhibited. 4508 */ 4509 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4510 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4511 4512 return exec_control; 4513 } 4514 4515 /* 4516 * Adjust a single secondary execution control bit to intercept/allow an 4517 * instruction in the guest.
This is usually done based on whether or not a 4518 * feature has been exposed to the guest in order to correctly emulate faults. 4519 */ 4520 static inline void 4521 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4522 u32 control, bool enabled, bool exiting) 4523 { 4524 /* 4525 * If the control is for an opt-in feature, clear the control if the 4526 * feature is not exposed to the guest, i.e. not enabled. If the 4527 * control is opt-out, i.e. an exiting control, clear the control if 4528 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4529 * disabled for the associated instruction. Note, the caller is 4530 * responsible for presetting exec_control to set all supported bits. 4531 */ 4532 if (enabled == exiting) 4533 *exec_control &= ~control; 4534 4535 /* 4536 * Update the nested MSR settings so that a nested VMM can/can't set 4537 * controls for features that are/aren't exposed to the guest. 4538 */ 4539 if (nested) { 4540 /* 4541 * All features that can be added to or removed from the VMX MSRs must 4542 * be supported in the first place for nested virtualization. 4543 */ 4544 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4545 enabled = false; 4546 4547 if (enabled) 4548 vmx->nested.msrs.secondary_ctls_high |= control; 4549 else 4550 vmx->nested.msrs.secondary_ctls_high &= ~control; 4551 } 4552 } 4553 4554 /* 4555 * Wrapper macro for the common case of adjusting a secondary execution control 4556 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4557 * verifies that the control is actually supported by KVM and hardware. 4558 */ 4559 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4560 ({ \ 4561 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4562 bool __enabled; \ 4563 \ 4564 if (cpu_has_vmx_##name()) { \ 4565 if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ 4566 __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ 4567 else \ 4568 __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ 4569 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4570 __enabled, exiting); \ 4571 } \ 4572 }) 4573 4574 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls.
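* ENABLE_* controls stay set only when the feature is exposed to the guest, whereas *_EXITING controls are cleared in that case.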
*/ 4575 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4576 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4577 4578 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4579 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4580 4581 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4582 { 4583 struct kvm_vcpu *vcpu = &vmx->vcpu; 4584 4585 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4586 4587 if (vmx_pt_mode_is_system()) 4588 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4589 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4590 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4591 if (vmx->vpid == 0) 4592 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4593 if (!enable_ept) { 4594 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4595 enable_unrestricted_guest = 0; 4596 } 4597 if (!enable_unrestricted_guest) 4598 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4599 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4600 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4601 if (!kvm_vcpu_apicv_active(vcpu)) 4602 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4603 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4604 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4605 4606 /* 4607 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's 4608 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4609 */ 4610 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4611 4612 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4613 * in vmx_set_cr4. */ 4614 exec_control &= ~SECONDARY_EXEC_DESC; 4615 4616 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4617 (handle_vmptrld). 4618 We can NOT enable shadow_vmcs here because we don't yet have 4619 a current VMCS12. 4620 */ 4621 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4622 4623 /* 4624 * PML is enabled/disabled when dirty logging of memslots changes, but 4625 * it needs to be set here when dirty logging is already active, e.g. 4626 * if this vCPU was created after dirty logging was enabled. 4627 */ 4628 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4629 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4630 4631 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4632 4633 /* 4634 * RDPID is also gated by ENABLE_RDTSCP; turn on the control if either 4635 * feature is exposed to the guest. This creates a virtualization hole 4636 * if both are supported in hardware but only one is exposed to the 4637 * guest, but letting the guest execute RDTSCP or RDPID when either one 4638 * is advertised is preferable to emulating the advertised instruction 4639 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4640 */ 4641 if (cpu_has_vmx_rdtscp()) { 4642 bool rdpid_or_rdtscp_enabled = 4643 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || 4644 guest_cpuid_has(vcpu, X86_FEATURE_RDPID); 4645 4646 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4647 SECONDARY_EXEC_ENABLE_RDTSCP, 4648 rdpid_or_rdtscp_enabled, false); 4649 } 4650 4651 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4652 4653 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4654 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4655 4656 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4657 ENABLE_USR_WAIT_PAUSE, false); 4658 4659 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4660 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4661 4662 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4663 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4664 4665 return exec_control; 4666 } 4667 4668 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4669 { 4670 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4671 } 4672 4673 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4674 { 4675 struct page *pages; 4676 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4677 4678 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4679 return 0; 4680 4681 if (kvm_vmx->pid_table) 4682 return 0; 4683 4684 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4685 vmx_get_pid_table_order(kvm)); 4686 if (!pages) 4687 return -ENOMEM; 4688 4689 kvm_vmx->pid_table = (void *)page_address(pages); 4690 return 0; 4691 } 4692 4693 static int vmx_vcpu_precreate(struct kvm *kvm) 4694 { 4695 return vmx_alloc_ipiv_pid_table(kvm); 4696 } 4697 4698 #define VMX_XSS_EXIT_BITMAP 0 4699 4700 static void init_vmcs(struct vcpu_vmx *vmx) 4701 { 4702 struct kvm *kvm = vmx->vcpu.kvm; 4703 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4704 4705 if (nested) 4706 nested_vmx_set_vmcs_shadowing_bitmap(); 4707 4708 if (cpu_has_vmx_msr_bitmap()) 4709 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4710 4711 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4712 4713 /* Control */ 4714 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4715 4716 exec_controls_set(vmx, vmx_exec_control(vmx)); 4717 4718 if (cpu_has_secondary_exec_ctrls()) 4719 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4720 4721 if (cpu_has_tertiary_exec_ctrls()) 4722 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4723 4724 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4725 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4726 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4727 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4728 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4729 4730 vmcs_write16(GUEST_INTR_STATUS, 0); 4731 4732 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4733 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4734 } 4735 4736 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4737 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4738 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4739 } 4740 4741 if (!kvm_pause_in_guest(kvm)) { 4742 vmcs_write32(PLE_GAP, ple_gap); 4743 vmx->ple_window = ple_window; 4744 vmx->ple_window_dirty = true; 4745 } 4746 4747 if (kvm_notify_vmexit_enabled(kvm)) 4748 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4749 4750 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4751 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4752 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4753 4754 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4755 
vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4756 vmx_set_constant_host_state(vmx); 4757 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4758 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4759 4760 if (cpu_has_vmx_vmfunc()) 4761 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4762 4763 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4764 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4765 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4766 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4767 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4768 4769 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4770 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4771 4772 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4773 4774 /* 22.2.1, 20.8.1 */ 4775 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4776 4777 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4778 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4779 4780 set_cr4_guest_host_mask(vmx); 4781 4782 if (vmx->vpid != 0) 4783 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4784 4785 if (cpu_has_vmx_xsaves()) 4786 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4787 4788 if (enable_pml) { 4789 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4790 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 4791 } 4792 4793 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4794 4795 if (vmx_pt_mode_is_host_guest()) { 4796 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4797 /* Bit[6~0] are forced to 1, writes are ignored. */ 4798 vmx->pt_desc.guest.output_mask = 0x7F; 4799 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4800 } 4801 4802 vmcs_write32(GUEST_SYSENTER_CS, 0); 4803 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4804 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4805 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4806 4807 if (cpu_has_vmx_tpr_shadow()) { 4808 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4809 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4810 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4811 __pa(vmx->vcpu.arch.apic->regs)); 4812 vmcs_write32(TPR_THRESHOLD, 0); 4813 } 4814 4815 vmx_setup_uret_msrs(vmx); 4816 } 4817 4818 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4819 { 4820 struct vcpu_vmx *vmx = to_vmx(vcpu); 4821 4822 init_vmcs(vmx); 4823 4824 if (nested) 4825 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4826 4827 vcpu_setup_sgx_lepubkeyhash(vcpu); 4828 4829 vmx->nested.posted_intr_nv = -1; 4830 vmx->nested.vmxon_ptr = INVALID_GPA; 4831 vmx->nested.current_vmptr = INVALID_GPA; 4832 4833 #ifdef CONFIG_KVM_HYPERV 4834 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4835 #endif 4836 4837 vcpu->arch.microcode_version = 0x100000000ULL; 4838 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4839 4840 /* 4841 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4842 * or POSTED_INTR_WAKEUP_VECTOR. 
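* SN is also set here, i.e. posted-interrupt notifications start out suppressed.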
4843 */ 4844 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 4845 vmx->pi_desc.sn = 1; 4846 } 4847 4848 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4849 { 4850 struct vcpu_vmx *vmx = to_vmx(vcpu); 4851 4852 if (!init_event) 4853 __vmx_vcpu_reset(vcpu); 4854 4855 vmx->rmode.vm86_active = 0; 4856 vmx->spec_ctrl = 0; 4857 4858 vmx->msr_ia32_umwait_control = 0; 4859 4860 vmx->hv_deadline_tsc = -1; 4861 kvm_set_cr8(vcpu, 0); 4862 4863 vmx_segment_cache_clear(vmx); 4864 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4865 4866 seg_setup(VCPU_SREG_CS); 4867 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4868 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4869 4870 seg_setup(VCPU_SREG_DS); 4871 seg_setup(VCPU_SREG_ES); 4872 seg_setup(VCPU_SREG_FS); 4873 seg_setup(VCPU_SREG_GS); 4874 seg_setup(VCPU_SREG_SS); 4875 4876 vmcs_write16(GUEST_TR_SELECTOR, 0); 4877 vmcs_writel(GUEST_TR_BASE, 0); 4878 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4879 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4880 4881 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4882 vmcs_writel(GUEST_LDTR_BASE, 0); 4883 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4884 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4885 4886 vmcs_writel(GUEST_GDTR_BASE, 0); 4887 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4888 4889 vmcs_writel(GUEST_IDTR_BASE, 0); 4890 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4891 4892 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4893 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4894 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4895 if (kvm_mpx_supported()) 4896 vmcs_write64(GUEST_BNDCFGS, 0); 4897 4898 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4899 4900 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4901 4902 vpid_sync_context(vmx->vpid); 4903 4904 vmx_update_fb_clear_dis(vcpu, vmx); 4905 } 4906 4907 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4908 { 4909 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4910 } 4911 4912 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4913 { 4914 if (!enable_vnmi || 4915 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4916 vmx_enable_irq_window(vcpu); 4917 return; 4918 } 4919 4920 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4921 } 4922 4923 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4924 { 4925 struct vcpu_vmx *vmx = to_vmx(vcpu); 4926 uint32_t intr; 4927 int irq = vcpu->arch.interrupt.nr; 4928 4929 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4930 4931 ++vcpu->stat.irq_injections; 4932 if (vmx->rmode.vm86_active) { 4933 int inc_eip = 0; 4934 if (vcpu->arch.interrupt.soft) 4935 inc_eip = vcpu->arch.event_exit_inst_len; 4936 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4937 return; 4938 } 4939 intr = irq | INTR_INFO_VALID_MASK; 4940 if (vcpu->arch.interrupt.soft) { 4941 intr |= INTR_TYPE_SOFT_INTR; 4942 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4943 vmx->vcpu.arch.event_exit_inst_len); 4944 } else 4945 intr |= INTR_TYPE_EXT_INTR; 4946 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4947 4948 vmx_clear_hlt(vcpu); 4949 } 4950 4951 static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4952 { 4953 struct vcpu_vmx *vmx = to_vmx(vcpu); 4954 4955 if (!enable_vnmi) { 4956 /* 4957 * Tracking the NMI-blocked state in software is built upon 4958 * finding the next open IRQ window. This, in turn, depends on 4959 * well-behaving guests: They have to keep IRQs disabled at 4960 * least as long as the NMI handler runs. 
Otherwise we may 4961 * cause NMI nesting, maybe breaking the guest. But as this is 4962 * highly unlikely, we can live with the residual risk. 4963 */ 4964 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4965 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4966 } 4967 4968 ++vcpu->stat.nmi_injections; 4969 vmx->loaded_vmcs->nmi_known_unmasked = false; 4970 4971 if (vmx->rmode.vm86_active) { 4972 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4973 return; 4974 } 4975 4976 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4977 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4978 4979 vmx_clear_hlt(vcpu); 4980 } 4981 4982 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4983 { 4984 struct vcpu_vmx *vmx = to_vmx(vcpu); 4985 bool masked; 4986 4987 if (!enable_vnmi) 4988 return vmx->loaded_vmcs->soft_vnmi_blocked; 4989 if (vmx->loaded_vmcs->nmi_known_unmasked) 4990 return false; 4991 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4992 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4993 return masked; 4994 } 4995 4996 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4997 { 4998 struct vcpu_vmx *vmx = to_vmx(vcpu); 4999 5000 if (!enable_vnmi) { 5001 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 5002 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5003 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5004 } 5005 } else { 5006 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5007 if (masked) 5008 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5009 GUEST_INTR_STATE_NMI); 5010 else 5011 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5012 GUEST_INTR_STATE_NMI); 5013 } 5014 } 5015 5016 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5017 { 5018 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5019 return false; 5020 5021 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5022 return true; 5023 5024 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5025 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5026 GUEST_INTR_STATE_NMI)); 5027 } 5028 5029 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5030 { 5031 if (to_vmx(vcpu)->nested.nested_run_pending) 5032 return -EBUSY; 5033 5034 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5035 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5036 return -EBUSY; 5037 5038 return !vmx_nmi_blocked(vcpu); 5039 } 5040 5041 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5042 { 5043 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5044 return false; 5045 5046 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5047 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5048 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5049 } 5050 5051 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5052 { 5053 if (to_vmx(vcpu)->nested.nested_run_pending) 5054 return -EBUSY; 5055 5056 /* 5057 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5058 * e.g. if the IRQ arrived asynchronously after checking nested events. 
5059 */ 5060 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5061 return -EBUSY; 5062 5063 return !vmx_interrupt_blocked(vcpu); 5064 } 5065 5066 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5067 { 5068 void __user *ret; 5069 5070 if (enable_unrestricted_guest) 5071 return 0; 5072 5073 mutex_lock(&kvm->slots_lock); 5074 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5075 PAGE_SIZE * 3); 5076 mutex_unlock(&kvm->slots_lock); 5077 5078 if (IS_ERR(ret)) 5079 return PTR_ERR(ret); 5080 5081 to_kvm_vmx(kvm)->tss_addr = addr; 5082 5083 return init_rmode_tss(kvm, ret); 5084 } 5085 5086 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5087 { 5088 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5089 return 0; 5090 } 5091 5092 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5093 { 5094 switch (vec) { 5095 case BP_VECTOR: 5096 /* 5097 * Update instruction length as we may reinject the exception 5098 * from user space while in guest debugging mode. 5099 */ 5100 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5101 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5102 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5103 return false; 5104 fallthrough; 5105 case DB_VECTOR: 5106 return !(vcpu->guest_debug & 5107 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5108 case DE_VECTOR: 5109 case OF_VECTOR: 5110 case BR_VECTOR: 5111 case UD_VECTOR: 5112 case DF_VECTOR: 5113 case SS_VECTOR: 5114 case GP_VECTOR: 5115 case MF_VECTOR: 5116 return true; 5117 } 5118 return false; 5119 } 5120 5121 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5122 int vec, u32 err_code) 5123 { 5124 /* 5125 * Instruction with address size override prefix opcode 0x67 5126 * Cause the #SS fault with 0 error code in VM86 mode. 5127 */ 5128 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5129 if (kvm_emulate_instruction(vcpu, 0)) { 5130 if (vcpu->arch.halt_request) { 5131 vcpu->arch.halt_request = 0; 5132 return kvm_emulate_halt_noskip(vcpu); 5133 } 5134 return 1; 5135 } 5136 return 0; 5137 } 5138 5139 /* 5140 * Forward all other exceptions that are valid in real mode. 5141 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5142 * the required debugging infrastructure rework. 5143 */ 5144 kvm_queue_exception(vcpu, vec); 5145 return 1; 5146 } 5147 5148 static int handle_machine_check(struct kvm_vcpu *vcpu) 5149 { 5150 /* handled by vmx_vcpu_run() */ 5151 return 1; 5152 } 5153 5154 /* 5155 * If the host has split lock detection disabled, then #AC is 5156 * unconditionally injected into the guest, which is the pre split lock 5157 * detection behaviour. 
5158 * 5159 * If the host has split lock detection enabled then #AC is 5160 * only injected into the guest when: 5161 * - Guest CPL == 3 (user mode) 5162 * - Guest has #AC detection enabled in CR0 5163 * - Guest EFLAGS has AC bit set 5164 */ 5165 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5166 { 5167 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5168 return true; 5169 5170 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5171 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5172 } 5173 5174 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5175 { 5176 struct vcpu_vmx *vmx = to_vmx(vcpu); 5177 struct kvm_run *kvm_run = vcpu->run; 5178 u32 intr_info, ex_no, error_code; 5179 unsigned long cr2, dr6; 5180 u32 vect_info; 5181 5182 vect_info = vmx->idt_vectoring_info; 5183 intr_info = vmx_get_intr_info(vcpu); 5184 5185 /* 5186 * Machine checks are handled by handle_exception_irqoff(), or by 5187 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5188 * vmx_vcpu_enter_exit(). 5189 */ 5190 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5191 return 1; 5192 5193 /* 5194 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5195 * This ensures the nested_vmx check is not skipped so vmexit can 5196 * be reflected to L1 (when it intercepts #NM) before reaching this 5197 * point. 5198 */ 5199 if (is_nm_fault(intr_info)) { 5200 kvm_queue_exception(vcpu, NM_VECTOR); 5201 return 1; 5202 } 5203 5204 if (is_invalid_opcode(intr_info)) 5205 return handle_ud(vcpu); 5206 5207 error_code = 0; 5208 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5209 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5210 5211 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5212 WARN_ON_ONCE(!enable_vmware_backdoor); 5213 5214 /* 5215 * VMware backdoor emulation on #GP interception only handles 5216 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5217 * error code on #GP. 5218 */ 5219 if (error_code) { 5220 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5221 return 1; 5222 } 5223 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5224 } 5225 5226 /* 5227 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5228 * MMIO, it is better to report an internal error. 5229 * See the comments in vmx_handle_exit. 5230 */ 5231 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5232 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5233 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5234 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5235 vcpu->run->internal.ndata = 4; 5236 vcpu->run->internal.data[0] = vect_info; 5237 vcpu->run->internal.data[1] = intr_info; 5238 vcpu->run->internal.data[2] = error_code; 5239 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5240 return 0; 5241 } 5242 5243 if (is_page_fault(intr_info)) { 5244 cr2 = vmx_get_exit_qual(vcpu); 5245 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5246 /* 5247 * EPT will cause page fault only if we need to 5248 * detect illegal GPAs. 
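* i.e. only when allow_smaller_maxphyaddr is in effect, hence the WARN below.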
5249 */ 5250 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5251 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5252 return 1; 5253 } else 5254 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5255 } 5256 5257 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5258 5259 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5260 return handle_rmode_exception(vcpu, ex_no, error_code); 5261 5262 switch (ex_no) { 5263 case DB_VECTOR: 5264 dr6 = vmx_get_exit_qual(vcpu); 5265 if (!(vcpu->guest_debug & 5266 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5267 /* 5268 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5269 * instruction. ICEBP generates a trap-like #DB, but 5270 * despite its interception control being tied to #DB, 5271 * is an instruction intercept, i.e. the VM-Exit occurs 5272 * on the ICEBP itself. Use the inner "skip" helper to 5273 * avoid single-step #DB and MTF updates, as ICEBP is 5274 * higher priority. Note, skipping ICEBP still clears 5275 * STI and MOVSS blocking. 5276 * 5277 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5278 * if single-step is enabled in RFLAGS and STI or MOVSS 5279 * blocking is active, as the CPU doesn't set the bit 5280 * on VM-Exit due to #DB interception. VM-Entry has a 5281 * consistency check that a single-step #DB is pending 5282 * in this scenario as the previous instruction cannot 5283 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5284 * don't modify RFLAGS), therefore the one instruction 5285 * delay when activating single-step breakpoints must 5286 * have already expired. Note, the CPU sets/clears BS 5287 * as appropriate for all other VM-Exits types. 5288 */ 5289 if (is_icebp(intr_info)) 5290 WARN_ON(!skip_emulated_instruction(vcpu)); 5291 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5292 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5293 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5294 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5295 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5296 5297 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5298 return 1; 5299 } 5300 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5301 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5302 fallthrough; 5303 case BP_VECTOR: 5304 /* 5305 * Update instruction length as we may reinject #BP from 5306 * user space while in guest debugging mode. Reading it for 5307 * #DB as well causes no harm, it is not used in that case. 5308 */ 5309 vmx->vcpu.arch.event_exit_inst_len = 5310 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5311 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5312 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5313 kvm_run->debug.arch.exception = ex_no; 5314 break; 5315 case AC_VECTOR: 5316 if (vmx_guest_inject_ac(vcpu)) { 5317 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5318 return 1; 5319 } 5320 5321 /* 5322 * Handle split lock. Depending on detection mode this will 5323 * either warn and disable split lock detection for this 5324 * task or force SIGBUS on it. 
5325 */ 5326 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5327 return 1; 5328 fallthrough; 5329 default: 5330 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5331 kvm_run->ex.exception = ex_no; 5332 kvm_run->ex.error_code = error_code; 5333 break; 5334 } 5335 return 0; 5336 } 5337 5338 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5339 { 5340 ++vcpu->stat.irq_exits; 5341 return 1; 5342 } 5343 5344 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5345 { 5346 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5347 vcpu->mmio_needed = 0; 5348 return 0; 5349 } 5350 5351 static int handle_io(struct kvm_vcpu *vcpu) 5352 { 5353 unsigned long exit_qualification; 5354 int size, in, string; 5355 unsigned port; 5356 5357 exit_qualification = vmx_get_exit_qual(vcpu); 5358 string = (exit_qualification & 16) != 0; 5359 5360 ++vcpu->stat.io_exits; 5361 5362 if (string) 5363 return kvm_emulate_instruction(vcpu, 0); 5364 5365 port = exit_qualification >> 16; 5366 size = (exit_qualification & 7) + 1; 5367 in = (exit_qualification & 8) != 0; 5368 5369 return kvm_fast_pio(vcpu, size, port, in); 5370 } 5371 5372 static void 5373 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5374 { 5375 /* 5376 * Patch in the VMCALL instruction: 5377 */ 5378 hypercall[0] = 0x0f; 5379 hypercall[1] = 0x01; 5380 hypercall[2] = 0xc1; 5381 } 5382 5383 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5384 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5385 { 5386 if (is_guest_mode(vcpu)) { 5387 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5388 unsigned long orig_val = val; 5389 5390 /* 5391 * We get here when L2 changed cr0 in a way that did not change 5392 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5393 * but did change L0 shadowed bits. So we first calculate the 5394 * effective cr0 value that L1 would like to write into the 5395 * hardware. It consists of the L2-owned bits from the new 5396 * value combined with the L1-owned bits from L1's guest_cr0. 5397 */ 5398 val = (val & ~vmcs12->cr0_guest_host_mask) | 5399 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5400 5401 if (kvm_set_cr0(vcpu, val)) 5402 return 1; 5403 vmcs_writel(CR0_READ_SHADOW, orig_val); 5404 return 0; 5405 } else { 5406 return kvm_set_cr0(vcpu, val); 5407 } 5408 } 5409 5410 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5411 { 5412 if (is_guest_mode(vcpu)) { 5413 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5414 unsigned long orig_val = val; 5415 5416 /* analogously to handle_set_cr0 */ 5417 val = (val & ~vmcs12->cr4_guest_host_mask) | 5418 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5419 if (kvm_set_cr4(vcpu, val)) 5420 return 1; 5421 vmcs_writel(CR4_READ_SHADOW, orig_val); 5422 return 0; 5423 } else 5424 return kvm_set_cr4(vcpu, val); 5425 } 5426 5427 static int handle_desc(struct kvm_vcpu *vcpu) 5428 { 5429 /* 5430 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5431 * and other code needs to be updated if UMIP can be guest owned. 
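* Descriptor-table exiting (SECONDARY_EXEC_DESC) is toggled on CR4.UMIP writes in vmx_set_cr4().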
5432 */ 5433 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5434 5435 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5436 return kvm_emulate_instruction(vcpu, 0); 5437 } 5438 5439 static int handle_cr(struct kvm_vcpu *vcpu) 5440 { 5441 unsigned long exit_qualification, val; 5442 int cr; 5443 int reg; 5444 int err; 5445 int ret; 5446 5447 exit_qualification = vmx_get_exit_qual(vcpu); 5448 cr = exit_qualification & 15; 5449 reg = (exit_qualification >> 8) & 15; 5450 switch ((exit_qualification >> 4) & 3) { 5451 case 0: /* mov to cr */ 5452 val = kvm_register_read(vcpu, reg); 5453 trace_kvm_cr_write(cr, val); 5454 switch (cr) { 5455 case 0: 5456 err = handle_set_cr0(vcpu, val); 5457 return kvm_complete_insn_gp(vcpu, err); 5458 case 3: 5459 WARN_ON_ONCE(enable_unrestricted_guest); 5460 5461 err = kvm_set_cr3(vcpu, val); 5462 return kvm_complete_insn_gp(vcpu, err); 5463 case 4: 5464 err = handle_set_cr4(vcpu, val); 5465 return kvm_complete_insn_gp(vcpu, err); 5466 case 8: { 5467 u8 cr8_prev = kvm_get_cr8(vcpu); 5468 u8 cr8 = (u8)val; 5469 err = kvm_set_cr8(vcpu, cr8); 5470 ret = kvm_complete_insn_gp(vcpu, err); 5471 if (lapic_in_kernel(vcpu)) 5472 return ret; 5473 if (cr8_prev <= cr8) 5474 return ret; 5475 /* 5476 * TODO: we might be squashing a 5477 * KVM_GUESTDBG_SINGLESTEP-triggered 5478 * KVM_EXIT_DEBUG here. 5479 */ 5480 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5481 return 0; 5482 } 5483 } 5484 break; 5485 case 2: /* clts */ 5486 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5487 return -EIO; 5488 case 1: /*mov from cr*/ 5489 switch (cr) { 5490 case 3: 5491 WARN_ON_ONCE(enable_unrestricted_guest); 5492 5493 val = kvm_read_cr3(vcpu); 5494 kvm_register_write(vcpu, reg, val); 5495 trace_kvm_cr_read(cr, val); 5496 return kvm_skip_emulated_instruction(vcpu); 5497 case 8: 5498 val = kvm_get_cr8(vcpu); 5499 kvm_register_write(vcpu, reg, val); 5500 trace_kvm_cr_read(cr, val); 5501 return kvm_skip_emulated_instruction(vcpu); 5502 } 5503 break; 5504 case 3: /* lmsw */ 5505 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5506 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5507 kvm_lmsw(vcpu, val); 5508 5509 return kvm_skip_emulated_instruction(vcpu); 5510 default: 5511 break; 5512 } 5513 vcpu->run->exit_reason = 0; 5514 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5515 (int)(exit_qualification >> 4) & 3, cr); 5516 return 0; 5517 } 5518 5519 static int handle_dr(struct kvm_vcpu *vcpu) 5520 { 5521 unsigned long exit_qualification; 5522 int dr, dr7, reg; 5523 int err = 1; 5524 5525 exit_qualification = vmx_get_exit_qual(vcpu); 5526 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5527 5528 /* First, if DR does not exist, trigger UD */ 5529 if (!kvm_require_dr(vcpu, dr)) 5530 return 1; 5531 5532 if (vmx_get_cpl(vcpu) > 0) 5533 goto out; 5534 5535 dr7 = vmcs_readl(GUEST_DR7); 5536 if (dr7 & DR7_GD) { 5537 /* 5538 * As the vm-exit takes precedence over the debug trap, we 5539 * need to emulate the latter, either for the host or the 5540 * guest debugging itself. 
5541 */ 5542 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5543 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5544 vcpu->run->debug.arch.dr7 = dr7; 5545 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5546 vcpu->run->debug.arch.exception = DB_VECTOR; 5547 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5548 return 0; 5549 } else { 5550 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5551 return 1; 5552 } 5553 } 5554 5555 if (vcpu->guest_debug == 0) { 5556 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5557 5558 /* 5559 * No more DR vmexits; force a reload of the debug registers 5560 * and reenter on this instruction. The next vmexit will 5561 * retrieve the full state of the debug registers. 5562 */ 5563 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5564 return 1; 5565 } 5566 5567 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5568 if (exit_qualification & TYPE_MOV_FROM_DR) { 5569 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5570 err = 0; 5571 } else { 5572 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5573 } 5574 5575 out: 5576 return kvm_complete_insn_gp(vcpu, err); 5577 } 5578 5579 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5580 { 5581 get_debugreg(vcpu->arch.db[0], 0); 5582 get_debugreg(vcpu->arch.db[1], 1); 5583 get_debugreg(vcpu->arch.db[2], 2); 5584 get_debugreg(vcpu->arch.db[3], 3); 5585 get_debugreg(vcpu->arch.dr6, 6); 5586 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5587 5588 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5589 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5590 5591 /* 5592 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5593 * a stale dr6 from the guest. 5594 */ 5595 set_debugreg(DR6_RESERVED, 6); 5596 } 5597 5598 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5599 { 5600 vmcs_writel(GUEST_DR7, val); 5601 } 5602 5603 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5604 { 5605 kvm_apic_update_ppr(vcpu); 5606 return 1; 5607 } 5608 5609 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5610 { 5611 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5612 5613 kvm_make_request(KVM_REQ_EVENT, vcpu); 5614 5615 ++vcpu->stat.irq_window_exits; 5616 return 1; 5617 } 5618 5619 static int handle_invlpg(struct kvm_vcpu *vcpu) 5620 { 5621 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5622 5623 kvm_mmu_invlpg(vcpu, exit_qualification); 5624 return kvm_skip_emulated_instruction(vcpu); 5625 } 5626 5627 static int handle_apic_access(struct kvm_vcpu *vcpu) 5628 { 5629 if (likely(fasteoi)) { 5630 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5631 int access_type, offset; 5632 5633 access_type = exit_qualification & APIC_ACCESS_TYPE; 5634 offset = exit_qualification & APIC_ACCESS_OFFSET; 5635 /* 5636 * Sane guest uses MOV to write EOI, with written value 5637 * not cared. So make a short-circuit here by avoiding 5638 * heavy instruction emulation. 
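 *
 * Only a linear write to the EOI register takes this shortcut; any
 * other access type or offset falls through to full instruction
 * emulation via kvm_emulate_instruction() below.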
5639 */ 5640 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5641 (offset == APIC_EOI)) { 5642 kvm_lapic_set_eoi(vcpu); 5643 return kvm_skip_emulated_instruction(vcpu); 5644 } 5645 } 5646 return kvm_emulate_instruction(vcpu, 0); 5647 } 5648 5649 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5650 { 5651 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5652 int vector = exit_qualification & 0xff; 5653 5654 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5655 kvm_apic_set_eoi_accelerated(vcpu, vector); 5656 return 1; 5657 } 5658 5659 static int handle_apic_write(struct kvm_vcpu *vcpu) 5660 { 5661 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5662 5663 /* 5664 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5665 * hardware has done any necessary aliasing, offset adjustments, etc... 5666 * for the access. I.e. the correct value has already been written to 5667 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5668 * retrieve the register value and emulate the access. 5669 */ 5670 u32 offset = exit_qualification & 0xff0; 5671 5672 kvm_apic_write_nodecode(vcpu, offset); 5673 return 1; 5674 } 5675 5676 static int handle_task_switch(struct kvm_vcpu *vcpu) 5677 { 5678 struct vcpu_vmx *vmx = to_vmx(vcpu); 5679 unsigned long exit_qualification; 5680 bool has_error_code = false; 5681 u32 error_code = 0; 5682 u16 tss_selector; 5683 int reason, type, idt_v, idt_index; 5684 5685 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5686 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5687 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5688 5689 exit_qualification = vmx_get_exit_qual(vcpu); 5690 5691 reason = (u32)exit_qualification >> 30; 5692 if (reason == TASK_SWITCH_GATE && idt_v) { 5693 switch (type) { 5694 case INTR_TYPE_NMI_INTR: 5695 vcpu->arch.nmi_injected = false; 5696 vmx_set_nmi_mask(vcpu, true); 5697 break; 5698 case INTR_TYPE_EXT_INTR: 5699 case INTR_TYPE_SOFT_INTR: 5700 kvm_clear_interrupt_queue(vcpu); 5701 break; 5702 case INTR_TYPE_HARD_EXCEPTION: 5703 if (vmx->idt_vectoring_info & 5704 VECTORING_INFO_DELIVER_CODE_MASK) { 5705 has_error_code = true; 5706 error_code = 5707 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5708 } 5709 fallthrough; 5710 case INTR_TYPE_SOFT_EXCEPTION: 5711 kvm_clear_exception_queue(vcpu); 5712 break; 5713 default: 5714 break; 5715 } 5716 } 5717 tss_selector = exit_qualification; 5718 5719 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5720 type != INTR_TYPE_EXT_INTR && 5721 type != INTR_TYPE_NMI_INTR)) 5722 WARN_ON(!skip_emulated_instruction(vcpu)); 5723 5724 /* 5725 * TODO: What about debug traps on tss switch? 5726 * Are we supposed to inject them and update dr6? 5727 */ 5728 return kvm_task_switch(vcpu, tss_selector, 5729 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 5730 reason, has_error_code, error_code); 5731 } 5732 5733 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5734 { 5735 unsigned long exit_qualification; 5736 gpa_t gpa; 5737 u64 error_code; 5738 5739 exit_qualification = vmx_get_exit_qual(vcpu); 5740 5741 /* 5742 * EPT violation happened while executing iret from NMI, 5743 * "blocked by NMI" bit has to be set before next VM entry. 5744 * There are errata that may cause this bit to not be set: 5745 * AAK134, BY25. 
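 *
 * Work around the errata by manually re-setting the "blocked by NMI"
 * bit in the guest interruptibility info (the vmcs_set_bits() call
 * below) when the exit qualification reports NMI unblocking due to
 * IRET and no event was being delivered when the exit occurred.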
5746 */ 5747 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5748 enable_vnmi && 5749 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5750 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5751 5752 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5753 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5754 5755 /* Is it a read fault? */ 5756 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5757 ? PFERR_USER_MASK : 0; 5758 /* Is it a write fault? */ 5759 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5760 ? PFERR_WRITE_MASK : 0; 5761 /* Is it a fetch fault? */ 5762 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5763 ? PFERR_FETCH_MASK : 0; 5764 /* ept page table entry is present? */ 5765 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5766 ? PFERR_PRESENT_MASK : 0; 5767 5768 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? 5769 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5770 5771 vcpu->arch.exit_qualification = exit_qualification; 5772 5773 /* 5774 * Check that the GPA doesn't exceed physical memory limits, as that is 5775 * a guest page fault. We have to emulate the instruction here, because 5776 * if the illegal address is that of a paging structure, then 5777 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5778 * would also use advanced VM-exit information for EPT violations to 5779 * reconstruct the page fault error code. 5780 */ 5781 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 5782 return kvm_emulate_instruction(vcpu, 0); 5783 5784 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5785 } 5786 5787 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5788 { 5789 gpa_t gpa; 5790 5791 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5792 return 1; 5793 5794 /* 5795 * A nested guest cannot optimize MMIO vmexits, because we have an 5796 * nGPA here instead of the required GPA. 
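 *
 * Consequently, the fast MMIO path below (kvm_io_bus_write() on
 * KVM_FAST_MMIO_BUS) is only attempted outside of guest mode; MMIO
 * exits that occur while L2 is running fall back to the slower
 * kvm_mmu_page_fault() path.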
5797 */ 5798 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5799 if (!is_guest_mode(vcpu) && 5800 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5801 trace_kvm_fast_mmio(gpa); 5802 return kvm_skip_emulated_instruction(vcpu); 5803 } 5804 5805 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5806 } 5807 5808 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5809 { 5810 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5811 return -EIO; 5812 5813 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5814 ++vcpu->stat.nmi_window_exits; 5815 kvm_make_request(KVM_REQ_EVENT, vcpu); 5816 5817 return 1; 5818 } 5819 5820 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) 5821 { 5822 struct vcpu_vmx *vmx = to_vmx(vcpu); 5823 5824 return vmx->emulation_required && !vmx->rmode.vm86_active && 5825 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5826 } 5827 5828 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5829 { 5830 struct vcpu_vmx *vmx = to_vmx(vcpu); 5831 bool intr_window_requested; 5832 unsigned count = 130; 5833 5834 intr_window_requested = exec_controls_get(vmx) & 5835 CPU_BASED_INTR_WINDOW_EXITING; 5836 5837 while (vmx->emulation_required && count-- != 0) { 5838 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5839 return handle_interrupt_window(&vmx->vcpu); 5840 5841 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5842 return 1; 5843 5844 if (!kvm_emulate_instruction(vcpu, 0)) 5845 return 0; 5846 5847 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5848 kvm_prepare_emulation_failure_exit(vcpu); 5849 return 0; 5850 } 5851 5852 if (vcpu->arch.halt_request) { 5853 vcpu->arch.halt_request = 0; 5854 return kvm_emulate_halt_noskip(vcpu); 5855 } 5856 5857 /* 5858 * Note, return 1 and not 0, vcpu_run() will invoke 5859 * xfer_to_guest_mode() which will create a proper return 5860 * code. 5861 */ 5862 if (__xfer_to_guest_mode_work_pending()) 5863 return 1; 5864 } 5865 5866 return 1; 5867 } 5868 5869 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5870 { 5871 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5872 kvm_prepare_emulation_failure_exit(vcpu); 5873 return 0; 5874 } 5875 5876 return 1; 5877 } 5878 5879 static void grow_ple_window(struct kvm_vcpu *vcpu) 5880 { 5881 struct vcpu_vmx *vmx = to_vmx(vcpu); 5882 unsigned int old = vmx->ple_window; 5883 5884 vmx->ple_window = __grow_ple_window(old, ple_window, 5885 ple_window_grow, 5886 ple_window_max); 5887 5888 if (vmx->ple_window != old) { 5889 vmx->ple_window_dirty = true; 5890 trace_kvm_ple_window_update(vcpu->vcpu_id, 5891 vmx->ple_window, old); 5892 } 5893 } 5894 5895 static void shrink_ple_window(struct kvm_vcpu *vcpu) 5896 { 5897 struct vcpu_vmx *vmx = to_vmx(vcpu); 5898 unsigned int old = vmx->ple_window; 5899 5900 vmx->ple_window = __shrink_ple_window(old, ple_window, 5901 ple_window_shrink, 5902 ple_window); 5903 5904 if (vmx->ple_window != old) { 5905 vmx->ple_window_dirty = true; 5906 trace_kvm_ple_window_update(vcpu->vcpu_id, 5907 vmx->ple_window, old); 5908 } 5909 } 5910 5911 /* 5912 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5913 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5914 */ 5915 static int handle_pause(struct kvm_vcpu *vcpu) 5916 { 5917 if (!kvm_pause_in_guest(vcpu->kvm)) 5918 grow_ple_window(vcpu); 5919 5920 /* 5921 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5922 * VM-execution control is ignored if CPL > 0. 
OTOH, KVM 5923 * never set PAUSE_EXITING and just set PLE if supported, 5924 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 5925 */ 5926 kvm_vcpu_on_spin(vcpu, true); 5927 return kvm_skip_emulated_instruction(vcpu); 5928 } 5929 5930 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5931 { 5932 return 1; 5933 } 5934 5935 static int handle_invpcid(struct kvm_vcpu *vcpu) 5936 { 5937 u32 vmx_instruction_info; 5938 unsigned long type; 5939 gva_t gva; 5940 struct { 5941 u64 pcid; 5942 u64 gla; 5943 } operand; 5944 int gpr_index; 5945 5946 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 5947 kvm_queue_exception(vcpu, UD_VECTOR); 5948 return 1; 5949 } 5950 5951 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5952 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5953 type = kvm_register_read(vcpu, gpr_index); 5954 5955 /* According to the Intel instruction reference, the memory operand 5956 * is read even if it isn't needed (e.g., for type==all) 5957 */ 5958 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5959 vmx_instruction_info, false, 5960 sizeof(operand), &gva)) 5961 return 1; 5962 5963 return kvm_handle_invpcid(vcpu, type, gva); 5964 } 5965 5966 static int handle_pml_full(struct kvm_vcpu *vcpu) 5967 { 5968 unsigned long exit_qualification; 5969 5970 trace_kvm_pml_full(vcpu->vcpu_id); 5971 5972 exit_qualification = vmx_get_exit_qual(vcpu); 5973 5974 /* 5975 * PML buffer FULL happened while executing iret from NMI, 5976 * "blocked by NMI" bit has to be set before next VM entry. 5977 */ 5978 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5979 enable_vnmi && 5980 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5981 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5982 GUEST_INTR_STATE_NMI); 5983 5984 /* 5985 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5986 * here.., and there's no userspace involvement needed for PML. 5987 */ 5988 return 1; 5989 } 5990 5991 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 5992 bool force_immediate_exit) 5993 { 5994 struct vcpu_vmx *vmx = to_vmx(vcpu); 5995 5996 /* 5997 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 5998 * due to the timer expiring while it was "soft" disabled, just eat the 5999 * exit and re-enter the guest. 6000 */ 6001 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 6002 return EXIT_FASTPATH_REENTER_GUEST; 6003 6004 /* 6005 * If the timer expired because KVM used it to force an immediate exit, 6006 * then mission accomplished. 6007 */ 6008 if (force_immediate_exit) 6009 return EXIT_FASTPATH_EXIT_HANDLED; 6010 6011 /* 6012 * If L2 is active, go down the slow path as emulating the guest timer 6013 * expiration likely requires synthesizing a nested VM-Exit. 6014 */ 6015 if (is_guest_mode(vcpu)) 6016 return EXIT_FASTPATH_NONE; 6017 6018 kvm_lapic_expired_hv_timer(vcpu); 6019 return EXIT_FASTPATH_REENTER_GUEST; 6020 } 6021 6022 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6023 { 6024 /* 6025 * This non-fastpath handler is reached if and only if the preemption 6026 * timer was being used to emulate a guest timer while L2 is active. 6027 * All other scenarios are supposed to be handled in the fastpath. 6028 */ 6029 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6030 kvm_lapic_expired_hv_timer(vcpu); 6031 return 1; 6032 } 6033 6034 /* 6035 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6036 * are overwritten by nested_vmx_setup() when nested=1. 
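 *
 * Injecting #UD mirrors the architectural behaviour of executing a VMX
 * instruction outside VMX operation, which is what a guest observes
 * when nested virtualization is disabled.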
6037 */ 6038 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6039 { 6040 kvm_queue_exception(vcpu, UD_VECTOR); 6041 return 1; 6042 } 6043 6044 #ifndef CONFIG_X86_SGX_KVM 6045 static int handle_encls(struct kvm_vcpu *vcpu) 6046 { 6047 /* 6048 * SGX virtualization is disabled. There is no software enable bit for 6049 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6050 * the guest from executing ENCLS (when SGX is supported by hardware). 6051 */ 6052 kvm_queue_exception(vcpu, UD_VECTOR); 6053 return 1; 6054 } 6055 #endif /* CONFIG_X86_SGX_KVM */ 6056 6057 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6058 { 6059 /* 6060 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6061 * VM-Exits. Unconditionally set the flag here and leave the handling to 6062 * vmx_handle_exit(). 6063 */ 6064 to_vmx(vcpu)->exit_reason.bus_lock_detected = true; 6065 return 1; 6066 } 6067 6068 static int handle_notify(struct kvm_vcpu *vcpu) 6069 { 6070 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6071 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6072 6073 ++vcpu->stat.notify_window_exits; 6074 6075 /* 6076 * Notify VM exit happened while executing iret from NMI, 6077 * "blocked by NMI" bit has to be set before next VM entry. 6078 */ 6079 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6080 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6081 GUEST_INTR_STATE_NMI); 6082 6083 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6084 context_invalid) { 6085 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6086 vcpu->run->notify.flags = context_invalid ? 6087 KVM_NOTIFY_CONTEXT_INVALID : 0; 6088 return 0; 6089 } 6090 6091 return 1; 6092 } 6093 6094 /* 6095 * The exit handlers return 1 if the exit was handled fully and guest execution 6096 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6097 * to be done to userspace and return 0. 
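 * A negative return value (e.g. -EIO from a KVM_BUG_ON() check) is
 * treated as an internal error and propagated out of the run loop.
 * Dispatch happens at the bottom of __vmx_handle_exit():
 *	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);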
6098 */ 6099 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6100 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6101 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6102 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6103 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6104 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6105 [EXIT_REASON_CR_ACCESS] = handle_cr, 6106 [EXIT_REASON_DR_ACCESS] = handle_dr, 6107 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6108 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6109 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6110 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6111 [EXIT_REASON_HLT] = kvm_emulate_halt, 6112 [EXIT_REASON_INVD] = kvm_emulate_invd, 6113 [EXIT_REASON_INVLPG] = handle_invlpg, 6114 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6115 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6116 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6117 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6118 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6119 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6120 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6121 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6122 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6123 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6124 [EXIT_REASON_VMON] = handle_vmx_instruction, 6125 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6126 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6127 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6128 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6129 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6130 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6131 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6132 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6133 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6134 [EXIT_REASON_LDTR_TR] = handle_desc, 6135 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6136 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6137 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6138 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6139 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6140 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6141 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6142 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6143 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6144 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6145 [EXIT_REASON_PML_FULL] = handle_pml_full, 6146 [EXIT_REASON_INVPCID] = handle_invpcid, 6147 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6148 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6149 [EXIT_REASON_ENCLS] = handle_encls, 6150 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6151 [EXIT_REASON_NOTIFY] = handle_notify, 6152 }; 6153 6154 static const int kvm_vmx_max_exit_handlers = 6155 ARRAY_SIZE(kvm_vmx_exit_handlers); 6156 6157 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6158 u64 *info1, u64 *info2, 6159 u32 *intr_info, u32 *error_code) 6160 { 6161 struct vcpu_vmx *vmx = to_vmx(vcpu); 6162 6163 *reason = vmx->exit_reason.full; 6164 *info1 = vmx_get_exit_qual(vcpu); 6165 if (!(vmx->exit_reason.failed_vmentry)) { 6166 *info2 = vmx->idt_vectoring_info; 6167 *intr_info = vmx_get_intr_info(vcpu); 6168 if (is_exception_with_error_code(*intr_info)) 6169 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6170 else 6171 *error_code = 0; 6172 } else { 6173 *info2 = 0; 6174 *intr_info = 0; 6175 *error_code = 0; 6176 } 6177 } 6178 6179 static void 
vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6180 { 6181 if (vmx->pml_pg) { 6182 __free_page(vmx->pml_pg); 6183 vmx->pml_pg = NULL; 6184 } 6185 } 6186 6187 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6188 { 6189 struct vcpu_vmx *vmx = to_vmx(vcpu); 6190 u64 *pml_buf; 6191 u16 pml_idx; 6192 6193 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6194 6195 /* Do nothing if PML buffer is empty */ 6196 if (pml_idx == (PML_ENTITY_NUM - 1)) 6197 return; 6198 6199 /* PML index always points to next available PML buffer entity */ 6200 if (pml_idx >= PML_ENTITY_NUM) 6201 pml_idx = 0; 6202 else 6203 pml_idx++; 6204 6205 pml_buf = page_address(vmx->pml_pg); 6206 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 6207 u64 gpa; 6208 6209 gpa = pml_buf[pml_idx]; 6210 WARN_ON(gpa & (PAGE_SIZE - 1)); 6211 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6212 } 6213 6214 /* reset PML index */ 6215 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 6216 } 6217 6218 static void vmx_dump_sel(char *name, uint32_t sel) 6219 { 6220 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6221 name, vmcs_read16(sel), 6222 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6223 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6224 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6225 } 6226 6227 static void vmx_dump_dtsel(char *name, uint32_t limit) 6228 { 6229 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6230 name, vmcs_read32(limit), 6231 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6232 } 6233 6234 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6235 { 6236 unsigned int i; 6237 struct vmx_msr_entry *e; 6238 6239 pr_err("MSR %s:\n", name); 6240 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6241 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6242 } 6243 6244 void dump_vmcs(struct kvm_vcpu *vcpu) 6245 { 6246 struct vcpu_vmx *vmx = to_vmx(vcpu); 6247 u32 vmentry_ctl, vmexit_ctl; 6248 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6249 u64 tertiary_exec_control; 6250 unsigned long cr4; 6251 int efer_slot; 6252 6253 if (!dump_invalid_vmcs) { 6254 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6255 return; 6256 } 6257 6258 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6259 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6260 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6261 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6262 cr4 = vmcs_readl(GUEST_CR4); 6263 6264 if (cpu_has_secondary_exec_ctrls()) 6265 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6266 else 6267 secondary_exec_control = 0; 6268 6269 if (cpu_has_tertiary_exec_ctrls()) 6270 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6271 else 6272 tertiary_exec_control = 0; 6273 6274 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6275 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6276 pr_err("*** Guest State ***\n"); 6277 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6278 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6279 vmcs_readl(CR0_GUEST_HOST_MASK)); 6280 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6281 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6282 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6283 if (cpu_has_vmx_ept()) { 6284 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6285 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6286 pr_err("PDPTR2 = 0x%016llx PDPTR3 
= 0x%016llx\n", 6287 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6288 } 6289 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6290 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6291 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6292 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6293 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6294 vmcs_readl(GUEST_SYSENTER_ESP), 6295 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6296 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6297 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6298 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6299 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6300 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6301 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6302 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6303 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6304 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6305 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6306 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6307 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6308 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6309 else if (efer_slot >= 0) 6310 pr_err("EFER= 0x%016llx (autoload)\n", 6311 vmx->msr_autoload.guest.val[efer_slot].value); 6312 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6313 pr_err("EFER= 0x%016llx (effective)\n", 6314 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6315 else 6316 pr_err("EFER= 0x%016llx (effective)\n", 6317 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6318 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6319 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6320 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6321 vmcs_read64(GUEST_IA32_DEBUGCTL), 6322 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6323 if (cpu_has_load_perf_global_ctrl() && 6324 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6325 pr_err("PerfGlobCtl = 0x%016llx\n", 6326 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6327 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6328 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6329 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6330 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6331 vmcs_read32(GUEST_ACTIVITY_STATE)); 6332 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6333 pr_err("InterruptStatus = %04x\n", 6334 vmcs_read16(GUEST_INTR_STATUS)); 6335 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6336 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6337 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6338 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6339 6340 pr_err("*** Host State ***\n"); 6341 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6342 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6343 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6344 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6345 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6346 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6347 vmcs_read16(HOST_TR_SELECTOR)); 6348 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6349 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6350 vmcs_readl(HOST_TR_BASE)); 6351 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6352 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6353 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6354 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6355 vmcs_readl(HOST_CR4)); 6356 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6357 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6358 vmcs_read32(HOST_IA32_SYSENTER_CS), 6359 
vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6360 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6361 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6362 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6363 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6364 if (cpu_has_load_perf_global_ctrl() && 6365 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6366 pr_err("PerfGlobCtl = 0x%016llx\n", 6367 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6368 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6369 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6370 6371 pr_err("*** Control State ***\n"); 6372 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6373 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6374 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6375 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6376 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6377 vmcs_read32(EXCEPTION_BITMAP), 6378 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6379 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6380 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6381 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6382 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6383 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6384 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6385 vmcs_read32(VM_EXIT_INTR_INFO), 6386 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6387 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6388 pr_err(" reason=%08x qualification=%016lx\n", 6389 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6390 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6391 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6392 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6393 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6394 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6395 pr_err("TSC Multiplier = 0x%016llx\n", 6396 vmcs_read64(TSC_MULTIPLIER)); 6397 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6398 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6399 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6400 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6401 } 6402 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6403 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6404 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6405 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6406 } 6407 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6408 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6409 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6410 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6411 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6412 pr_err("PLE Gap=%08x Window=%08x\n", 6413 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6414 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6415 pr_err("Virtual processor ID = 0x%04x\n", 6416 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6417 } 6418 6419 /* 6420 * The guest has exited. See if we can fix it or if we need userspace 6421 * assistance. 6422 */ 6423 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6424 { 6425 struct vcpu_vmx *vmx = to_vmx(vcpu); 6426 union vmx_exit_reason exit_reason = vmx->exit_reason; 6427 u32 vectoring_info = vmx->idt_vectoring_info; 6428 u16 exit_handler_index; 6429 6430 /* 6431 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 6432 * updated. 
Another benefit: in kvm_vm_ioctl_get_dirty_log, before
6433 * querying dirty_bitmap, we only need to kick all vcpus out of guest
6434 * mode, because once a vcpu is in root mode its PML buffer must have
6435 * been flushed already. Note, PML is never enabled in hardware while
6436 * running L2.
6437 */
6438 if (enable_pml && !is_guest_mode(vcpu))
6439 vmx_flush_pml_buffer(vcpu);
6440
6441 /*
6442 * KVM should never reach this point with a pending nested VM-Enter.
6443 * More specifically, short-circuiting VM-Entry to emulate L2 due to
6444 * invalid guest state should never happen as that means KVM knowingly
6445 * allowed a nested VM-Enter with an invalid vmcs12. More below.
6446 */
6447 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6448 return -EIO;
6449
6450 if (is_guest_mode(vcpu)) {
6451 /*
6452 * PML is never enabled when running L2, bail immediately if a
6453 * PML full exit occurs as something is horribly wrong.
6454 */
6455 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6456 goto unexpected_vmexit;
6457
6458 /*
6459 * The host physical addresses of some pages of guest memory
6460 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6461 * Page). The CPU may write to these pages via their host
6462 * physical address while L2 is running, bypassing any
6463 * address-translation-based dirty tracking (e.g. EPT write
6464 * protection).
6465 *
6466 * Mark them dirty on every exit from L2 to prevent them from
6467 * getting out of sync with dirty tracking.
6468 */
6469 nested_mark_vmcs12_pages_dirty(vcpu);
6470
6471 /*
6472 * Synthesize a triple fault if L2 state is invalid. In normal
6473 * operation, nested VM-Enter rejects any attempt to enter L2
6474 * with invalid state. However, those checks are skipped if
6475 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6476 * L2 state is invalid, it means either L1 modified SMRAM state
6477 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
6478 * doing so is architecturally allowed in the RSM case, and is
6479 * the least awful solution for the userspace case without
6480 * risking false positives.
6481 */
6482 if (vmx->emulation_required) {
6483 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6484 return 1;
6485 }
6486
6487 if (nested_vmx_reflect_vmexit(vcpu))
6488 return 1;
6489 }
6490
6491 /* If guest state is invalid, start emulating. L2 is handled above. */
6492 if (vmx->emulation_required)
6493 return handle_invalid_guest_state(vcpu);
6494
6495 if (exit_reason.failed_vmentry) {
6496 dump_vmcs(vcpu);
6497 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6498 vcpu->run->fail_entry.hardware_entry_failure_reason
6499 = exit_reason.full;
6500 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6501 return 0;
6502 }
6503
6504 if (unlikely(vmx->fail)) {
6505 dump_vmcs(vcpu);
6506 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6507 vcpu->run->fail_entry.hardware_entry_failure_reason
6508 = vmcs_read32(VM_INSTRUCTION_ERROR);
6509 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6510 return 0;
6511 }
6512
6513 /*
6514 * Note:
6515 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6516 * event delivery, since that indicates the guest is accessing MMIO.
6517 * The VM-exit would be triggered again after returning to the guest,
6518 * causing an infinite loop.
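 * Hence the check below: if an exit other than the small set of
 * expected reasons occurred during event delivery, bail to userspace
 * with KVM_INTERNAL_ERROR_DELIVERY_EV and pack the vectoring info,
 * exit reason and exit qualification into internal.data[] for
 * debugging.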
6519 */
6520 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6521 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6522 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6523 exit_reason.basic != EXIT_REASON_PML_FULL &&
6524 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6525 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6526 exit_reason.basic != EXIT_REASON_NOTIFY)) {
6527 int ndata = 3;
6528
6529 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6530 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6531 vcpu->run->internal.data[0] = vectoring_info;
6532 vcpu->run->internal.data[1] = exit_reason.full;
6533 vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
6534 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6535 vcpu->run->internal.data[ndata++] =
6536 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6537 }
6538 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6539 vcpu->run->internal.ndata = ndata;
6540 return 0;
6541 }
6542
6543 if (unlikely(!enable_vnmi &&
6544 vmx->loaded_vmcs->soft_vnmi_blocked)) {
6545 if (!vmx_interrupt_blocked(vcpu)) {
6546 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6547 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6548 vcpu->arch.nmi_pending) {
6549 /*
6550 * This CPU doesn't let us find the end of an
6551 * NMI-blocked window if the guest runs with IRQs
6552 * disabled. So we pull the trigger after 1 s of
6553 * futile waiting, but inform the user about it.
6554 */
6555 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6556 "state on VCPU %d after 1 s timeout\n",
6557 __func__, vcpu->vcpu_id);
6558 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6559 }
6560 }
6561
6562 if (exit_fastpath != EXIT_FASTPATH_NONE)
6563 return 1;
6564
6565 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6566 goto unexpected_vmexit;
6567 #ifdef CONFIG_MITIGATION_RETPOLINE
6568 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6569 return kvm_emulate_wrmsr(vcpu);
6570 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6571 return handle_preemption_timer(vcpu);
6572 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6573 return handle_interrupt_window(vcpu);
6574 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6575 return handle_external_interrupt(vcpu);
6576 else if (exit_reason.basic == EXIT_REASON_HLT)
6577 return kvm_emulate_halt(vcpu);
6578 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6579 return handle_ept_misconfig(vcpu);
6580 #endif
6581
6582 exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6583 kvm_vmx_max_exit_handlers);
6584 if (!kvm_vmx_exit_handlers[exit_handler_index])
6585 goto unexpected_vmexit;
6586
6587 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6588
6589 unexpected_vmexit:
6590 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6591 exit_reason.full);
6592 dump_vmcs(vcpu);
6593 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6594 vcpu->run->internal.suberror =
6595 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6596 vcpu->run->internal.ndata = 2;
6597 vcpu->run->internal.data[0] = exit_reason.full;
6598 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6599 return 0;
6600 }
6601
6602 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6603 {
6604 int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6605
6606 /*
6607 * Exit to user space when a bus lock is detected, to inform userspace
6608 * that a bus lock occurred in the guest.
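 * The KVM_RUN_X86_BUS_LOCK flag is set unconditionally below; the exit
 * reason is only overwritten when the underlying handler returned a
 * positive value, i.e. when it did not already request its own exit to
 * userspace.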
6609 */ 6610 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { 6611 if (ret > 0) 6612 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6613 6614 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6615 return 0; 6616 } 6617 return ret; 6618 } 6619 6620 /* 6621 * Software based L1D cache flush which is used when microcode providing 6622 * the cache control MSR is not loaded. 6623 * 6624 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6625 * flush it is required to read in 64 KiB because the replacement algorithm 6626 * is not exactly LRU. This could be sized at runtime via topology 6627 * information but as all relevant affected CPUs have 32KiB L1D cache size 6628 * there is no point in doing so. 6629 */ 6630 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6631 { 6632 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6633 6634 /* 6635 * This code is only executed when the flush mode is 'cond' or 6636 * 'always' 6637 */ 6638 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6639 bool flush_l1d; 6640 6641 /* 6642 * Clear the per-vcpu flush bit, it gets set again 6643 * either from vcpu_run() or from one of the unsafe 6644 * VMEXIT handlers. 6645 */ 6646 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6647 vcpu->arch.l1tf_flush_l1d = false; 6648 6649 /* 6650 * Clear the per-cpu flush bit, it gets set again from 6651 * the interrupt handlers. 6652 */ 6653 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6654 kvm_clear_cpu_l1tf_flush_l1d(); 6655 6656 if (!flush_l1d) 6657 return; 6658 } 6659 6660 vcpu->stat.l1d_flush++; 6661 6662 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6663 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6664 return; 6665 } 6666 6667 asm volatile( 6668 /* First ensure the pages are in the TLB */ 6669 "xorl %%eax, %%eax\n" 6670 ".Lpopulate_tlb:\n\t" 6671 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6672 "addl $4096, %%eax\n\t" 6673 "cmpl %%eax, %[size]\n\t" 6674 "jne .Lpopulate_tlb\n\t" 6675 "xorl %%eax, %%eax\n\t" 6676 "cpuid\n\t" 6677 /* Now fill the cache */ 6678 "xorl %%eax, %%eax\n" 6679 ".Lfill_cache:\n" 6680 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6681 "addl $64, %%eax\n\t" 6682 "cmpl %%eax, %[size]\n\t" 6683 "jne .Lfill_cache\n\t" 6684 "lfence\n" 6685 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6686 [size] "r" (size) 6687 : "eax", "ebx", "ecx", "edx"); 6688 } 6689 6690 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6691 { 6692 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6693 int tpr_threshold; 6694 6695 if (is_guest_mode(vcpu) && 6696 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6697 return; 6698 6699 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6700 if (is_guest_mode(vcpu)) 6701 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6702 else 6703 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6704 } 6705 6706 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6707 { 6708 struct vcpu_vmx *vmx = to_vmx(vcpu); 6709 u32 sec_exec_control; 6710 6711 if (!lapic_in_kernel(vcpu)) 6712 return; 6713 6714 if (!flexpriority_enabled && 6715 !cpu_has_vmx_virtualize_x2apic_mode()) 6716 return; 6717 6718 /* Postpone execution until vmcs01 is the current VMCS. 
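 * While L2 is active, only record the request in
 * nested.change_vmcs01_virtual_apic_mode; the vmcs01 controls are
 * presumably updated once KVM switches back to vmcs01.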
*/ 6719 if (is_guest_mode(vcpu)) { 6720 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6721 return; 6722 } 6723 6724 sec_exec_control = secondary_exec_controls_get(vmx); 6725 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6726 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6727 6728 switch (kvm_get_apic_mode(vcpu)) { 6729 case LAPIC_MODE_INVALID: 6730 WARN_ONCE(true, "Invalid local APIC state"); 6731 break; 6732 case LAPIC_MODE_DISABLED: 6733 break; 6734 case LAPIC_MODE_XAPIC: 6735 if (flexpriority_enabled) { 6736 sec_exec_control |= 6737 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6738 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6739 6740 /* 6741 * Flush the TLB, reloading the APIC access page will 6742 * only do so if its physical address has changed, but 6743 * the guest may have inserted a non-APIC mapping into 6744 * the TLB while the APIC access page was disabled. 6745 */ 6746 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6747 } 6748 break; 6749 case LAPIC_MODE_X2APIC: 6750 if (cpu_has_vmx_virtualize_x2apic_mode()) 6751 sec_exec_control |= 6752 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6753 break; 6754 } 6755 secondary_exec_controls_set(vmx, sec_exec_control); 6756 6757 vmx_update_msr_bitmap_x2apic(vcpu); 6758 } 6759 6760 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6761 { 6762 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6763 struct kvm *kvm = vcpu->kvm; 6764 struct kvm_memslots *slots = kvm_memslots(kvm); 6765 struct kvm_memory_slot *slot; 6766 unsigned long mmu_seq; 6767 kvm_pfn_t pfn; 6768 6769 /* Defer reload until vmcs01 is the current VMCS. */ 6770 if (is_guest_mode(vcpu)) { 6771 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6772 return; 6773 } 6774 6775 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6776 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6777 return; 6778 6779 /* 6780 * Explicitly grab the memslot using KVM's internal slot ID to ensure 6781 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 6782 * be impossible for userspace to create a memslot for the APIC when 6783 * APICv is enabled, but paranoia won't hurt in this case. 6784 */ 6785 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6786 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6787 return; 6788 6789 /* 6790 * Ensure that the mmu_notifier sequence count is read before KVM 6791 * retrieves the pfn from the primary MMU. Note, the memslot is 6792 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6793 * in kvm_mmu_invalidate_end(). 6794 */ 6795 mmu_seq = kvm->mmu_invalidate_seq; 6796 smp_rmb(); 6797 6798 /* 6799 * No need to retry if the memslot does not exist or is invalid. KVM 6800 * controls the APIC-access page memslot, and only deletes the memslot 6801 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6802 */ 6803 pfn = gfn_to_pfn_memslot(slot, gfn); 6804 if (is_error_noslot_pfn(pfn)) 6805 return; 6806 6807 read_lock(&vcpu->kvm->mmu_lock); 6808 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { 6809 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6810 read_unlock(&vcpu->kvm->mmu_lock); 6811 goto out; 6812 } 6813 6814 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6815 read_unlock(&vcpu->kvm->mmu_lock); 6816 6817 /* 6818 * No need for a manual TLB flush at this point, KVM has already done a 6819 * flush if there were SPTEs pointing at the previous page. 
6820 */ 6821 out: 6822 /* 6823 * Do not pin apic access page in memory, the MMU notifier 6824 * will call us again if it is migrated or swapped out. 6825 */ 6826 kvm_release_pfn_clean(pfn); 6827 } 6828 6829 static void vmx_hwapic_isr_update(int max_isr) 6830 { 6831 u16 status; 6832 u8 old; 6833 6834 if (max_isr == -1) 6835 max_isr = 0; 6836 6837 status = vmcs_read16(GUEST_INTR_STATUS); 6838 old = status >> 8; 6839 if (max_isr != old) { 6840 status &= 0xff; 6841 status |= max_isr << 8; 6842 vmcs_write16(GUEST_INTR_STATUS, status); 6843 } 6844 } 6845 6846 static void vmx_set_rvi(int vector) 6847 { 6848 u16 status; 6849 u8 old; 6850 6851 if (vector == -1) 6852 vector = 0; 6853 6854 status = vmcs_read16(GUEST_INTR_STATUS); 6855 old = (u8)status & 0xff; 6856 if ((u8)vector != old) { 6857 status &= ~0xff; 6858 status |= (u8)vector; 6859 vmcs_write16(GUEST_INTR_STATUS, status); 6860 } 6861 } 6862 6863 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6864 { 6865 /* 6866 * When running L2, updating RVI is only relevant when 6867 * vmcs12 virtual-interrupt-delivery enabled. 6868 * However, it can be enabled only when L1 also 6869 * intercepts external-interrupts and in that case 6870 * we should not update vmcs02 RVI but instead intercept 6871 * interrupt. Therefore, do nothing when running L2. 6872 */ 6873 if (!is_guest_mode(vcpu)) 6874 vmx_set_rvi(max_irr); 6875 } 6876 6877 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6878 { 6879 struct vcpu_vmx *vmx = to_vmx(vcpu); 6880 int max_irr; 6881 bool got_posted_interrupt; 6882 6883 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6884 return -EIO; 6885 6886 if (pi_test_on(&vmx->pi_desc)) { 6887 pi_clear_on(&vmx->pi_desc); 6888 /* 6889 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6890 * But on x86 this is just a compiler barrier anyway. 6891 */ 6892 smp_mb__after_atomic(); 6893 got_posted_interrupt = 6894 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6895 } else { 6896 max_irr = kvm_lapic_find_highest_irr(vcpu); 6897 got_posted_interrupt = false; 6898 } 6899 6900 /* 6901 * Newly recognized interrupts are injected via either virtual interrupt 6902 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6903 * disabled in two cases: 6904 * 6905 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6906 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6907 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected 6908 * into L2, but KVM doesn't use virtual interrupt delivery to inject 6909 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 6910 * 6911 * 2) If APICv is disabled for this vCPU, assigned devices may still 6912 * attempt to post interrupts. The posted interrupt vector will cause 6913 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 
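 *
 * In either case, KVM_REQ_EVENT (raised below when a posted interrupt
 * was found) ensures the new interrupt is evaluated and injected on
 * the next VM-Enter.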
6914 */ 6915 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6916 vmx_set_rvi(max_irr); 6917 else if (got_posted_interrupt) 6918 kvm_make_request(KVM_REQ_EVENT, vcpu); 6919 6920 return max_irr; 6921 } 6922 6923 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6924 { 6925 if (!kvm_vcpu_apicv_active(vcpu)) 6926 return; 6927 6928 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6929 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6930 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6931 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6932 } 6933 6934 static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) 6935 { 6936 struct vcpu_vmx *vmx = to_vmx(vcpu); 6937 6938 pi_clear_on(&vmx->pi_desc); 6939 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6940 } 6941 6942 void vmx_do_interrupt_irqoff(unsigned long entry); 6943 void vmx_do_nmi_irqoff(void); 6944 6945 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6946 { 6947 /* 6948 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6949 * MSR value is not clobbered by the host activity before the guest 6950 * has chance to consume it. 6951 * 6952 * Do not blindly read xfd_err here, since this exception might 6953 * be caused by L1 interception on a platform which doesn't 6954 * support xfd at all. 6955 * 6956 * Do it conditionally upon guest_fpu::xfd. xfd_err matters 6957 * only when xfd contains a non-zero value. 6958 * 6959 * Queuing exception is done in vmx_handle_exit. See comment there. 6960 */ 6961 if (vcpu->arch.guest_fpu.fpstate->xfd) 6962 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6963 } 6964 6965 static void handle_exception_irqoff(struct vcpu_vmx *vmx) 6966 { 6967 u32 intr_info = vmx_get_intr_info(&vmx->vcpu); 6968 6969 /* if exit due to PF check for async PF */ 6970 if (is_page_fault(intr_info)) 6971 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6972 /* if exit due to NM, handle before interrupts are enabled */ 6973 else if (is_nm_fault(intr_info)) 6974 handle_nm_fault_irqoff(&vmx->vcpu); 6975 /* Handle machine checks before interrupts are enabled */ 6976 else if (is_machine_check(intr_info)) 6977 kvm_machine_check(); 6978 } 6979 6980 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) 6981 { 6982 u32 intr_info = vmx_get_intr_info(vcpu); 6983 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6984 6985 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 6986 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6987 return; 6988 6989 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 6990 if (cpu_feature_enabled(X86_FEATURE_FRED)) 6991 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 6992 else 6993 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 6994 kvm_after_interrupt(vcpu); 6995 6996 vcpu->arch.at_instruction_boundary = true; 6997 } 6998 6999 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7000 { 7001 struct vcpu_vmx *vmx = to_vmx(vcpu); 7002 7003 if (vmx->emulation_required) 7004 return; 7005 7006 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7007 handle_external_interrupt_irqoff(vcpu); 7008 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) 7009 handle_exception_irqoff(vmx); 7010 } 7011 7012 /* 7013 * The kvm parameter can be NULL (module initialization, or invocation before 7014 * VM creation). Be sure to check the kvm parameter before using it. 
7015 */ 7016 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7017 { 7018 switch (index) { 7019 case MSR_IA32_SMBASE: 7020 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7021 return false; 7022 /* 7023 * We cannot do SMM unless we can run the guest in big 7024 * real mode. 7025 */ 7026 return enable_unrestricted_guest || emulate_invalid_guest_state; 7027 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7028 return nested; 7029 case MSR_AMD64_VIRT_SPEC_CTRL: 7030 case MSR_AMD64_TSC_RATIO: 7031 /* This is AMD only. */ 7032 return false; 7033 default: 7034 return true; 7035 } 7036 } 7037 7038 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7039 { 7040 u32 exit_intr_info; 7041 bool unblock_nmi; 7042 u8 vector; 7043 bool idtv_info_valid; 7044 7045 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7046 7047 if (enable_vnmi) { 7048 if (vmx->loaded_vmcs->nmi_known_unmasked) 7049 return; 7050 7051 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7052 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7053 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7054 /* 7055 * SDM 3: 27.7.1.2 (September 2008) 7056 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7057 * a guest IRET fault. 7058 * SDM 3: 23.2.2 (September 2008) 7059 * Bit 12 is undefined in any of the following cases: 7060 * If the VM exit sets the valid bit in the IDT-vectoring 7061 * information field. 7062 * If the VM exit is due to a double fault. 7063 */ 7064 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7065 vector != DF_VECTOR && !idtv_info_valid) 7066 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7067 GUEST_INTR_STATE_NMI); 7068 else 7069 vmx->loaded_vmcs->nmi_known_unmasked = 7070 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7071 & GUEST_INTR_STATE_NMI); 7072 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7073 vmx->loaded_vmcs->vnmi_blocked_time += 7074 ktime_to_ns(ktime_sub(ktime_get(), 7075 vmx->loaded_vmcs->entry_time)); 7076 } 7077 7078 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7079 u32 idt_vectoring_info, 7080 int instr_len_field, 7081 int error_code_field) 7082 { 7083 u8 vector; 7084 int type; 7085 bool idtv_info_valid; 7086 7087 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7088 7089 vcpu->arch.nmi_injected = false; 7090 kvm_clear_exception_queue(vcpu); 7091 kvm_clear_interrupt_queue(vcpu); 7092 7093 if (!idtv_info_valid) 7094 return; 7095 7096 kvm_make_request(KVM_REQ_EVENT, vcpu); 7097 7098 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7099 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7100 7101 switch (type) { 7102 case INTR_TYPE_NMI_INTR: 7103 vcpu->arch.nmi_injected = true; 7104 /* 7105 * SDM 3: 27.7.1.2 (September 2008) 7106 * Clear bit "block by NMI" before VM entry if a NMI 7107 * delivery faulted. 
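 * This is the counterpart of vmx_recover_nmi_blocking() above, which
 * re-sets the bit when a VM-exit is caused by a fault during the
 * guest's IRET.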
7108 */ 7109 vmx_set_nmi_mask(vcpu, false); 7110 break; 7111 case INTR_TYPE_SOFT_EXCEPTION: 7112 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7113 fallthrough; 7114 case INTR_TYPE_HARD_EXCEPTION: 7115 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 7116 u32 err = vmcs_read32(error_code_field); 7117 kvm_requeue_exception_e(vcpu, vector, err); 7118 } else 7119 kvm_requeue_exception(vcpu, vector); 7120 break; 7121 case INTR_TYPE_SOFT_INTR: 7122 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7123 fallthrough; 7124 case INTR_TYPE_EXT_INTR: 7125 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7126 break; 7127 default: 7128 break; 7129 } 7130 } 7131 7132 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7133 { 7134 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7135 VM_EXIT_INSTRUCTION_LEN, 7136 IDT_VECTORING_ERROR_CODE); 7137 } 7138 7139 static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7140 { 7141 __vmx_complete_interrupts(vcpu, 7142 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7143 VM_ENTRY_INSTRUCTION_LEN, 7144 VM_ENTRY_EXCEPTION_ERROR_CODE); 7145 7146 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7147 } 7148 7149 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7150 { 7151 int i, nr_msrs; 7152 struct perf_guest_switch_msr *msrs; 7153 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7154 7155 pmu->host_cross_mapped_mask = 0; 7156 if (pmu->pebs_enable & pmu->global_ctrl) 7157 intel_pmu_cross_mapped_check(pmu); 7158 7159 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ 7160 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7161 if (!msrs) 7162 return; 7163 7164 for (i = 0; i < nr_msrs; i++) 7165 if (msrs[i].host == msrs[i].guest) 7166 clear_atomic_switch_msr(vmx, msrs[i].msr); 7167 else 7168 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7169 msrs[i].host, false); 7170 } 7171 7172 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7173 { 7174 struct vcpu_vmx *vmx = to_vmx(vcpu); 7175 u64 tscl; 7176 u32 delta_tsc; 7177 7178 if (force_immediate_exit) { 7179 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7180 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7181 } else if (vmx->hv_deadline_tsc != -1) { 7182 tscl = rdtsc(); 7183 if (vmx->hv_deadline_tsc > tscl) 7184 /* set_hv_timer ensures the delta fits in 32-bits */ 7185 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7186 cpu_preemption_timer_multi); 7187 else 7188 delta_tsc = 0; 7189 7190 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7191 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7192 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7193 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7194 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7195 } 7196 } 7197 7198 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7199 { 7200 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7201 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7202 vmcs_writel(HOST_RSP, host_rsp); 7203 } 7204 } 7205 7206 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7207 unsigned int flags) 7208 { 7209 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7210 7211 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7212 return; 7213 7214 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7215 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); 7216 7217 /* 7218 * If the guest/host SPEC_CTRL values differ, restore the host value. 
7219 * 7220 * For legacy IBRS, the IBRS bit always needs to be written after 7221 * transitioning from a less privileged predictor mode, regardless of 7222 * whether the guest/host values differ. 7223 */ 7224 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7225 vmx->spec_ctrl != hostval) 7226 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); 7227 7228 barrier_nospec(); 7229 } 7230 7231 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, 7232 bool force_immediate_exit) 7233 { 7234 /* 7235 * If L2 is active, some VMX preemption timer exits can be handled in 7236 * the fastpath even, all other exits must use the slow path. 7237 */ 7238 if (is_guest_mode(vcpu) && 7239 to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) 7240 return EXIT_FASTPATH_NONE; 7241 7242 switch (to_vmx(vcpu)->exit_reason.basic) { 7243 case EXIT_REASON_MSR_WRITE: 7244 return handle_fastpath_set_msr_irqoff(vcpu); 7245 case EXIT_REASON_PREEMPTION_TIMER: 7246 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7247 default: 7248 return EXIT_FASTPATH_NONE; 7249 } 7250 } 7251 7252 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7253 unsigned int flags) 7254 { 7255 struct vcpu_vmx *vmx = to_vmx(vcpu); 7256 7257 guest_state_enter_irqoff(); 7258 7259 /* 7260 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7261 * mitigation for MDS is done late in VMentry and is still 7262 * executed in spite of L1D Flush. This is because an extra VERW 7263 * should not matter much after the big hammer L1D Flush. 7264 */ 7265 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7266 vmx_l1d_flush(vcpu); 7267 else if (static_branch_unlikely(&mmio_stale_data_clear) && 7268 kvm_arch_has_assigned_device(vcpu->kvm)) 7269 mds_clear_cpu_buffers(); 7270 7271 vmx_disable_fb_clear(vmx); 7272 7273 if (vcpu->arch.cr2 != native_read_cr2()) 7274 native_write_cr2(vcpu->arch.cr2); 7275 7276 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7277 flags); 7278 7279 vcpu->arch.cr2 = native_read_cr2(); 7280 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7281 7282 vmx->idt_vectoring_info = 0; 7283 7284 vmx_enable_fb_clear(vmx); 7285 7286 if (unlikely(vmx->fail)) { 7287 vmx->exit_reason.full = 0xdead; 7288 goto out; 7289 } 7290 7291 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7292 if (likely(!vmx->exit_reason.failed_vmentry)) 7293 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7294 7295 if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && 7296 is_nmi(vmx_get_intr_info(vcpu))) { 7297 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 7298 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7299 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); 7300 else 7301 vmx_do_nmi_irqoff(); 7302 kvm_after_interrupt(vcpu); 7303 } 7304 7305 out: 7306 guest_state_exit_irqoff(); 7307 } 7308 7309 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7310 { 7311 struct vcpu_vmx *vmx = to_vmx(vcpu); 7312 unsigned long cr3, cr4; 7313 7314 /* Record the guest's net vcpu time for enforced NMI injections. */ 7315 if (unlikely(!enable_vnmi && 7316 vmx->loaded_vmcs->soft_vnmi_blocked)) 7317 vmx->loaded_vmcs->entry_time = ktime_get(); 7318 7319 /* 7320 * Don't enter VMX if guest state is invalid, let the exit handler 7321 * start emulation until we arrive back to a valid state. Synthesize a 7322 * consistency check VM-Exit due to invalid guest state and bail. 
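 * The synthesized exit below reports EXIT_REASON_INVALID_STATE as a failed
 * VM-Entry with exit qualification ENTRY_FAIL_DEFAULT and no interrupt info.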
7323 */ 7324 if (unlikely(vmx->emulation_required)) { 7325 vmx->fail = 0; 7326 7327 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 7328 vmx->exit_reason.failed_vmentry = 1; 7329 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7330 vmx->exit_qualification = ENTRY_FAIL_DEFAULT; 7331 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7332 vmx->exit_intr_info = 0; 7333 return EXIT_FASTPATH_NONE; 7334 } 7335 7336 trace_kvm_entry(vcpu, force_immediate_exit); 7337 7338 if (vmx->ple_window_dirty) { 7339 vmx->ple_window_dirty = false; 7340 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7341 } 7342 7343 /* 7344 * We did this in prepare_switch_to_guest, because it needs to 7345 * be within srcu_read_lock. 7346 */ 7347 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7348 7349 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7350 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7351 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7352 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7353 vcpu->arch.regs_dirty = 0; 7354 7355 /* 7356 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7357 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7358 * it switches back to the current->mm, which can occur in KVM context 7359 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7360 * toggles a static key while handling a VM-Exit. 7361 */ 7362 cr3 = __get_current_cr3_fast(); 7363 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7364 vmcs_writel(HOST_CR3, cr3); 7365 vmx->loaded_vmcs->host_state.cr3 = cr3; 7366 } 7367 7368 cr4 = cr4_read_shadow(); 7369 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7370 vmcs_writel(HOST_CR4, cr4); 7371 vmx->loaded_vmcs->host_state.cr4 = cr4; 7372 } 7373 7374 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ 7375 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 7376 set_debugreg(vcpu->arch.dr6, 6); 7377 7378 /* When single-stepping over STI and MOV SS, we must clear the 7379 * corresponding interruptibility bits in the guest state. Otherwise 7380 * vmentry fails as it then expects bit 14 (BS) in pending debug 7381 * exceptions being set, but that's not correct for the guest debugging 7382 * case. */ 7383 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7384 vmx_set_interrupt_shadow(vcpu, 0); 7385 7386 kvm_load_guest_xsave_state(vcpu); 7387 7388 pt_guest_enter(vmx); 7389 7390 atomic_switch_perf_msrs(vmx); 7391 if (intel_pmu_lbr_is_enabled(vcpu)) 7392 vmx_passthrough_lbr_msrs(vcpu); 7393 7394 if (enable_preemption_timer) 7395 vmx_update_hv_timer(vcpu, force_immediate_exit); 7396 else if (force_immediate_exit) 7397 smp_send_reschedule(vcpu->cpu); 7398 7399 kvm_wait_lapic_expire(vcpu); 7400 7401 /* The actual VMENTER/EXIT is in the .noinstr.text section. */ 7402 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); 7403 7404 /* All fields are clean at this point */ 7405 if (kvm_is_using_evmcs()) { 7406 current_evmcs->hv_clean_fields |= 7407 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 7408 7409 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); 7410 } 7411 7412 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7413 if (vmx->host_debugctlmsr) 7414 update_debugctlmsr(vmx->host_debugctlmsr); 7415 7416 #ifndef CONFIG_X86_64 7417 /* 7418 * The sysexit path does not restore ds/es, so we must set them to 7419 * a reasonable value ourselves. 
7420 *
7421 * We can't defer this to vmx_prepare_switch_to_host() since that
7422 * function may be executed in interrupt context, which saves and
7423 * restores segments around it, nullifying its effect.
7424 */
7425 loadsegment(ds, __USER_DS);
7426 loadsegment(es, __USER_DS);
7427 #endif
7428 
7429 pt_guest_exit(vmx);
7430 
7431 kvm_load_host_xsave_state(vcpu);
7432 
7433 if (is_guest_mode(vcpu)) {
7434 /*
7435 * Track VMLAUNCH/VMRESUME that have made it past guest state
7436 * checking.
7437 */
7438 if (vmx->nested.nested_run_pending &&
7439 !vmx->exit_reason.failed_vmentry)
7440 ++vcpu->stat.nested_run;
7441 
7442 vmx->nested.nested_run_pending = 0;
7443 }
7444 
7445 if (unlikely(vmx->fail))
7446 return EXIT_FASTPATH_NONE;
7447 
7448 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7449 kvm_machine_check();
7450 
7451 trace_kvm_exit(vcpu, KVM_ISA_VMX);
7452 
7453 if (unlikely(vmx->exit_reason.failed_vmentry))
7454 return EXIT_FASTPATH_NONE;
7455 
7456 vmx->loaded_vmcs->launched = 1;
7457 
7458 vmx_recover_nmi_blocking(vmx);
7459 vmx_complete_interrupts(vmx);
7460 
7461 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
7462 }
7463 
7464 static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7465 {
7466 struct vcpu_vmx *vmx = to_vmx(vcpu);
7467 
7468 if (enable_pml)
7469 vmx_destroy_pml_buffer(vmx);
7470 free_vpid(vmx->vpid);
7471 nested_vmx_free_vcpu(vcpu);
7472 free_loaded_vmcs(vmx->loaded_vmcs);
7473 }
7474 
7475 static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7476 {
7477 struct vmx_uret_msr *tsx_ctrl;
7478 struct vcpu_vmx *vmx;
7479 int i, err;
7480 
7481 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7482 vmx = to_vmx(vcpu);
7483 
7484 INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7485 
7486 err = -ENOMEM;
7487 
7488 vmx->vpid = allocate_vpid();
7489 
7490 /*
7491 * If PML is turned on, a failure to enable PML simply results in failure
7492 * to create the vcpu, therefore we can simplify the PML logic (by
7493 * avoiding corner cases such as enabling PML on only a subset of the
7494 * guest's vcpus).
7495 */
7496 if (enable_pml) {
7497 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7498 if (!vmx->pml_pg)
7499 goto free_vpid;
7500 }
7501 
7502 for (i = 0; i < kvm_nr_uret_msrs; ++i)
7503 vmx->guest_uret_msrs[i].mask = -1ull;
7504 if (boot_cpu_has(X86_FEATURE_RTM)) {
7505 /*
7506 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7507 * Keep the host value unchanged to avoid changing CPUID bits
7508 * under the host kernel's feet.
7509 */
7510 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7511 if (tsx_ctrl)
7512 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7513 }
7514 
7515 err = alloc_loaded_vmcs(&vmx->vmcs01);
7516 if (err < 0)
7517 goto free_pml;
7518 
7519 /*
7520 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7521 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7522 * feature only for vmcs01, as KVM currently isn't equipped to realize any
7523 * performance benefits from enabling it for vmcs02.
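 * When the enlightenment is used, the corresponding per-VMCS flag
 * (hv_enlightenments_control.msr_bitmap) is set just below.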
7524 */ 7525 if (kvm_is_using_evmcs() && 7526 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7527 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7528 7529 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7530 } 7531 7532 /* The MSR bitmap starts with all ones */ 7533 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7534 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7535 7536 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7537 #ifdef CONFIG_X86_64 7538 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7539 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7540 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7541 #endif 7542 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7543 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7544 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7545 if (kvm_cstate_in_guest(vcpu->kvm)) { 7546 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7547 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7548 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7549 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7550 } 7551 7552 vmx->loaded_vmcs = &vmx->vmcs01; 7553 7554 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7555 err = kvm_alloc_apic_access_page(vcpu->kvm); 7556 if (err) 7557 goto free_vmcs; 7558 } 7559 7560 if (enable_ept && !enable_unrestricted_guest) { 7561 err = init_rmode_identity_map(vcpu->kvm); 7562 if (err) 7563 goto free_vmcs; 7564 } 7565 7566 if (vmx_can_use_ipiv(vcpu)) 7567 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7568 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); 7569 7570 return 0; 7571 7572 free_vmcs: 7573 free_loaded_vmcs(vmx->loaded_vmcs); 7574 free_pml: 7575 vmx_destroy_pml_buffer(vmx); 7576 free_vpid: 7577 free_vpid(vmx->vpid); 7578 return err; 7579 } 7580 7581 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7582 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7583 7584 static int vmx_vm_init(struct kvm *kvm) 7585 { 7586 if (!ple_gap) 7587 kvm->arch.pause_in_guest = true; 7588 7589 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7590 switch (l1tf_mitigation) { 7591 case L1TF_MITIGATION_OFF: 7592 case L1TF_MITIGATION_FLUSH_NOWARN: 7593 /* 'I explicitly don't care' is set */ 7594 break; 7595 case L1TF_MITIGATION_FLUSH: 7596 case L1TF_MITIGATION_FLUSH_NOSMT: 7597 case L1TF_MITIGATION_FULL: 7598 /* 7599 * Warn upon starting the first VM in a potentially 7600 * insecure environment. 7601 */ 7602 if (sched_smt_active()) 7603 pr_warn_once(L1TF_MSG_SMT); 7604 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7605 pr_warn_once(L1TF_MSG_L1D); 7606 break; 7607 case L1TF_MITIGATION_FULL_FORCE: 7608 /* Flush is enforced */ 7609 break; 7610 } 7611 } 7612 return 0; 7613 } 7614 7615 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7616 { 7617 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in 7618 * memory aliases with conflicting memory types and sometimes MCEs. 
7619 * We have to be careful as to what is honored and when.
7620 *
7621 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
7622 * UC. The effective memory type is UC or WC depending on guest PAT.
7623 * This was historically the source of MCEs and we want to be
7624 * conservative.
7625 *
7626 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7627 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
7628 * EPT memory type is set to WB. The effective memory type is forced
7629 * WB.
7630 *
7631 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored. The
7632 * EPT memory type is used to emulate guest CD/MTRR.
7633 */
7634 
7635 if (is_mmio)
7636 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7637 
7638 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7639 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7640 
7641 if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
7642 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7643 return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
7644 else
7645 return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
7646 VMX_EPT_IPAT_BIT;
7647 }
7648 
7649 return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7650 }
7651 
7652 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7653 {
7654 /*
7655 * These bits in the secondary execution controls field
7656 * are dynamic, the others are mostly based on the hypervisor
7657 * architecture and the guest's CPUID. Do not touch the
7658 * dynamic bits.
7659 */
7660 u32 mask =
7661 SECONDARY_EXEC_SHADOW_VMCS |
7662 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7663 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7664 SECONDARY_EXEC_DESC;
7665 
7666 u32 cur_ctl = secondary_exec_controls_get(vmx);
7667 
7668 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7669 }
7670 
7671 /*
7672 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7673 * (indicating "allowed-1") if they are supported in the guest's CPUID.
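 * E.g. CR4.VMXE is reported as allowed-1 below only if the guest's CPUID
 * advertises the VMX feature bit.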
7674 */ 7675 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7676 { 7677 struct vcpu_vmx *vmx = to_vmx(vcpu); 7678 struct kvm_cpuid_entry2 *entry; 7679 7680 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7681 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7682 7683 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7684 if (entry && (entry->_reg & (_cpuid_mask))) \ 7685 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7686 } while (0) 7687 7688 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7689 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7690 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7691 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7692 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7693 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7694 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7695 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7696 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7697 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7698 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7699 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7700 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7701 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7702 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7703 7704 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7705 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7706 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7707 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7708 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7709 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7710 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7711 7712 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7713 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); 7714 7715 #undef cr4_fixed1_update 7716 } 7717 7718 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7719 { 7720 struct vcpu_vmx *vmx = to_vmx(vcpu); 7721 struct kvm_cpuid_entry2 *best = NULL; 7722 int i; 7723 7724 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7725 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7726 if (!best) 7727 return; 7728 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7729 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7730 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7731 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7732 } 7733 7734 /* Get the number of configurable Address Ranges for filtering */ 7735 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7736 PT_CAP_num_address_ranges); 7737 7738 /* Initialize and clear the no dependency bits */ 7739 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7740 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7741 RTIT_CTL_BRANCH_EN); 7742 7743 /* 7744 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7745 * will inject an #GP 7746 */ 7747 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7748 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7749 7750 /* 7751 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7752 * PSBFreq can be set 7753 */ 7754 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7755 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7756 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 7757 7758 /* 7759 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can 
be set 7760 */ 7761 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7762 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 7763 RTIT_CTL_MTC_RANGE); 7764 7765 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7766 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7767 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7768 RTIT_CTL_PTW_EN); 7769 7770 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7771 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7772 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7773 7774 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7775 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7776 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7777 7778 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7779 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7780 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7781 7782 /* unmask address range configure area */ 7783 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7784 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7785 } 7786 7787 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7788 { 7789 struct vcpu_vmx *vmx = to_vmx(vcpu); 7790 7791 /* 7792 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7793 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7794 * set if and only if XSAVE is supported. 7795 */ 7796 if (boot_cpu_has(X86_FEATURE_XSAVE) && 7797 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) 7798 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); 7799 7800 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); 7801 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM); 7802 7803 vmx_setup_uret_msrs(vmx); 7804 7805 if (cpu_has_secondary_exec_ctrls()) 7806 vmcs_set_secondary_exec_control(vmx, 7807 vmx_secondary_exec_control(vmx)); 7808 7809 if (guest_can_use(vcpu, X86_FEATURE_VMX)) 7810 vmx->msr_ia32_feature_control_valid_bits |= 7811 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7812 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7813 else 7814 vmx->msr_ia32_feature_control_valid_bits &= 7815 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7816 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7817 7818 if (guest_can_use(vcpu, X86_FEATURE_VMX)) 7819 nested_vmx_cr_fixed1_bits_update(vcpu); 7820 7821 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7822 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) 7823 update_intel_pt_cfg(vcpu); 7824 7825 if (boot_cpu_has(X86_FEATURE_RTM)) { 7826 struct vmx_uret_msr *msr; 7827 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7828 if (msr) { 7829 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); 7830 vmx_set_guest_uret_msr(vmx, msr, enabled ? 
0 : TSX_CTRL_RTM_DISABLE); 7831 } 7832 } 7833 7834 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7835 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7836 !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); 7837 7838 if (boot_cpu_has(X86_FEATURE_IBPB)) 7839 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 7840 !guest_has_pred_cmd_msr(vcpu)); 7841 7842 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7843 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7844 !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7845 7846 set_cr4_guest_host_mask(vmx); 7847 7848 vmx_write_encls_bitmap(vcpu, NULL); 7849 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) 7850 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7851 else 7852 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7853 7854 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 7855 vmx->msr_ia32_feature_control_valid_bits |= 7856 FEAT_CTL_SGX_LC_ENABLED; 7857 else 7858 vmx->msr_ia32_feature_control_valid_bits &= 7859 ~FEAT_CTL_SGX_LC_ENABLED; 7860 7861 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7862 vmx_update_exception_bitmap(vcpu); 7863 } 7864 7865 static u64 vmx_get_perf_capabilities(void) 7866 { 7867 u64 perf_cap = PMU_CAP_FW_WRITES; 7868 struct x86_pmu_lbr lbr; 7869 u64 host_perf_cap = 0; 7870 7871 if (!enable_pmu) 7872 return 0; 7873 7874 if (boot_cpu_has(X86_FEATURE_PDCM)) 7875 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7876 7877 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7878 x86_perf_get_lbr(&lbr); 7879 if (lbr.nr) 7880 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7881 } 7882 7883 if (vmx_pebs_supported()) { 7884 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7885 if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) 7886 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 7887 } 7888 7889 return perf_cap; 7890 } 7891 7892 static __init void vmx_set_cpu_caps(void) 7893 { 7894 kvm_set_cpu_caps(); 7895 7896 /* CPUID 0x1 */ 7897 if (nested) 7898 kvm_cpu_cap_set(X86_FEATURE_VMX); 7899 7900 /* CPUID 0x7 */ 7901 if (kvm_mpx_supported()) 7902 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7903 if (!cpu_has_vmx_invpcid()) 7904 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7905 if (vmx_pt_mode_is_host_guest()) 7906 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7907 if (vmx_pebs_supported()) { 7908 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7909 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7910 } 7911 7912 if (!enable_pmu) 7913 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7914 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 7915 7916 if (!enable_sgx) { 7917 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7918 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7919 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7920 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7921 } 7922 7923 if (vmx_umip_emulated()) 7924 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7925 7926 /* CPUID 0xD.1 */ 7927 kvm_caps.supported_xss = 0; 7928 if (!cpu_has_vmx_xsaves()) 7929 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7930 7931 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7932 if (!cpu_has_vmx_rdtscp()) { 7933 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7934 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7935 } 7936 7937 if (cpu_has_vmx_waitpkg()) 7938 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7939 } 7940 7941 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, 7942 struct x86_instruction_info *info) 7943 { 7944 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7945 unsigned short port; 7946 bool intercept; 7947 int size; 7948 7949 if (info->intercept == x86_intercept_in || 7950 info->intercept == 
x86_intercept_ins) { 7951 port = info->src_val; 7952 size = info->dst_bytes; 7953 } else { 7954 port = info->dst_val; 7955 size = info->src_bytes; 7956 } 7957 7958 /* 7959 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 7960 * VM-exits depend on the 'unconditional IO exiting' VM-execution 7961 * control. 7962 * 7963 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 7964 */ 7965 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7966 intercept = nested_cpu_has(vmcs12, 7967 CPU_BASED_UNCOND_IO_EXITING); 7968 else 7969 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); 7970 7971 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 7972 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; 7973 } 7974 7975 static int vmx_check_intercept(struct kvm_vcpu *vcpu, 7976 struct x86_instruction_info *info, 7977 enum x86_intercept_stage stage, 7978 struct x86_exception *exception) 7979 { 7980 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7981 7982 switch (info->intercept) { 7983 /* 7984 * RDPID causes #UD if disabled through secondary execution controls. 7985 * Because it is marked as EmulateOnUD, we need to intercept it here. 7986 * Note, RDPID is hidden behind ENABLE_RDTSCP. 7987 */ 7988 case x86_intercept_rdpid: 7989 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 7990 exception->vector = UD_VECTOR; 7991 exception->error_code_valid = false; 7992 return X86EMUL_PROPAGATE_FAULT; 7993 } 7994 break; 7995 7996 case x86_intercept_in: 7997 case x86_intercept_ins: 7998 case x86_intercept_out: 7999 case x86_intercept_outs: 8000 return vmx_check_intercept_io(vcpu, info); 8001 8002 case x86_intercept_lgdt: 8003 case x86_intercept_lidt: 8004 case x86_intercept_lldt: 8005 case x86_intercept_ltr: 8006 case x86_intercept_sgdt: 8007 case x86_intercept_sidt: 8008 case x86_intercept_sldt: 8009 case x86_intercept_str: 8010 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8011 return X86EMUL_CONTINUE; 8012 8013 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 8014 break; 8015 8016 case x86_intercept_pause: 8017 /* 8018 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8019 * with vanilla NOPs in the emulator. Apply the interception 8020 * check only to actual PAUSE instructions. Don't check 8021 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8022 * exit, i.e. KVM is within its rights to allow L2 to execute 8023 * the PAUSE. 8024 */ 8025 if ((info->rep_prefix != REPE_PREFIX) || 8026 !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) 8027 return X86EMUL_CONTINUE; 8028 8029 break; 8030 8031 /* TODO: check more intercepts... 
*/
8032 default:
8033 break;
8034 }
8035 
8036 return X86EMUL_UNHANDLEABLE;
8037 }
8038 
8039 #ifdef CONFIG_X86_64
8040 /* Compute (a << shift) / divisor; returns 1 on overflow, otherwise 0. */
8041 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8042 u64 divisor, u64 *result)
8043 {
8044 u64 low = a << shift, high = a >> (64 - shift);
8045 
8046 /* divq faults if the quotient doesn't fit in 64 bits, i.e. if high >= divisor. */
8047 if (high >= divisor)
8048 return 1;
8049 
8050 /* low holds the quotient, high holds the remainder, which is discarded. */
8051 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8052 "rm" (divisor), "0" (low), "1" (high));
8053 *result = low;
8054 
8055 return 0;
8056 }
8057 
8058 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8059 bool *expired)
8060 {
8061 struct vcpu_vmx *vmx;
8062 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8063 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8064 
8065 vmx = to_vmx(vcpu);
8066 tscl = rdtsc();
8067 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8068 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8069 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8070 ktimer->timer_advance_ns);
8071 
8072 if (delta_tsc > lapic_timer_advance_cycles)
8073 delta_tsc -= lapic_timer_advance_cycles;
8074 else
8075 delta_tsc = 0;
8076 
8077 /* Convert to host delta tsc if tsc scaling is enabled */
8078 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8079 delta_tsc && u64_shl_div_u64(delta_tsc,
8080 kvm_caps.tsc_scaling_ratio_frac_bits,
8081 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8082 return -ERANGE;
8083 
8084 /*
8085 * If the delta tsc can't fit in 32 bits after shifting by the rate
8086 * multiplier, we can't use the preemption timer.
8087 * It's possible that it fits on later vmentries, but checking
8088 * on every vmentry is costly so we just use an hrtimer.
8089 */
8090 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8091 return -ERANGE;
8092 
8093 vmx->hv_deadline_tsc = tscl + delta_tsc;
8094 *expired = !delta_tsc;
8095 return 0;
8096 }
8097 
8098 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8099 {
8100 to_vmx(vcpu)->hv_deadline_tsc = -1;
8101 }
8102 #endif
8103 
8104 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
8105 {
8106 if (!kvm_pause_in_guest(vcpu->kvm))
8107 shrink_ple_window(vcpu);
8108 }
8109 
8110 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8111 {
8112 struct vcpu_vmx *vmx = to_vmx(vcpu);
8113 
8114 if (WARN_ON_ONCE(!enable_pml))
8115 return;
8116 
8117 if (is_guest_mode(vcpu)) {
8118 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8119 return;
8120 }
8121 
8122 /*
8123 * Note, nr_memslots_dirty_logging can be changed concurrently with this
8124 * code, but in that case another update request will be made and so
8125 * the guest will never run with a stale PML value.
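 * (This callback is reached via KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, so a racing
 * memslot update simply queues another request.)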
8126 */ 8127 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8128 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8129 else 8130 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8131 } 8132 8133 static void vmx_setup_mce(struct kvm_vcpu *vcpu) 8134 { 8135 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8136 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8137 FEAT_CTL_LMCE_ENABLED; 8138 else 8139 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8140 ~FEAT_CTL_LMCE_ENABLED; 8141 } 8142 8143 #ifdef CONFIG_KVM_SMM 8144 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8145 { 8146 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8147 if (to_vmx(vcpu)->nested.nested_run_pending) 8148 return -EBUSY; 8149 return !is_smm(vcpu); 8150 } 8151 8152 static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8153 { 8154 struct vcpu_vmx *vmx = to_vmx(vcpu); 8155 8156 /* 8157 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8158 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8159 * SMI and RSM only modify state that is saved and restored via SMRAM. 8160 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8161 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 8162 */ 8163 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8164 if (vmx->nested.smm.guest_mode) 8165 nested_vmx_vmexit(vcpu, -1, 0, 0); 8166 8167 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8168 vmx->nested.vmxon = false; 8169 vmx_clear_hlt(vcpu); 8170 return 0; 8171 } 8172 8173 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8174 { 8175 struct vcpu_vmx *vmx = to_vmx(vcpu); 8176 int ret; 8177 8178 if (vmx->nested.smm.vmxon) { 8179 vmx->nested.vmxon = true; 8180 vmx->nested.smm.vmxon = false; 8181 } 8182 8183 if (vmx->nested.smm.guest_mode) { 8184 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8185 if (ret) 8186 return ret; 8187 8188 vmx->nested.nested_run_pending = 1; 8189 vmx->nested.smm.guest_mode = false; 8190 } 8191 return 0; 8192 } 8193 8194 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8195 { 8196 /* RSM will cause a vmexit anyway. 
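 * No explicit SMI-window exit needs to be requested, hence the empty stub.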
*/ 8197 } 8198 #endif 8199 8200 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8201 { 8202 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8203 } 8204 8205 static void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8206 { 8207 if (is_guest_mode(vcpu)) { 8208 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8209 8210 if (hrtimer_try_to_cancel(timer) == 1) 8211 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8212 } 8213 } 8214 8215 static void vmx_hardware_unsetup(void) 8216 { 8217 kvm_set_posted_intr_wakeup_handler(NULL); 8218 8219 if (nested) 8220 nested_vmx_hardware_unsetup(); 8221 8222 free_kvm_area(); 8223 } 8224 8225 #define VMX_REQUIRED_APICV_INHIBITS \ 8226 ( \ 8227 BIT(APICV_INHIBIT_REASON_DISABLE)| \ 8228 BIT(APICV_INHIBIT_REASON_ABSENT) | \ 8229 BIT(APICV_INHIBIT_REASON_HYPERV) | \ 8230 BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ 8231 BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ 8232 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ 8233 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ 8234 ) 8235 8236 static void vmx_vm_destroy(struct kvm *kvm) 8237 { 8238 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8239 8240 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8241 } 8242 8243 /* 8244 * Note, the SDM states that the linear address is masked *after* the modified 8245 * canonicality check, whereas KVM masks (untags) the address and then performs 8246 * a "normal" canonicality check. Functionally, the two methods are identical, 8247 * and when the masking occurs relative to the canonicality check isn't visible 8248 * to software, i.e. KVM's behavior doesn't violate the SDM. 8249 */ 8250 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8251 { 8252 int lam_bit; 8253 unsigned long cr3_bits; 8254 8255 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8256 return gva; 8257 8258 if (!is_64_bit_mode(vcpu)) 8259 return gva; 8260 8261 /* 8262 * Bit 63 determines if the address should be treated as user address 8263 * or a supervisor address. 8264 */ 8265 if (!(gva & BIT_ULL(63))) { 8266 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8267 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8268 return gva; 8269 8270 /* LAM_U48 is ignored if LAM_U57 is set. */ 8271 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8272 } else { 8273 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8274 return gva; 8275 8276 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8277 } 8278 8279 /* 8280 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8281 * Bit 63 is retained from the raw virtual address so that untagging 8282 * doesn't change a user access to a supervisor access, and vice versa. 
8283 */ 8284 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8285 } 8286 8287 static struct kvm_x86_ops vmx_x86_ops __initdata = { 8288 .name = KBUILD_MODNAME, 8289 8290 .check_processor_compatibility = vmx_check_processor_compat, 8291 8292 .hardware_unsetup = vmx_hardware_unsetup, 8293 8294 .hardware_enable = vmx_hardware_enable, 8295 .hardware_disable = vmx_hardware_disable, 8296 .has_emulated_msr = vmx_has_emulated_msr, 8297 8298 .vm_size = sizeof(struct kvm_vmx), 8299 .vm_init = vmx_vm_init, 8300 .vm_destroy = vmx_vm_destroy, 8301 8302 .vcpu_precreate = vmx_vcpu_precreate, 8303 .vcpu_create = vmx_vcpu_create, 8304 .vcpu_free = vmx_vcpu_free, 8305 .vcpu_reset = vmx_vcpu_reset, 8306 8307 .prepare_switch_to_guest = vmx_prepare_switch_to_guest, 8308 .vcpu_load = vmx_vcpu_load, 8309 .vcpu_put = vmx_vcpu_put, 8310 8311 .update_exception_bitmap = vmx_update_exception_bitmap, 8312 .get_msr_feature = vmx_get_msr_feature, 8313 .get_msr = vmx_get_msr, 8314 .set_msr = vmx_set_msr, 8315 .get_segment_base = vmx_get_segment_base, 8316 .get_segment = vmx_get_segment, 8317 .set_segment = vmx_set_segment, 8318 .get_cpl = vmx_get_cpl, 8319 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 8320 .is_valid_cr0 = vmx_is_valid_cr0, 8321 .set_cr0 = vmx_set_cr0, 8322 .is_valid_cr4 = vmx_is_valid_cr4, 8323 .set_cr4 = vmx_set_cr4, 8324 .set_efer = vmx_set_efer, 8325 .get_idt = vmx_get_idt, 8326 .set_idt = vmx_set_idt, 8327 .get_gdt = vmx_get_gdt, 8328 .set_gdt = vmx_set_gdt, 8329 .set_dr7 = vmx_set_dr7, 8330 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 8331 .cache_reg = vmx_cache_reg, 8332 .get_rflags = vmx_get_rflags, 8333 .set_rflags = vmx_set_rflags, 8334 .get_if_flag = vmx_get_if_flag, 8335 8336 .flush_tlb_all = vmx_flush_tlb_all, 8337 .flush_tlb_current = vmx_flush_tlb_current, 8338 .flush_tlb_gva = vmx_flush_tlb_gva, 8339 .flush_tlb_guest = vmx_flush_tlb_guest, 8340 8341 .vcpu_pre_run = vmx_vcpu_pre_run, 8342 .vcpu_run = vmx_vcpu_run, 8343 .handle_exit = vmx_handle_exit, 8344 .skip_emulated_instruction = vmx_skip_emulated_instruction, 8345 .update_emulated_instruction = vmx_update_emulated_instruction, 8346 .set_interrupt_shadow = vmx_set_interrupt_shadow, 8347 .get_interrupt_shadow = vmx_get_interrupt_shadow, 8348 .patch_hypercall = vmx_patch_hypercall, 8349 .inject_irq = vmx_inject_irq, 8350 .inject_nmi = vmx_inject_nmi, 8351 .inject_exception = vmx_inject_exception, 8352 .cancel_injection = vmx_cancel_injection, 8353 .interrupt_allowed = vmx_interrupt_allowed, 8354 .nmi_allowed = vmx_nmi_allowed, 8355 .get_nmi_mask = vmx_get_nmi_mask, 8356 .set_nmi_mask = vmx_set_nmi_mask, 8357 .enable_nmi_window = vmx_enable_nmi_window, 8358 .enable_irq_window = vmx_enable_irq_window, 8359 .update_cr8_intercept = vmx_update_cr8_intercept, 8360 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 8361 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 8362 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 8363 .load_eoi_exitmap = vmx_load_eoi_exitmap, 8364 .apicv_pre_state_restore = vmx_apicv_pre_state_restore, 8365 .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, 8366 .hwapic_irr_update = vmx_hwapic_irr_update, 8367 .hwapic_isr_update = vmx_hwapic_isr_update, 8368 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, 8369 .sync_pir_to_irr = vmx_sync_pir_to_irr, 8370 .deliver_interrupt = vmx_deliver_interrupt, 8371 .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, 8372 8373 .set_tss_addr = vmx_set_tss_addr, 8374 .set_identity_map_addr = 
vmx_set_identity_map_addr, 8375 .get_mt_mask = vmx_get_mt_mask, 8376 8377 .get_exit_info = vmx_get_exit_info, 8378 8379 .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, 8380 8381 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 8382 8383 .get_l2_tsc_offset = vmx_get_l2_tsc_offset, 8384 .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, 8385 .write_tsc_offset = vmx_write_tsc_offset, 8386 .write_tsc_multiplier = vmx_write_tsc_multiplier, 8387 8388 .load_mmu_pgd = vmx_load_mmu_pgd, 8389 8390 .check_intercept = vmx_check_intercept, 8391 .handle_exit_irqoff = vmx_handle_exit_irqoff, 8392 8393 .sched_in = vmx_sched_in, 8394 8395 .cpu_dirty_log_size = PML_ENTITY_NUM, 8396 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, 8397 8398 .nested_ops = &vmx_nested_ops, 8399 8400 .pi_update_irte = vmx_pi_update_irte, 8401 .pi_start_assignment = vmx_pi_start_assignment, 8402 8403 #ifdef CONFIG_X86_64 8404 .set_hv_timer = vmx_set_hv_timer, 8405 .cancel_hv_timer = vmx_cancel_hv_timer, 8406 #endif 8407 8408 .setup_mce = vmx_setup_mce, 8409 8410 #ifdef CONFIG_KVM_SMM 8411 .smi_allowed = vmx_smi_allowed, 8412 .enter_smm = vmx_enter_smm, 8413 .leave_smm = vmx_leave_smm, 8414 .enable_smi_window = vmx_enable_smi_window, 8415 #endif 8416 8417 .check_emulate_instruction = vmx_check_emulate_instruction, 8418 .apic_init_signal_blocked = vmx_apic_init_signal_blocked, 8419 .migrate_timers = vmx_migrate_timers, 8420 8421 .msr_filter_changed = vmx_msr_filter_changed, 8422 .complete_emulated_msr = kvm_complete_insn_gp, 8423 8424 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, 8425 8426 .get_untagged_addr = vmx_get_untagged_addr, 8427 }; 8428 8429 static unsigned int vmx_handle_intel_pt_intr(void) 8430 { 8431 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8432 8433 /* '0' on failure so that the !PT case can use a RET0 static call. */ 8434 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8435 return 0; 8436 8437 kvm_make_request(KVM_REQ_PMI, vcpu); 8438 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8439 (unsigned long *)&vcpu->arch.pmu.global_status); 8440 return 1; 8441 } 8442 8443 static __init void vmx_setup_user_return_msrs(void) 8444 { 8445 8446 /* 8447 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8448 * will emulate SYSCALL in legacy mode if the vendor string in guest 8449 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8450 * support this emulation, MSR_STAR is included in the list for i386, 8451 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8452 * into hardware and is here purely for emulation purposes. 8453 */ 8454 const u32 vmx_uret_msrs_list[] = { 8455 #ifdef CONFIG_X86_64 8456 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8457 #endif 8458 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8459 MSR_IA32_TSX_CTRL, 8460 }; 8461 int i; 8462 8463 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8464 8465 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8466 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8467 } 8468 8469 static void __init vmx_setup_me_spte_mask(void) 8470 { 8471 u64 me_mask = 0; 8472 8473 /* 8474 * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use 8475 * the former to avoid exposing shadow_phys_bits. 8476 * 8477 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8478 * shadow_phys_bits. On MKTME and/or TDX capable systems, 8479 * boot_cpu_data.x86_phys_bits holds the actual physical address 8480 * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR 8481 * reported by CPUID. 
Those bits between are KeyID bits. 8482 */ 8483 if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) 8484 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8485 kvm_get_shadow_phys_bits() - 1); 8486 /* 8487 * Unlike SME, host kernel doesn't support setting up any 8488 * MKTME KeyID on Intel platforms. No memory encryption 8489 * bits should be included into the SPTE. 8490 */ 8491 kvm_mmu_set_me_spte_mask(0, me_mask); 8492 } 8493 8494 static struct kvm_x86_init_ops vmx_init_ops __initdata; 8495 8496 static __init int hardware_setup(void) 8497 { 8498 unsigned long host_bndcfgs; 8499 struct desc_ptr dt; 8500 int r; 8501 8502 store_idt(&dt); 8503 host_idt_base = dt.address; 8504 8505 vmx_setup_user_return_msrs(); 8506 8507 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8508 return -EIO; 8509 8510 if (cpu_has_perf_global_ctrl_bug()) 8511 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 8512 "does not work properly. Using workaround\n"); 8513 8514 if (boot_cpu_has(X86_FEATURE_NX)) 8515 kvm_enable_efer_bits(EFER_NX); 8516 8517 if (boot_cpu_has(X86_FEATURE_MPX)) { 8518 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 8519 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8520 } 8521 8522 if (!cpu_has_vmx_mpx()) 8523 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8524 XFEATURE_MASK_BNDCSR); 8525 8526 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8527 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8528 enable_vpid = 0; 8529 8530 if (!cpu_has_vmx_ept() || 8531 !cpu_has_vmx_ept_4levels() || 8532 !cpu_has_vmx_ept_mt_wb() || 8533 !cpu_has_vmx_invept_global()) 8534 enable_ept = 0; 8535 8536 /* NX support is required for shadow paging. */ 8537 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8538 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8539 return -EOPNOTSUPP; 8540 } 8541 8542 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8543 enable_ept_ad_bits = 0; 8544 8545 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8546 enable_unrestricted_guest = 0; 8547 8548 if (!cpu_has_vmx_flexpriority()) 8549 flexpriority_enabled = 0; 8550 8551 if (!cpu_has_virtual_nmis()) 8552 enable_vnmi = 0; 8553 8554 #ifdef CONFIG_X86_SGX_KVM 8555 if (!cpu_has_vmx_encls_vmexit()) 8556 enable_sgx = false; 8557 #endif 8558 8559 /* 8560 * set_apic_access_page_addr() is used to reload apic access 8561 * page upon invalidation. No need to do anything if not 8562 * using the APIC_ACCESS_ADDR VMCS field. 
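 * (When flexpriority is disabled, the callback is simply cleared below.)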
8563 */ 8564 if (!flexpriority_enabled) 8565 vmx_x86_ops.set_apic_access_page_addr = NULL; 8566 8567 if (!cpu_has_vmx_tpr_shadow()) 8568 vmx_x86_ops.update_cr8_intercept = NULL; 8569 8570 #if IS_ENABLED(CONFIG_HYPERV) 8571 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8572 && enable_ept) { 8573 vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8574 vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8575 } 8576 #endif 8577 8578 if (!cpu_has_vmx_ple()) { 8579 ple_gap = 0; 8580 ple_window = 0; 8581 ple_window_grow = 0; 8582 ple_window_max = 0; 8583 ple_window_shrink = 0; 8584 } 8585 8586 if (!cpu_has_vmx_apicv()) 8587 enable_apicv = 0; 8588 if (!enable_apicv) 8589 vmx_x86_ops.sync_pir_to_irr = NULL; 8590 8591 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8592 enable_ipiv = false; 8593 8594 if (cpu_has_vmx_tsc_scaling()) 8595 kvm_caps.has_tsc_control = true; 8596 8597 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8598 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8599 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8600 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8601 8602 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8603 8604 if (enable_ept) 8605 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8606 cpu_has_vmx_ept_execute_only()); 8607 8608 /* 8609 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8610 * bits to shadow_zero_check. 8611 */ 8612 vmx_setup_me_spte_mask(); 8613 8614 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8615 ept_caps_to_lpage_level(vmx_capability.ept)); 8616 8617 /* 8618 * Only enable PML when hardware supports PML feature, and both EPT 8619 * and EPT A/D bit features are enabled -- PML depends on them to work. 8620 */ 8621 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8622 enable_pml = 0; 8623 8624 if (!enable_pml) 8625 vmx_x86_ops.cpu_dirty_log_size = 0; 8626 8627 if (!cpu_has_vmx_preemption_timer()) 8628 enable_preemption_timer = false; 8629 8630 if (enable_preemption_timer) { 8631 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8632 8633 cpu_preemption_timer_multi = 8634 vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 8635 8636 if (tsc_khz) 8637 use_timer_freq = (u64)tsc_khz * 1000; 8638 use_timer_freq >>= cpu_preemption_timer_multi; 8639 8640 /* 8641 * KVM "disables" the preemption timer by setting it to its max 8642 * value. Don't use the timer if it might cause spurious exits 8643 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 
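 * I.e. with a 32-bit counter ticking at use_timer_freq, the longest
 * programmable period is 0xffffffff / use_timer_freq seconds; the check
 * below disables the timer when that period would be under ~10 seconds.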
8644 */ 8645 if (use_timer_freq > 0xffffffffu / 10) 8646 enable_preemption_timer = false; 8647 } 8648 8649 if (!enable_preemption_timer) { 8650 vmx_x86_ops.set_hv_timer = NULL; 8651 vmx_x86_ops.cancel_hv_timer = NULL; 8652 } 8653 8654 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8655 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8656 8657 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8658 return -EINVAL; 8659 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8660 pt_mode = PT_MODE_SYSTEM; 8661 if (pt_mode == PT_MODE_HOST_GUEST) 8662 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8663 else 8664 vmx_init_ops.handle_intel_pt_intr = NULL; 8665 8666 setup_default_sgx_lepubkeyhash(); 8667 8668 if (nested) { 8669 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8670 8671 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8672 if (r) 8673 return r; 8674 } 8675 8676 vmx_set_cpu_caps(); 8677 8678 r = alloc_kvm_area(); 8679 if (r && nested) 8680 nested_vmx_hardware_unsetup(); 8681 8682 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8683 8684 return r; 8685 } 8686 8687 static struct kvm_x86_init_ops vmx_init_ops __initdata = { 8688 .hardware_setup = hardware_setup, 8689 .handle_intel_pt_intr = NULL, 8690 8691 .runtime_ops = &vmx_x86_ops, 8692 .pmu_ops = &intel_pmu_ops, 8693 }; 8694 8695 static void vmx_cleanup_l1d_flush(void) 8696 { 8697 if (vmx_l1d_flush_pages) { 8698 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8699 vmx_l1d_flush_pages = NULL; 8700 } 8701 /* Restore state so sysfs ignores VMX */ 8702 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8703 } 8704 8705 static void __vmx_exit(void) 8706 { 8707 allow_smaller_maxphyaddr = false; 8708 8709 cpu_emergency_unregister_virt_callback(vmx_emergency_disable); 8710 8711 vmx_cleanup_l1d_flush(); 8712 } 8713 8714 static void vmx_exit(void) 8715 { 8716 kvm_exit(); 8717 kvm_x86_vendor_exit(); 8718 8719 __vmx_exit(); 8720 } 8721 module_exit(vmx_exit); 8722 8723 static int __init vmx_init(void) 8724 { 8725 int r, cpu; 8726 8727 if (!kvm_is_vmx_supported()) 8728 return -EOPNOTSUPP; 8729 8730 /* 8731 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8732 * to unwind if a later step fails. 8733 */ 8734 hv_init_evmcs(); 8735 8736 r = kvm_x86_vendor_init(&vmx_init_ops); 8737 if (r) 8738 return r; 8739 8740 /* 8741 * Must be called after common x86 init so enable_ept is properly set 8742 * up. Hand the parameter mitigation value in which was stored in 8743 * the pre module init parser. If no parameter was given, it will 8744 * contain 'auto' which will be turned into the default 'cond' 8745 * mitigation mode. 8746 */ 8747 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8748 if (r) 8749 goto err_l1d_flush; 8750 8751 for_each_possible_cpu(cpu) { 8752 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8753 8754 pi_init_cpu(cpu); 8755 } 8756 8757 cpu_emergency_register_virt_callback(vmx_emergency_disable); 8758 8759 vmx_check_vmcs12_offsets(); 8760 8761 /* 8762 * Shadow paging doesn't have a (further) performance penalty 8763 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8764 * by default 8765 */ 8766 if (!enable_ept) 8767 allow_smaller_maxphyaddr = true; 8768 8769 /* 8770 * Common KVM initialization _must_ come last, after this, /dev/kvm is 8771 * exposed to userspace! 
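 * Userspace can start creating VMs as soon as kvm_init() succeeds, so all
 * of the setup above must already be complete.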
8772 */ 8773 r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), 8774 THIS_MODULE); 8775 if (r) 8776 goto err_kvm_init; 8777 8778 return 0; 8779 8780 err_kvm_init: 8781 __vmx_exit(); 8782 err_l1d_flush: 8783 kvm_x86_vendor_exit(); 8784 return r; 8785 } 8786 module_init(vmx_init); 8787