1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 32 #include <asm/apic.h> 33 #include <asm/asm.h> 34 #include <asm/cpu.h> 35 #include <asm/cpu_device_id.h> 36 #include <asm/cpuid/api.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/fred.h> 42 #include <asm/idtentry.h> 43 #include <asm/io.h> 44 #include <asm/irq_remapping.h> 45 #include <asm/reboot.h> 46 #include <asm/perf_event.h> 47 #include <asm/mmu_context.h> 48 #include <asm/mshyperv.h> 49 #include <asm/msr.h> 50 #include <asm/mwait.h> 51 #include <asm/spec-ctrl.h> 52 #include <asm/virt.h> 53 #include <asm/vmx.h> 54 55 #include <trace/events/ipi.h> 56 57 #include "capabilities.h" 58 #include "common.h" 59 #include "cpuid.h" 60 #include "hyperv.h" 61 #include "kvm_onhyperv.h" 62 #include "irq.h" 63 #include "regs.h" 64 #include "lapic.h" 65 #include "mmu.h" 66 #include "nested.h" 67 #include "pmu.h" 68 #include "sgx.h" 69 #include "trace.h" 70 #include "vmcs.h" 71 #include "vmcs12.h" 72 #include "vmx.h" 73 #include "x86.h" 74 #include "x86_ops.h" 75 #include "smm.h" 76 #include 
"vmx_onhyperv.h" 77 #include "vmenter.h" 78 #include "posted_intr.h" 79 80 #include "mmu/spte.h" 81 82 MODULE_AUTHOR("Qumranet"); 83 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); 84 MODULE_LICENSE("GPL"); 85 86 #ifdef MODULE 87 static const struct x86_cpu_id vmx_cpu_id[] = { 88 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 89 {} 90 }; 91 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 92 #endif 93 94 bool __read_mostly enable_vpid = 1; 95 module_param_named(vpid, enable_vpid, bool, 0444); 96 97 static bool __read_mostly enable_vnmi = 1; 98 module_param_named(vnmi, enable_vnmi, bool, 0444); 99 100 bool __read_mostly flexpriority_enabled = 1; 101 module_param_named(flexpriority, flexpriority_enabled, bool, 0444); 102 103 bool __read_mostly enable_ept = 1; 104 module_param_named(ept, enable_ept, bool, 0444); 105 106 bool __read_mostly enable_unrestricted_guest = 1; 107 module_param_named(unrestricted_guest, 108 enable_unrestricted_guest, bool, 0444); 109 110 bool __read_mostly enable_ept_ad_bits = 1; 111 module_param_named(eptad, enable_ept_ad_bits, bool, 0444); 112 113 bool __read_mostly enable_cet = 1; 114 module_param_named(cet, enable_cet, bool, 0444); 115 116 static bool __read_mostly emulate_invalid_guest_state = true; 117 module_param(emulate_invalid_guest_state, bool, 0444); 118 119 static bool __read_mostly fasteoi = 1; 120 module_param(fasteoi, bool, 0444); 121 122 bool __read_mostly enable_mbec = 1; 123 module_param_named(mbec, enable_mbec, bool, 0444); 124 125 module_param(enable_apicv, bool, 0444); 126 module_param(enable_ipiv, bool, 0444); 127 128 module_param(enable_device_posted_irqs, bool, 0444); 129 130 /* 131 * If nested=1, nested virtualization is supported, i.e., guests may use 132 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 133 * use VMX instructions. 
134 */ 135 static bool __read_mostly nested = 1; 136 module_param(nested, bool, 0444); 137 138 bool __read_mostly enable_pml = 1; 139 module_param_named(pml, enable_pml, bool, 0444); 140 141 static bool __read_mostly error_on_inconsistent_vmcs_config = true; 142 module_param(error_on_inconsistent_vmcs_config, bool, 0444); 143 144 static bool __read_mostly dump_invalid_vmcs = 0; 145 module_param(dump_invalid_vmcs, bool, 0644); 146 147 #define MSR_BITMAP_MODE_X2APIC 1 148 #define MSR_BITMAP_MODE_X2APIC_APICV 2 149 150 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 151 152 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ 153 static int __read_mostly cpu_preemption_timer_multi; 154 static bool __read_mostly enable_preemption_timer = 1; 155 #ifdef CONFIG_X86_64 156 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 157 #endif 158 159 extern bool __read_mostly allow_smaller_maxphyaddr; 160 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 161 162 module_param(enable_mediated_pmu, bool, 0444); 163 164 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 165 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 166 #define KVM_VM_CR0_ALWAYS_ON \ 167 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 168 169 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 170 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 171 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 172 173 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 174 175 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 176 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 177 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 178 RTIT_STATUS_BYTECNT)) 179 180 /* 181 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 182 * ple_gap: upper bound on the amount of time between two successive 183 * executions of PAUSE in a loop. 
Also indicates whether PLE is enabled. 184 * According to tests, this time is usually smaller than 128 cycles. 185 * ple_window: upper bound on the amount of time a guest is allowed to execute 186 * in a PAUSE loop. Tests indicate that most spinlocks are held for 187 * less than 2^12 cycles. 188 * Time is measured based on a counter that runs at the same rate as the TSC, 189 * refer to SDM volume 3b section 21.6.13 & 22.1.3. 190 */ 191 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 192 module_param(ple_gap, uint, 0444); 193 194 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 195 module_param(ple_window, uint, 0444); 196 197 /* Default doubles per-vcpu window every exit. */ 198 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 199 module_param(ple_window_grow, uint, 0444); 200 201 /* Default resets per-vcpu window every exit to ple_window. */ 202 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 203 module_param(ple_window_shrink, uint, 0444); 204 205 /* Default is to compute the maximum so we can never overflow.
*/ 206 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 207 module_param(ple_window_max, uint, 0444); 208 209 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 210 int __read_mostly pt_mode = PT_MODE_SYSTEM; 211 #ifdef CONFIG_BROKEN 212 module_param(pt_mode, int, S_IRUGO); 213 #endif 214 215 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 216 217 #ifdef CONFIG_CPU_MITIGATIONS 218 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 219 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 220 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 221 222 /* Storage for pre module init parameter parsing */ 223 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 224 225 static const struct { 226 const char *option; 227 bool for_parse; 228 } vmentry_l1d_param[] = { 229 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 230 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 231 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 232 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 233 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 234 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 235 }; 236 237 #define L1D_CACHE_ORDER 4 238 static void *vmx_l1d_flush_pages; 239 240 static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 241 { 242 struct page *page; 243 unsigned int i; 244 245 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 246 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 247 return 0; 248 } 249 250 if (!enable_ept) { 251 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 252 return 0; 253 } 254 255 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 256 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 257 return 0; 258 } 259 260 /* If set to auto use the default l1tf mitigation method */ 261 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 262 switch (l1tf_mitigation) { 263 case L1TF_MITIGATION_OFF: 264 l1tf = VMENTER_L1D_FLUSH_NEVER; 265 break; 266 case L1TF_MITIGATION_AUTO: 
267 case L1TF_MITIGATION_FLUSH_NOWARN: 268 case L1TF_MITIGATION_FLUSH: 269 case L1TF_MITIGATION_FLUSH_NOSMT: 270 l1tf = VMENTER_L1D_FLUSH_COND; 271 break; 272 case L1TF_MITIGATION_FULL: 273 case L1TF_MITIGATION_FULL_FORCE: 274 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 275 break; 276 } 277 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 278 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 279 } 280 281 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 282 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 283 /* 284 * This allocation for vmx_l1d_flush_pages is not tied to a VM 285 * lifetime and so should not be charged to a memcg. 286 */ 287 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 288 if (!page) 289 return -ENOMEM; 290 vmx_l1d_flush_pages = page_address(page); 291 292 /* 293 * Initialize each page with a different pattern in 294 * order to protect against KSM in the nested 295 * virtualization case. 296 */ 297 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 298 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 299 PAGE_SIZE); 300 } 301 } 302 303 l1tf_vmx_mitigation = l1tf; 304 305 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 306 static_branch_enable(&vmx_l1d_should_flush); 307 else 308 static_branch_disable(&vmx_l1d_should_flush); 309 310 if (l1tf == VMENTER_L1D_FLUSH_COND) 311 static_branch_enable(&vmx_l1d_flush_cond); 312 else 313 static_branch_disable(&vmx_l1d_flush_cond); 314 return 0; 315 } 316 317 static int vmx_setup_l1d_flush(void) 318 { 319 /* 320 * Hand the parameter mitigation value in which was stored in the pre 321 * module init parser. If no parameter was given, it will contain 322 * 'auto' which will be turned into the default 'cond' mitigation mode. 
323 */ 324 return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); 325 } 326 327 static void vmx_cleanup_l1d_flush(void) 328 { 329 if (vmx_l1d_flush_pages) { 330 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 331 vmx_l1d_flush_pages = NULL; 332 } 333 /* Restore state so sysfs ignores VMX */ 334 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 335 } 336 337 static int vmentry_l1d_flush_parse(const char *s) 338 { 339 unsigned int i; 340 341 if (s) { 342 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 343 if (vmentry_l1d_param[i].for_parse && 344 sysfs_streq(s, vmentry_l1d_param[i].option)) 345 return i; 346 } 347 } 348 return -EINVAL; 349 } 350 351 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 352 { 353 int l1tf, ret; 354 355 l1tf = vmentry_l1d_flush_parse(s); 356 if (l1tf < 0) 357 return l1tf; 358 359 if (!boot_cpu_has(X86_BUG_L1TF)) 360 return 0; 361 362 /* 363 * Has vmx_init() run already? If not then this is the pre init 364 * parameter parsing. In that case just store the value and let 365 * vmx_init() do the proper setup after enable_ept has been 366 * established. 367 */ 368 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 369 vmentry_l1d_flush_param = l1tf; 370 return 0; 371 } 372 373 mutex_lock(&vmx_l1d_flush_mutex); 374 ret = __vmx_setup_l1d_flush(l1tf); 375 mutex_unlock(&vmx_l1d_flush_mutex); 376 return ret; 377 } 378 379 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 380 { 381 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 382 return sysfs_emit(s, "???\n"); 383 384 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 385 } 386 387 /* 388 * Software based L1D cache flush which is used when microcode providing 389 * the cache control MSR is not loaded. 
/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 *
 * The function returns without flushing when the vmx_l1d_should_flush static
 * key is off, or when the vmx_l1d_flush_cond key is on and the per-CPU flush
 * bit is not set.  Every flush that is performed is counted in
 * vcpu->stat.l1d_flush.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	/* Number of bytes read by the software flush loops below. */
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	if (!static_branch_unlikely(&vmx_l1d_should_flush))
		return;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		/*
		 * Clear the per-cpu flush bit, it gets set again if the vCPU
		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
		 * exits to userspace, or if KVM reaches one of the unsafe
		 * VMEXIT handlers, e.g. if KVM calls into the emulator,
		 * or from the interrupt handlers.
		 */
		if (!kvm_get_cpu_l1tf_flush_l1d())
			return;
		kvm_clear_cpu_l1tf_flush_l1d();
	}

	vcpu->stat.l1d_flush++;

	/* Use the flush command MSR when the CPU provides it. */
	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	/*
	 * Software fallback over vmx_l1d_flush_pages: the first loop reads one
	 * byte every 4096 bytes, the second one byte every 64 bytes, each
	 * until the offset in EAX equals 'size'.  CPUID with EAX = 0 runs
	 * between the two loops, which is why EAX, EBX, ECX and EDX are all
	 * listed as clobbered.
	 */
	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lpopulate_tlb\n\t"
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; 455 return 0; 456 } 457 static void vmx_cleanup_l1d_flush(void) 458 { 459 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 460 } 461 static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) 462 { 463 464 } 465 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 466 { 467 pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); 468 return 0; 469 } 470 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 471 { 472 return sysfs_emit(s, "never\n"); 473 } 474 #endif 475 476 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 477 .set = vmentry_l1d_flush_set, 478 .get = vmentry_l1d_flush_get, 479 }; 480 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 481 482 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 483 { 484 u64 msr; 485 486 if (!vmx->disable_fb_clear) 487 return; 488 489 msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); 490 msr |= FB_CLEAR_DIS; 491 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); 492 /* Cache the MSR value to avoid reading it later */ 493 vmx->msr_ia32_mcu_opt_ctrl = msr; 494 } 495 496 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 497 { 498 if (!vmx->disable_fb_clear) 499 return; 500 501 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 502 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 503 } 504 505 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 506 { 507 /* 508 * Disable VERW's behavior of clearing CPU buffers for the guest if the 509 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 510 * the mitigation. Disabling the clearing behavior provides a 511 * performance boost for guests that aren't aware that manually clearing 512 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 513 * and VM-Exit. 
514 */ 515 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 516 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 517 !boot_cpu_has_bug(X86_BUG_MDS) && 518 !boot_cpu_has_bug(X86_BUG_TAA); 519 520 /* 521 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 522 * at VMEntry. Skip the MSR read/write when a guest has no use case to 523 * execute VERW. 524 */ 525 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 526 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 527 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 528 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 529 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 530 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 531 vmx->disable_fb_clear = false; 532 } 533 534 static u32 vmx_segment_access_rights(struct kvm_segment *var); 535 536 void vmx_vmexit(void); 537 538 #define vmx_insn_failed(fmt...) \ 539 do { \ 540 WARN_ONCE(1, fmt); \ 541 pr_warn_ratelimited(fmt); \ 542 } while (0) 543 544 noinline void vmread_error(unsigned long field) 545 { 546 vmx_insn_failed("vmread failed: field=%lx\n", field); 547 } 548 549 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 550 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 551 { 552 if (fault) { 553 kvm_spurious_fault(); 554 } else { 555 instrumentation_begin(); 556 vmread_error(field); 557 instrumentation_end(); 558 } 559 } 560 #endif 561 562 noinline void vmwrite_error(unsigned long field, unsigned long value) 563 { 564 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 565 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 566 } 567 568 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 569 { 570 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 571 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 572 } 573 574 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 575 { 576 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 577 vmcs, phys_addr, 
vmcs_read32(VM_INSTRUCTION_ERROR)); 578 } 579 580 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 581 { 582 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 583 ext, vpid, gva); 584 } 585 586 noinline void invept_error(unsigned long ext, u64 eptp) 587 { 588 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 589 } 590 591 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 592 /* 593 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 594 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 595 */ 596 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 597 598 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 599 static DEFINE_SPINLOCK(vmx_vpid_lock); 600 601 struct vmcs_config vmcs_config __ro_after_init; 602 struct vmx_capability vmx_capability __ro_after_init; 603 604 #define VMX_SEGMENT_FIELD(seg) \ 605 [VCPU_SREG_##seg] = { \ 606 .selector = GUEST_##seg##_SELECTOR, \ 607 .base = GUEST_##seg##_BASE, \ 608 .limit = GUEST_##seg##_LIMIT, \ 609 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 610 } 611 612 static const struct kvm_vmx_segment_field { 613 unsigned selector; 614 unsigned base; 615 unsigned limit; 616 unsigned ar_bytes; 617 } kvm_vmx_segment_fields[] = { 618 VMX_SEGMENT_FIELD(CS), 619 VMX_SEGMENT_FIELD(DS), 620 VMX_SEGMENT_FIELD(ES), 621 VMX_SEGMENT_FIELD(FS), 622 VMX_SEGMENT_FIELD(GS), 623 VMX_SEGMENT_FIELD(SS), 624 VMX_SEGMENT_FIELD(TR), 625 VMX_SEGMENT_FIELD(LDTR), 626 }; 627 628 629 static unsigned long host_idt_base; 630 631 #if IS_ENABLED(CONFIG_HYPERV) 632 static bool __read_mostly enlightened_vmcs = true; 633 module_param(enlightened_vmcs, bool, 0444); 634 635 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 636 { 637 struct hv_enlightened_vmcs *evmcs; 638 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 639 640 if (partition_assist_page == INVALID_PAGE) 641 return -ENOMEM; 642 643 evmcs = (struct hv_enlightened_vmcs 
*)to_vmx(vcpu)->loaded_vmcs->vmcs; 644 645 evmcs->partition_assist_page = partition_assist_page; 646 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 647 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 648 649 return 0; 650 } 651 652 static __init void hv_init_evmcs(void) 653 { 654 int cpu; 655 656 if (!enlightened_vmcs) 657 return; 658 659 /* 660 * Enlightened VMCS usage should be recommended and the host needs 661 * to support eVMCS v1 or above. 662 */ 663 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 664 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 665 KVM_EVMCS_VERSION) { 666 667 /* Check that we have assist pages on all online CPUs */ 668 for_each_online_cpu(cpu) { 669 if (!hv_get_vp_assist_page(cpu)) { 670 enlightened_vmcs = false; 671 break; 672 } 673 } 674 675 if (enlightened_vmcs) { 676 pr_info("Using Hyper-V Enlightened VMCS\n"); 677 static_branch_enable(&__kvm_is_using_evmcs); 678 } 679 680 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 681 vt_x86_ops.enable_l2_tlb_flush 682 = hv_enable_l2_tlb_flush; 683 } else { 684 enlightened_vmcs = false; 685 } 686 } 687 688 static void hv_reset_evmcs(void) 689 { 690 struct hv_vp_assist_page *vp_ap; 691 692 if (!kvm_is_using_evmcs()) 693 return; 694 695 /* 696 * KVM should enable eVMCS if and only if all CPUs have a VP assist 697 * page, and should reject CPU onlining if eVMCS is enabled the CPU 698 * doesn't have a VP assist page allocated. 699 */ 700 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 701 if (WARN_ON_ONCE(!vp_ap)) 702 return; 703 704 /* 705 * Reset everything to support using non-enlightened VMCS access later 706 * (e.g. 
when we reload the module with enlightened_vmcs=0) 707 */ 708 vp_ap->nested_control.features.directhypercall = 0; 709 vp_ap->current_nested_vmcs = 0; 710 vp_ap->enlighten_vmentry = 0; 711 } 712 713 #else /* IS_ENABLED(CONFIG_HYPERV) */ 714 static void hv_init_evmcs(void) {} 715 static void hv_reset_evmcs(void) {} 716 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 717 718 /* 719 * Comment's format: document - errata name - stepping - processor name. 720 * Refer from 721 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 722 */ 723 static u32 vmx_preemption_cpu_tfms[] = { 724 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 725 0x000206E6, 726 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 727 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 728 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 729 0x00020652, 730 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 731 0x00020655, 732 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 733 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 734 /* 735 * 320767.pdf - AAP86 - B1 - 736 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 737 */ 738 0x000106E5, 739 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 740 0x000106A0, 741 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 742 0x000106A1, 743 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 744 0x000106A4, 745 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 746 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 747 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 748 0x000106A5, 749 /* Xeon E3-1220 V2 */ 750 0x000306A8, 751 }; 752 753 static inline bool cpu_has_broken_vmx_preemption_timer(void) 754 { 755 u32 eax = cpuid_eax(0x00000001), i; 756 757 /* Clear the reserved bits */ 758 eax &= ~(0x3U << 14 | 0xfU << 28); 759 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 760 if (eax == vmx_preemption_cpu_tfms[i]) 761 return true; 762 763 return false; 764 } 765 766 
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 767 { 768 return flexpriority_enabled && lapic_in_kernel(vcpu); 769 } 770 771 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 772 { 773 int i; 774 775 i = kvm_find_user_return_msr(msr); 776 if (i >= 0) 777 return &vmx->guest_uret_msrs[i]; 778 return NULL; 779 } 780 781 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 782 struct vmx_uret_msr *msr, u64 data) 783 { 784 unsigned int slot = msr - vmx->guest_uret_msrs; 785 int ret = 0; 786 787 if (msr->load_into_hardware) { 788 preempt_disable(); 789 ret = kvm_set_user_return_msr(slot, data, msr->mask); 790 preempt_enable(); 791 } 792 if (!ret) 793 msr->data = data; 794 return ret; 795 } 796 797 void vmx_emergency_disable_virtualization_cpu(void) 798 { 799 int cpu = raw_smp_processor_id(); 800 struct loaded_vmcs *v; 801 802 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 803 loaded_vmcss_on_cpu_link) { 804 vmcs_clear(v->vmcs); 805 if (v->shadow_vmcs) 806 vmcs_clear(v->shadow_vmcs); 807 } 808 } 809 810 static void __loaded_vmcs_clear(void *arg) 811 { 812 struct loaded_vmcs *loaded_vmcs = arg; 813 int cpu = raw_smp_processor_id(); 814 815 if (loaded_vmcs->cpu != cpu) 816 return; /* vcpu migration can race with cpu offline */ 817 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 818 per_cpu(current_vmcs, cpu) = NULL; 819 820 vmcs_clear(loaded_vmcs->vmcs); 821 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 822 vmcs_clear(loaded_vmcs->shadow_vmcs); 823 824 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 825 826 /* 827 * Ensure all writes to loaded_vmcs, including deleting it from its 828 * current percpu list, complete before setting loaded_vmcs->cpu to 829 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 830 * and add loaded_vmcs to its percpu list before it's deleted from this 831 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
832 */ 833 smp_wmb(); 834 835 loaded_vmcs->cpu = -1; 836 loaded_vmcs->launched = 0; 837 } 838 839 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 840 { 841 int cpu = loaded_vmcs->cpu; 842 843 if (cpu != -1) 844 smp_call_function_single(cpu, 845 __loaded_vmcs_clear, loaded_vmcs, 1); 846 } 847 848 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 849 unsigned field) 850 { 851 bool ret; 852 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 853 854 if (!kvm_register_is_available(&vmx->vcpu, VCPU_REG_SEGMENTS)) { 855 kvm_register_mark_available(&vmx->vcpu, VCPU_REG_SEGMENTS); 856 vmx->segment_cache.bitmask = 0; 857 } 858 ret = vmx->segment_cache.bitmask & mask; 859 vmx->segment_cache.bitmask |= mask; 860 return ret; 861 } 862 863 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 864 { 865 u16 *p = &vmx->segment_cache.seg[seg].selector; 866 867 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 868 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 869 return *p; 870 } 871 872 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 873 { 874 ulong *p = &vmx->segment_cache.seg[seg].base; 875 876 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 877 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 878 return *p; 879 } 880 881 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 882 { 883 u32 *p = &vmx->segment_cache.seg[seg].limit; 884 885 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 886 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 887 return *p; 888 } 889 890 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 891 { 892 u32 *p = &vmx->segment_cache.seg[seg].ar; 893 894 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 895 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 896 return *p; 897 } 898 899 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 900 { 901 u32 eb; 902 903 eb = (1u << PF_VECTOR) 
| (1u << UD_VECTOR) | (1u << MC_VECTOR) | 904 (1u << DB_VECTOR) | (1u << AC_VECTOR); 905 /* 906 * #VE isn't used for VMX. To test against unexpected changes 907 * related to #VE for VMX, intercept unexpected #VE and warn on it. 908 */ 909 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 910 eb |= 1u << VE_VECTOR; 911 /* 912 * Guest access to VMware backdoor ports could legitimately 913 * trigger #GP because of TSS I/O permission bitmap. 914 * We intercept those #GP and allow access to them anyway 915 * as VMware does. 916 */ 917 if (enable_vmware_backdoor) 918 eb |= (1u << GP_VECTOR); 919 if ((vcpu->guest_debug & 920 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 921 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 922 eb |= 1u << BP_VECTOR; 923 if (to_vmx(vcpu)->rmode.vm86_active) 924 eb = ~0; 925 if (!vmx_need_pf_intercept(vcpu)) 926 eb &= ~(1u << PF_VECTOR); 927 928 /* When we are running a nested L2 guest and L1 specified for it a 929 * certain exception bitmap, we must trap the same exceptions and pass 930 * them to L1. When running L2, we will only handle the exceptions 931 * specified above if L1 did not want them. 932 */ 933 if (is_guest_mode(vcpu)) 934 eb |= get_vmcs12(vcpu)->exception_bitmap; 935 else { 936 int mask = 0, match = 0; 937 938 if (enable_ept && (eb & (1u << PF_VECTOR))) { 939 /* 940 * If EPT is enabled, #PF is currently only intercepted 941 * if MAXPHYADDR is smaller on the guest than on the 942 * host. In that case we only care about present, 943 * non-reserved faults. For vmcs02, however, PFEC_MASK 944 * and PFEC_MATCH are set in prepare_vmcs02_rare. 945 */ 946 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 947 match = PFERR_PRESENT_MASK; 948 } 949 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 950 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 951 } 952 953 /* 954 * Disabling xfd interception indicates that dynamic xfeatures 955 * might be used in the guest. Always trap #NM in this case 956 * to save guest xfd_err timely. 
957 */ 958 if (vcpu->arch.xfd_no_write_intercept) 959 eb |= (1u << NM_VECTOR); 960 961 vmcs_write32(EXCEPTION_BITMAP, eb); 962 } 963 964 /* 965 * Check if MSR is intercepted for currently loaded MSR bitmap. 966 */ 967 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 968 { 969 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 970 return true; 971 972 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 973 } 974 975 unsigned int __vmx_vcpu_enter_flags(struct vcpu_vmx *vmx) 976 { 977 unsigned int flags = 0; 978 979 if (vmx->loaded_vmcs->launched) 980 flags |= KVM_ENTER_VMRESUME; 981 982 /* 983 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 984 * to change it directly without causing a vmexit. In that case read 985 * it after vmexit and store it in vmx->spec_ctrl. 986 */ 987 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 988 flags |= KVM_ENTER_SAVE_SPEC_CTRL; 989 990 if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 991 kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) 992 flags |= KVM_ENTER_CLEAR_CPU_BUFFERS_FOR_MMIO; 993 994 return flags; 995 } 996 997 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 998 unsigned long entry, unsigned long exit) 999 { 1000 vm_entry_controls_clearbit(vmx, entry); 1001 vm_exit_controls_clearbit(vmx, exit); 1002 } 1003 1004 static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 1005 { 1006 unsigned int i; 1007 1008 for (i = 0; i < m->nr; ++i) { 1009 if (m->val[i].index == msr) 1010 return i; 1011 } 1012 return -ENOENT; 1013 } 1014 1015 static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, 1016 unsigned long vmcs_count_field) 1017 { 1018 int i; 1019 1020 i = vmx_find_loadstore_msr_slot(m, msr); 1021 if (i < 0) 1022 return; 1023 1024 --m->nr; 1025 m->val[i] = m->val[m->nr]; 1026 vmcs_write32(vmcs_count_field, m->nr); 1027 } 1028 1029 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1030 { 
1031 struct msr_autoload *m = &vmx->msr_autoload; 1032 1033 switch (msr) { 1034 case MSR_EFER: 1035 if (cpu_has_load_ia32_efer()) { 1036 clear_atomic_switch_msr_special(vmx, 1037 VM_ENTRY_LOAD_IA32_EFER, 1038 VM_EXIT_LOAD_IA32_EFER); 1039 return; 1040 } 1041 break; 1042 case MSR_CORE_PERF_GLOBAL_CTRL: 1043 if (cpu_has_load_perf_global_ctrl()) { 1044 clear_atomic_switch_msr_special(vmx, 1045 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1046 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1047 return; 1048 } 1049 break; 1050 } 1051 1052 vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT); 1053 vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT); 1054 } 1055 1056 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1057 unsigned long entry, unsigned long exit, 1058 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1059 u64 guest_val, u64 host_val) 1060 { 1061 vmcs_write64(guest_val_vmcs, guest_val); 1062 if (host_val_vmcs != HOST_IA32_EFER) 1063 vmcs_write64(host_val_vmcs, host_val); 1064 vm_entry_controls_setbit(vmx, entry); 1065 vm_exit_controls_setbit(vmx, exit); 1066 } 1067 1068 static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value, 1069 unsigned long vmcs_count_field, struct kvm *kvm) 1070 { 1071 int i; 1072 1073 i = vmx_find_loadstore_msr_slot(m, msr); 1074 if (i < 0) { 1075 if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm)) 1076 return; 1077 1078 i = m->nr++; 1079 m->val[i].index = msr; 1080 vmcs_write32(vmcs_count_field, m->nr); 1081 } 1082 m->val[i].value = value; 1083 } 1084 1085 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1086 u64 guest_val, u64 host_val) 1087 { 1088 struct msr_autoload *m = &vmx->msr_autoload; 1089 struct kvm *kvm = vmx->vcpu.kvm; 1090 1091 switch (msr) { 1092 case MSR_EFER: 1093 if (cpu_has_load_ia32_efer()) { 1094 add_atomic_switch_msr_special(vmx, 1095 VM_ENTRY_LOAD_IA32_EFER, 1096 VM_EXIT_LOAD_IA32_EFER, 1097 GUEST_IA32_EFER, 1098 HOST_IA32_EFER, 1099 guest_val, 
host_val);
1100 			return;
1101 		}
1102 		break;
1103 	case MSR_CORE_PERF_GLOBAL_CTRL:
1104 		if (cpu_has_load_perf_global_ctrl()) {
1105 			add_atomic_switch_msr_special(vmx,
1106 					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1107 					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1108 					GUEST_IA32_PERF_GLOBAL_CTRL,
1109 					HOST_IA32_PERF_GLOBAL_CTRL,
1110 					guest_val, host_val);
1111 			return;
1112 		}
1113 		break;
1114 	case MSR_IA32_PEBS_ENABLE:
1115 		/* PEBS needs a quiescent period after being disabled (to write
1116 		 * a record). Disabling PEBS through VMX MSR swapping doesn't
1117 		 * provide that period, so a CPU could write host's record into
1118 		 * guest's memory.
1119 		 */
1120 		wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
     		/* No return: PEBS_ENABLE is still added to the load lists below. */
1121 	}
1122
     	/* Generic path: switch the MSR via the VM-Entry/VM-Exit MSR-load lists. */
1123 	vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm);
1124 	vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm);
1125 }
1126
     /*
      * Decide how the guest's EFER is switched.  Returns true when EFER is to be
      * loaded through its user-return MSR slot (the slot's data and mask are
      * filled in), false when EFER is handled by the atomic-switch path or no
      * user-return slot exists for it.
      */
1127 static bool update_transition_efer(struct vcpu_vmx *vmx)
1128 {
1129 	u64 guest_efer = vmx->vcpu.arch.efer;
1130 	u64 ignore_bits = 0;
1131 	int i;
1132
1133 	/* Shadow paging assumes NX to be available. */
1134 	if (!enable_ept)
1135 		guest_efer |= EFER_NX;
1136
1137 	/*
1138 	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
1139 	 */
1140 	ignore_bits |= EFER_SCE;
1141 #ifdef CONFIG_X86_64
1142 	ignore_bits |= EFER_LMA | EFER_LME;
1143 	/* SCE is meaningful only in long mode on Intel */
1144 	if (guest_efer & EFER_LMA)
1145 		ignore_bits &= ~(u64)EFER_SCE;
1146 #endif
1147
1148 	/*
1149 	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1150 	 * On CPUs that support "load IA32_EFER", always switch EFER
1151 	 * atomically, since it's faster than switching it manually.
1152 	 */
1153 	if (cpu_has_load_ia32_efer() ||
1154 	    (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
1155 		if (!(guest_efer & EFER_LMA))
1156 			guest_efer &= ~EFER_LME;
     		/* An atomic switch is only needed when guest and host values differ. */
1157 		if (guest_efer != kvm_host.efer)
1158 			add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer);
1159 		else
1160 			clear_atomic_switch_msr(vmx, MSR_EFER);
1161 		return false;
1162 	}
1163
     	/* Otherwise use EFER's user-return MSR slot, if one is registered. */
1164 	i = kvm_find_user_return_msr(MSR_EFER);
1165 	if (i < 0)
1166 		return false;
1167
1168 	clear_atomic_switch_msr(vmx, MSR_EFER);
1169
     	/* Ignored bits take the host's value; the slot's mask is ~ignore_bits. */
1170 	guest_efer &= ~ignore_bits;
1171 	guest_efer |= kvm_host.efer & ignore_bits;
1172
1173 	vmx->guest_uret_msrs[i].data = guest_efer;
1174 	vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1175
1176 	return true;
1177 }
1178
     /*
      * Add @msr, with a value of 0, to the vCPU's msr_autostore list; the list's
      * entry count is kept in VM_EXIT_MSR_STORE_COUNT.
      */
1179 static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
1180 {
1181 	vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT,
1182 			 vmx->vcpu.kvm);
1183 }
1184
     /* Remove @msr from the vCPU's msr_autostore list. */
1185 static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
1186 {
1187 	vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT);
1188 }
1189
1190 #ifdef CONFIG_X86_32
1191 /*
1192  * On 32-bit kernels, VM exits still load the FS and GS bases from the
1193  * VMCS rather than the segment table. KVM uses this helper to figure
1194  * out the current bases to poke them into the VMCS before entry.
1195  */
1196 static unsigned long segment_base(u16 selector)
1197 {
1198 	struct desc_struct *table;
1199 	unsigned long v;
1200
     	/* Nothing set outside the RPL bits: report a base of 0. */
1201 	if (!(selector & ~SEGMENT_RPL_MASK))
1202 		return 0;
1203
1204 	table = get_current_gdt_ro();
1205
     	/*
     	 * LDT selector: the descriptor lives in the LDT, whose own base is
     	 * found by recursing on the LDT selector.
     	 */
1206 	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1207 		u16 ldt_selector = kvm_read_ldt();
1208
1209 		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1210 			return 0;
1211
1212 		table = (struct desc_struct *)segment_base(ldt_selector);
1213 	}
     	/* selector >> 3 drops the low three bits, leaving the table index. */
1214 	v = get_desc_base(&table[selector >> 3]);
1215 	return v;
1216 }
1217 #endif
1218
     /*
      * True when PT is in host-guest mode and TraceEn is clear in the guest's
      * RTIT_CTL value.
      */
1219 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1220 {
1221 	return vmx_pt_mode_is_host_guest() &&
1222 	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1223 }
1224
1225 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1226 {
1227 	/* The base must be 128-byte aligned and a legal physical address. */
1228 	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
1229 }
1230
     /*
      * Write @ctx's status, output base/mask and CR3 match values, plus the
      * first @addr_range ADDRn_A/ADDRn_B pairs, into the RTIT MSRs.  RTIT_CTL
      * itself is not touched here.
      */
1231 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1232 {
1233 	u32 i;
1234
1235 	wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
1236 	wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1237 	wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1238 	wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
     	/* Range i's A and B MSRs are addressed at ADDR0_A/ADDR0_B + 2 * i. */
1239 	for (i = 0; i < addr_range; i++) {
1240 		wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1241 		wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1242 	}
1243 }
1244
     /*
      * Read the same set of RTIT MSRs that pt_load_msr() writes back into @ctx.
      */
1245 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1246 {
1247 	u32 i;
1248
1249 	rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
1250 	rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1251 	rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1252 	rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1253 	for (i = 0; i < addr_range; i++) {
1254 		rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1255 		rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1256 	}
1257 }
1258
1259 static void pt_guest_enter(struct vcpu_vmx *vmx)
1260 {
1261 	if (vmx_pt_mode_is_system())
1262 		return;
1263
1264 	/*
1265 	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1266 	 * Save host state before VM entry.
1267 	 */
1268 	rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1269 	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
     		/* Clear RTIT_CTL in hardware before swapping the other PT MSRs. */
1270 		wrmsrq(MSR_IA32_RTIT_CTL, 0);
1271 		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1272 		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1273 	}
1274 }
1275
     /*
      * Counterpart of pt_guest_enter().  A no-op in system mode.  If the guest's
      * RTIT_CTL has TraceEn set, save the guest's PT MSRs and reload the host's;
      * then rewrite the host's saved RTIT_CTL if it is non-zero.
      */
1276 static void pt_guest_exit(struct vcpu_vmx *vmx)
1277 {
1278 	if (vmx_pt_mode_is_system())
1279 		return;
1280
1281 	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1282 		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1283 		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1284 	}
1285
1286 	/*
1287 	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1288 	 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
1289 	 */
1290 	if (vmx->pt_desc.host.ctl)
1291 		wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1292 }
1293
     /*
      * Update the HOST_{FS,GS}_SELECTOR and HOST_{FS,GS}_BASE VMCS fields from
      * the given values, using @host as a cache so that a field is written only
      * when its value changed.  A selector with any of its low three bits set
      * is written to the VMCS as 0, while the cache still records the real
      * selector.
      */
1294 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1295 			unsigned long fs_base, unsigned long gs_base)
1296 {
1297 	if (unlikely(fs_sel != host->fs_sel)) {
1298 		if (!(fs_sel & 7))
1299 			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1300 		else
1301 			vmcs_write16(HOST_FS_SELECTOR, 0);
1302 		host->fs_sel = fs_sel;
1303 	}
1304 	if (unlikely(gs_sel != host->gs_sel)) {
1305 		if (!(gs_sel & 7))
1306 			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1307 		else
1308 			vmcs_write16(HOST_GS_SELECTOR, 0);
1309 		host->gs_sel = gs_sel;
1310 	}
1311 	if (unlikely(fs_base != host->fs_base)) {
1312 		vmcs_writel(HOST_FS_BASE, fs_base);
1313 		host->fs_base = fs_base;
1314 	}
1315 	if (unlikely(gs_base != host->gs_base)) {
1316 		vmcs_writel(HOST_GS_BASE, gs_base);
1317 		host->gs_base = gs_base;
1318 	}
1319 }
1320
     /*
      * Prepare hardware state for running the guest: push the guest's
      * user-return MSRs if they are not loaded yet, sync vmcs12 to the shadow
      * VMCS if requested, and, unless guest state is already loaded, record the
      * host's segment state and program the host FS/GS fields of the VMCS.
      */
1321 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1322 {
1323 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1324 	struct vcpu_vt *vt =
to_vt(vcpu);
1325 	struct vmcs_host_state *host_state;
1326 #ifdef CONFIG_X86_64
1327 	int cpu = raw_smp_processor_id();
1328 #endif
1329 	unsigned long fs_base, gs_base;
1330 	u16 fs_sel, gs_sel;
1331 	int i;
1332
1333 	/*
1334 	 * Note that guest MSRs to be saved/restored can also be changed
1335 	 * when guest state is loaded. This happens when guest transitions
1336 	 * to/from long-mode by setting MSR_EFER.LMA.
1337 	 */
1338 	if (!vmx->guest_uret_msrs_loaded) {
1339 		vmx->guest_uret_msrs_loaded = true;
     		/* Only slots flagged load_into_hardware are written. */
1340 		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1341 			if (!vmx->guest_uret_msrs[i].load_into_hardware)
1342 				continue;
1343
1344 			kvm_set_user_return_msr(i,
1345 						vmx->guest_uret_msrs[i].data,
1346 						vmx->guest_uret_msrs[i].mask);
1347 		}
1348 	}
1349
1350 	if (vmx->nested.need_vmcs12_to_shadow_sync)
1351 		nested_sync_vmcs12_to_shadow(vcpu);
1352
     	/*
     	 * The host state captured below is still valid from an earlier call;
     	 * the flag is set at the end of this function and cleared again by
     	 * vmx_prepare_switch_to_host().
     	 */
1353 	if (vt->guest_state_loaded)
1354 		return;
1355
1356 	host_state = &vmx->loaded_vmcs->host_state;
1357
1358 	/*
1359 	 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1360 	 * allow segment selectors with cpl > 0 or ti == 1.
1361 	 */
1362 	host_state->ldt_sel = kvm_read_ldt();
1363
1364 #ifdef CONFIG_X86_64
1365 	savesegment(ds, host_state->ds_sel);
1366 	savesegment(es, host_state->es_sel);
1367
1368 	gs_base = cpu_kernelmode_gs_base(cpu);
     	/*
     	 * For a 64-bit mm the FS/GS selectors and bases come from
     	 * current->thread after current_save_fsgs(); otherwise they are read
     	 * from the segment registers and MSRs directly.
     	 */
1369 	if (likely(is_64bit_mm(current->mm))) {
1370 		current_save_fsgs();
1371 		fs_sel = current->thread.fsindex;
1372 		gs_sel = current->thread.gsindex;
1373 		fs_base = current->thread.fsbase;
1374 		vt->msr_host_kernel_gs_base = current->thread.gsbase;
1375 	} else {
1376 		savesegment(fs, fs_sel);
1377 		savesegment(gs, gs_sel);
1378 		fs_base = read_msr(MSR_FS_BASE);
1379 		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1380 	}
1381
     	/* The host's value is stashed in vt; install the guest's value. */
1382 	wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1383 #else
1384 	savesegment(fs, fs_sel);
1385 	savesegment(gs, gs_sel);
1386 	fs_base = segment_base(fs_sel);
1387 	gs_base = segment_base(gs_sel);
1388 #endif
1389
1390 	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1391 	vt->guest_state_loaded = true;
1392 }
1393
     /*
      * Undo vmx_prepare_switch_to_guest(): a no-op unless guest state is loaded.
      * Saves the guest's MSR_KERNEL_GS_BASE (64-bit only), reloads the host
      * segment registers that need it, restores the host's MSR_KERNEL_GS_BASE
      * and marks both guest state and the user-return MSRs as not loaded.
      */
1394 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1395 {
1396 	struct vmcs_host_state *host_state;
1397
1398 	if (!vmx->vt.guest_state_loaded)
1399 		return;
1400
1401 	host_state = &vmx->loaded_vmcs->host_state;
1402
1403 	++vmx->vcpu.stat.host_state_reload;
1404
1405 #ifdef CONFIG_X86_64
1406 	rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1407 #endif
     	/*
     	 * Selectors with any of the low three bits set were written to the
     	 * VMCS as 0 by vmx_set_host_fs_gs(); presumably that is why they are
     	 * reloaded by hand here -- confirm.
     	 */
1408 	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1409 		kvm_load_ldt(host_state->ldt_sel);
1410 #ifdef CONFIG_X86_64
1411 		load_gs_index(host_state->gs_sel);
1412 #else
1413 		loadsegment(gs, host_state->gs_sel);
1414 #endif
1415 	}
1416 	if (host_state->fs_sel & 7)
1417 		loadsegment(fs, host_state->fs_sel);
1418 #ifdef CONFIG_X86_64
1419 	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1420 		loadsegment(ds, host_state->ds_sel);
1421 		loadsegment(es, host_state->es_sel);
1422 	}
1423 #endif
1424 	invalidate_tss_limit();
1425 #ifdef CONFIG_X86_64
1426 	wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
1427 #endif
1428 	load_fixmap_gdt(raw_smp_processor_id());
1429 	vmx->vt.guest_state_loaded = false;
1430 	vmx->guest_uret_msrs_loaded = false;
1431 }
1432
1433 #ifdef CONFIG_X86_64
     /*
      * Return the guest's value of @msr.  While guest state is loaded, *@cache
      * is first refreshed from hardware; otherwise the cached value is returned
      * as is.  Preemption is disabled around the check and the read.
      */
1434 static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
1435 {
1436 	preempt_disable();
1437 	if (vmx->vt.guest_state_loaded)
1438 		*cache = read_msr(msr);
1439 	preempt_enable();
1440 	return *cache;
1441 }
1442
     /*
      * Set the guest's value of @msr to @data.  The MSR is written to hardware
      * only while guest state is loaded; *@cache is always updated.
      */
1443 static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
1444 				     u64 *cache)
1445 {
1446 	preempt_disable();
1447 	if (vmx->vt.guest_state_loaded)
1448 		wrmsrns(msr, data);
1449 	preempt_enable();
1450 	*cache = data;
1451 }
1452
     /* Read the guest's MSR_KERNEL_GS_BASE, cached in msr_guest_kernel_gs_base. */
1453 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1454 {
1455 	return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
1456 				       &vmx->msr_guest_kernel_gs_base);
1457 }
1458
     /* Write the guest's MSR_KERNEL_GS_BASE and update its cached copy. */
1459 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1460 {
1461 	vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
1462 				 &vmx->msr_guest_kernel_gs_base);
1463 }
1464 #endif
1465
     /*
      * Recompute the vCPU's ple_window with __grow_ple_window().  If the value
      * changed, mark it dirty and emit the ple_window_update tracepoint.
      */
1466 static void grow_ple_window(struct kvm_vcpu *vcpu)
1467 {
1468 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1469 	unsigned int old = vmx->ple_window;
1470
1471 	vmx->ple_window = __grow_ple_window(old, ple_window,
1472 					    ple_window_grow,
1473 					    ple_window_max);
1474
1475 	if (vmx->ple_window != old) {
1476 		vmx->ple_window_dirty = true;
1477 		trace_kvm_ple_window_update(vcpu->vcpu_id,
1478 					    vmx->ple_window, old);
1479 	}
1480 }
1481
     /*
      * Recompute the vCPU's ple_window with __shrink_ple_window(); the module's
      * ple_window value is passed both as the base and as the last argument.
      * If the value changed, mark it dirty and emit the tracepoint.
      */
1482 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1483 {
1484 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1485 	unsigned int old = vmx->ple_window;
1486
1487 	vmx->ple_window = __shrink_ple_window(old, ple_window,
1488 					      ple_window_shrink,
1489 					      ple_window);
1490
1491 	if (vmx->ple_window != old) {
1492 		vmx->ple_window_dirty = true;
1493 		trace_kvm_ple_window_update(vcpu->vcpu_id,
1494 					    vmx->ple_window, old);
1495 	}
1496 }
1497
     /*
      * Make vmx->loaded_vmcs the current VMCS on @cpu.  If the VMCS was last
      * loaded on a different CPU, loaded_vmcs_clear() is called on it, it is
      * linked into @cpu's loaded_vmcss_on_cpu list, a TLB flush is requested
      * and the per-CPU host fields (TR base, GDTR base, SYSENTER_ESP) are
      * rewritten.
      */
1498 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
1499 {
1500 	struct vcpu_vmx *vmx
= to_vmx(vcpu);
1501 	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1502 	struct vmcs *prev;
1503
1504 	if (!already_loaded) {
1505 		loaded_vmcs_clear(vmx->loaded_vmcs);
1506 		local_irq_disable();
1507
1508 		/*
1509 		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1510 		 * this cpu's percpu list, otherwise it may not yet be deleted
1511 		 * from its previous cpu's percpu list. Pairs with the
1512 		 * smp_wmb() in __loaded_vmcs_clear().
1513 		 */
1514 		smp_rmb();
1515
1516 		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1517 			 &per_cpu(loaded_vmcss_on_cpu, cpu));
1518 		local_irq_enable();
1519 	}
1520
     	/* Load the VMCS only if it is not already @cpu's current VMCS. */
1521 	prev = per_cpu(current_vmcs, cpu);
1522 	if (prev != vmx->loaded_vmcs->vmcs) {
1523 		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1524 		vmcs_load(vmx->loaded_vmcs->vmcs);
1525 	}
1526
1527 	if (!already_loaded) {
1528 		void *gdt = get_current_gdt_ro();
1529
1530 		/*
1531 		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
1532 		 * TLB entries from its previous association with the vCPU.
1533 		 */
1534 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1535
1536 		/*
1537 		 * Linux uses per-cpu TSS and GDT, so set these when switching
1538 		 * processors. See 22.2.4.
1539 		 */
1540 		vmcs_writel(HOST_TR_BASE,
1541 			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1542 		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
1543
1544 		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1545 			/* 22.2.3 */
1546 			vmcs_writel(HOST_IA32_SYSENTER_ESP,
1547 				    (unsigned long)(cpu_entry_stack(cpu) + 1));
1548 		}
1549
     		/* Record the new owner last, once the per-CPU fields are set. */
1550 		vmx->loaded_vmcs->cpu = cpu;
1551 	}
1552 }
1553
1554 /*
1555  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1556  * vcpu mutex is already taken.
1557  */
1558 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1559 {
     	/* Shrink the PLE window if the vCPU was scheduled out, unless pause-in-guest. */
1560 	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
1561 		shrink_ple_window(vcpu);
1562
1563 	vmx_vcpu_load_vmcs(vcpu, cpu);
1564
1565 	vmx_vcpu_pi_load(vcpu, cpu);
1566 }
1567
     /*
      * Counterpart of vmx_vcpu_load(): runs the posted-interrupt put hook and
      * restores host state via vmx_prepare_switch_to_host().
      */
1568 void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1569 {
1570 	vmx_vcpu_pi_put(vcpu);
1571
1572 	vmx_prepare_switch_to_host(to_vmx(vcpu));
1573 }
1574
     /*
      * Point the vCPU at @vmcs and load it on the current CPU.  The CPU is
      * pinned with get_cpu()/put_cpu() for the duration of the load.
      */
1575 static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu,
1576 				   struct loaded_vmcs *vmcs)
1577 {
1578 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1579 	int cpu;
1580
1581 	cpu = get_cpu();
1582 	vmx->loaded_vmcs = vmcs;
1583 	vmx_vcpu_load_vmcs(vcpu, cpu);
1584 	put_cpu();
1585 }
1586
     /*
      * Ensure vmcs01 is the loaded VMCS.  Outside guest mode this only checks
      * (with a WARN) that vmcs01 is already loaded; in guest mode it switches
      * from vmcs02 to vmcs01, warning if vmcs02 was not the loaded VMCS.
      */
1587 static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
1588 {
1589 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1590
1591 	if (!is_guest_mode(vcpu)) {
1592 		WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
1593 		return;
1594 	}
1595
1596 	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02);
1597 	vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01);
1598 }
1599
     /* Undo vmx_load_vmcs01(): in guest mode, switch back to vmcs02. */
1600 static void vmx_put_vmcs01(struct kvm_vcpu *vcpu)
1601 {
1602 	if (!is_guest_mode(vcpu))
1603 		return;
1604
1605 	vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02);
1606 }
     /*
      * Guard named vmx_vmcs01 that pairs vmx_load_vmcs01() with
      * vmx_put_vmcs01().
      * NOTE(review): presumably used through the kernel's guard()/scoped_guard()
      * helpers -- confirm against the callers.
      */
1607 DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *,
1608 	     vmx_load_vmcs01(_T), vmx_put_vmcs01(_T))
1609
     /*
      * True when the emulate_invalid_guest_state module parameter is set and
      * the current guest state is not valid.
      */
1610 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1611 {
1612 	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1613 }
1614
     /*
      * Return the guest's RFLAGS.  The VMCS is read only when RFLAGS is not
      * yet marked available; the result is cached in vmx->rflags.  While
      * vm86 mode is active, only the RMODE_GUEST_OWNED_EFLAGS_BITS come from
      * the VMCS and the remaining bits come from rmode.save_rflags.
      */
1615 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1616 {
1617 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1618 	unsigned long rflags, save_rflags;
1619
1620 	if (!kvm_register_is_available(vcpu, VCPU_REG_RFLAGS)) {
1621 		kvm_register_mark_available(vcpu, VCPU_REG_RFLAGS);
1622 		rflags = vmcs_readl(GUEST_RFLAGS);
1623 		if (vmx->rmode.vm86_active) {
1624 			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1625 			save_rflags = vmx->rmode.save_rflags;
1626 			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1627 		}
1628 		vmx->rflags = rflags;
1629 	}
1630
return vmx->rflags;
1631 }
1632
     /*
      * Set the guest's RFLAGS, updating both the cached copy and the VMCS.
      * While vm86 mode is active the requested value is kept in
      * rmode.save_rflags and IOPL and VM are forced on in the value written to
      * the VMCS.  emulation_required is re-evaluated when the VM flag changes.
      */
1633 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1634 {
1635 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1636 	unsigned long old_rflags;
1637
1638 	/*
1639 	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1640 	 * is an unrestricted guest in order to mark L2 as needing emulation
1641 	 * if L1 runs L2 as a restricted guest.
1642 	 */
1643 	if (is_unrestricted_guest(vcpu)) {
1644 		kvm_register_mark_available(vcpu, VCPU_REG_RFLAGS);
1645 		vmx->rflags = rflags;
1646 		vmcs_writel(GUEST_RFLAGS, rflags);
1647 		return;
1648 	}
1649
     	/* Fetch the old value before the cache is overwritten. */
1650 	old_rflags = vmx_get_rflags(vcpu);
1651 	vmx->rflags = rflags;
1652 	if (vmx->rmode.vm86_active) {
1653 		vmx->rmode.save_rflags = rflags;
1654 		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1655 	}
1656 	vmcs_writel(GUEST_RFLAGS, rflags);
1657
1658 	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1659 		vmx->vt.emulation_required = vmx_emulation_required(vcpu);
1660 }
1661
     /* Return whether RFLAGS.IF is set in the guest's RFLAGS. */
1662 bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1663 {
1664 	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1665 }
1666
     /*
      * Translate the STI and MOV-SS blocking bits of the VMCS interruptibility
      * state into KVM_X86_SHADOW_INT_* flags.
      */
1667 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1668 {
1669 	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1670 	int ret = 0;
1671
1672 	if (interruptibility & GUEST_INTR_STATE_STI)
1673 		ret |= KVM_X86_SHADOW_INT_STI;
1674 	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1675 		ret |= KVM_X86_SHADOW_INT_MOV_SS;
1676
1677 	return ret;
1678 }
1679
     /*
      * Replace the STI/MOV-SS blocking bits of the VMCS interruptibility state
      * according to @mask.  At most one bit is set: MOV-SS wins if @mask has
      * both.  The VMCS is written only when the value actually changes.
      */
1680 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1681 {
1682 	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1683 	u32 interruptibility = interruptibility_old;
1684
1685 	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1686
1687 	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1688 		interruptibility |= GUEST_INTR_STATE_MOV_SS;
1689 	else if (mask & KVM_X86_SHADOW_INT_STI)
1690 		interruptibility |= GUEST_INTR_STATE_STI;
1691
1692 	if ((interruptibility != interruptibility_old))
1693
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1694 }
1695
     /*
      * Validate @data as a value for IA32_RTIT_CTL.  Returns 0 if the value is
      * acceptable and 1 if it trips any of the checks below, each of which is
      * described as a #GP condition.
      */
1696 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1697 {
1698 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1699 	unsigned long value;
1700
1701 	/*
1702 	 * Any MSR write that attempts to change bits marked reserved will
1703 	 * cause a #GP fault.
1704 	 */
1705 	if (data & vmx->pt_desc.ctl_bitmask)
1706 		return 1;
1707
1708 	/*
1709 	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1710 	 * result in a #GP unless the same write also clears TraceEn.
1711 	 */
1712 	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1713 	    (data & RTIT_CTL_TRACEEN) &&
1714 	    data != vmx->pt_desc.guest.ctl)
1715 		return 1;
1716
1717 	/*
1718 	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit (ToPA)
1719 	 * and FabricEn would cause #GP, if
1720 	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1721 	 */
1722 	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1723 	    !(data & RTIT_CTL_FABRIC_EN) &&
1724 	    !intel_pt_validate_cap(vmx->pt_desc.caps,
1725 				   PT_CAP_single_range_output))
1726 		return 1;
1727
1728 	/*
1729 	 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
1730 	 * utilize encodings marked reserved will cause a #GP fault.
1731 	 */
     	/*
     	 * In each check below, value holds a capability bitmap and the field
     	 * extracted from @data is used as a bit index into it.
     	 */
1732 	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1733 	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1734 			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
1735 			RTIT_CTL_MTC_RANGE_OFFSET, &value))
1736 		return 1;
1737 	value = intel_pt_validate_cap(vmx->pt_desc.caps,
1738 						PT_CAP_cycle_thresholds);
1739 	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1740 			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
1741 			RTIT_CTL_CYC_THRESH_OFFSET, &value))
1742 		return 1;
1743 	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1744 	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1745 			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
1746 			RTIT_CTL_PSB_FREQ_OFFSET, &value))
1747 		return 1;
1748
1749 	/*
1750 	 * A #GP fault is raised if an ADDRx_CFG field is reserved (its address
1751 	 * range is not supported) or if its encoding is greater than 2.
1752 	 */
1753 	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1754 	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1755 		return 1;
1756 	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1757 	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1758 		return 1;
1759 	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1760 	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1761 		return 1;
1762 	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1763 	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1764 		return 1;
1765
1766 	return 0;
1767 }
1768
     /*
      * Decide whether emulation may proceed.  Returns X86EMUL_PROPAGATE_FAULT
      * (after queueing #UD) for a VM-Exit taken from an SGX enclave,
      * X86EMUL_UNHANDLEABLE_VECTORING when an event was being vectored and
      * @emul_type cannot be emulated in that state, and X86EMUL_CONTINUE
      * otherwise.  @insn and @insn_len are not used here.
      */
1769 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1770 				  void *insn, int insn_len)
1771 {
1772 	/*
1773 	 * Emulation of instructions in SGX enclaves is impossible as RIP does
1774 	 * not point at the failing instruction, and even if it did, the code
1775 	 * stream is inaccessible. Inject #UD instead of exiting to userspace
1776 	 * so that guest userspace can't DoS the guest simply by triggering
1777 	 * emulation (enclaves are CPL3 only).
1778 	 */
1779 	if (vmx_get_exit_reason(vcpu).enclave_mode) {
1780 		kvm_queue_exception(vcpu, UD_VECTOR);
1781 		return X86EMUL_PROPAGATE_FAULT;
1782 	}
1783
1784 	/* Check that emulation is possible during event vectoring */
1785 	if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
1786 	    !kvm_can_emulate_event_vectoring(emul_type))
1787 		return X86EMUL_UNHANDLEABLE_VECTORING;
1788
1789 	return X86EMUL_CONTINUE;
1790 }
1791
     /*
      * Advance the guest's RIP past the instruction that caused the VM-Exit and
      * clear the interrupt shadow.  Returns 1 on success and 0 if the
      * EMULTYPE_SKIP emulation fallback fails.
      */
1792 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1793 {
1794 	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
1795 	unsigned long rip, orig_rip;
1796 	u32 instr_len;
1797
1798 	/*
1799 	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1800 	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1801 	 * set when EPT misconfig occurs. In practice, real hardware updates
1802 	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1803 	 * (namely Hyper-V) don't set it due to it being undefined behavior,
1804 	 * i.e. we end up advancing IP with some random value.
1805 	 */
1806 	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1807 	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1808 		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1809
1810 		/*
1811 		 * Emulating an enclave's instructions isn't supported as KVM
1812 		 * cannot access the enclave's memory or its true RIP, e.g. the
1813 		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
1814 		 * the RIP that actually triggered the VM-Exit. But, because
1815 		 * most instructions that cause VM-Exit will #UD in an enclave,
1816 		 * most instruction-based VM-Exits simply do not occur.
1817 		 *
1818 		 * There are a few exceptions, notably the debug instructions
1819 		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1820 		 * and generate #DB/#BP as expected, which KVM might intercept.
1821 		 * But again, the CPU does the dirty work and saves an instr
1822 		 * length of zero so VMMs don't shoot themselves in the foot.
1823 		 * WARN if KVM tries to skip a non-zero length instruction on
1824 		 * a VM-Exit from an enclave.
1825 		 */
1826 		if (!instr_len)
1827 			goto rip_updated;
1828
1829 		WARN_ONCE(exit_reason.enclave_mode,
1830 			  "skipping instruction after SGX enclave VM-Exit");
1831
1832 		orig_rip = kvm_rip_read(vcpu);
1833 		rip = orig_rip + instr_len;
1834 #ifdef CONFIG_X86_64
1835 		/*
1836 		 * We need to mask out the high 32 bits of RIP if not in 64-bit
1837 		 * mode, but just finding out that we are in 64-bit mode is
1838 		 * quite expensive. Only do it if there was a carry.
1839 		 */
     		/*
     		 * (rip ^ orig_rip) >> 31 == 3 means bits 31 and 32 both changed
     		 * and no higher bit did, i.e. the addition carried out of bit 31
     		 * into bit 32.
     		 */
1840 		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1841 			rip = (u32)rip;
1842 #endif
1843 		kvm_rip_write(vcpu, rip);
1844 	} else {
     		/*
     		 * EPT misconfig while running under a hypervisor: do not rely on
     		 * the VMCS instruction length, let the emulator skip instead.
     		 */
1845 		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1846 			return 0;
1847 	}
1848
1849 rip_updated:
1850 	/* skipping an emulated instruction also counts */
1851 	vmx_set_interrupt_shadow(vcpu, 0);
1852
1853 	return 1;
1854 }
1855
1856 /*
1857  * Recognizes a pending MTF VM-exit and records the nested state for later
1858  * delivery.
1859  */
1860 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1861 {
1862 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1863 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1864
     	/* MTF bookkeeping only applies while the vCPU is in guest mode. */
1865 	if (!is_guest_mode(vcpu))
1866 		return;
1867
1868 	/*
1869 	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1870 	 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1871 	 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1872 	 * intercepted #DB deliberately avoids single-step #DB and MTF updates
1873 	 * as ICEBP is higher priority than both. As instruction emulation is
1874 	 * completed at this point (i.e. KVM is at the instruction boundary),
1875 	 * any #DB exception pending delivery must be a debug-trap of lower
1876 	 * priority than MTF. Record the pending MTF state to be delivered in
1877 	 * vmx_check_nested_events().
1878 	 */
1879 	if (nested_cpu_has_mtf(vmcs12) &&
1880 	    (!vcpu->arch.exception.pending ||
1881 	     vcpu->arch.exception.vector == DB_VECTOR) &&
1882 	    (!vcpu->arch.exception_vmexit.pending ||
1883 	     vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1884 		vmx->nested.mtf_pending = true;
1885 		kvm_make_request(KVM_REQ_EVENT, vcpu);
1886 	} else {
1887 		vmx->nested.mtf_pending = false;
1888 	}
1889 }
1890
     /*
      * Update the pending-MTF state, then skip the instruction.  Returns the
      * result of skip_emulated_instruction().
      */
1891 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1892 {
1893 	vmx_update_emulated_instruction(vcpu);
1894 	return skip_emulated_instruction(vcpu);
1895 }
1896
     /*
      * If HLT-in-guest is enabled for the VM and the VMCS activity state is
      * HLT, switch the activity state back to ACTIVE.
      */
1897 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1898 {
1899 	/*
1900 	 * Ensure that we clear the HLT state in the VMCS. We don't need to
1901 	 * explicitly skip the instruction because if the HLT state is set,
1902 	 * then the instruction is already executing and RIP has already been
1903 	 * advanced.
1904 	 */
1905 	if (kvm_hlt_in_guest(vcpu->kvm) &&
1906 	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1907 		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1908 }
1909
     /*
      * Inject the exception queued in vcpu->arch.exception.  In vm86 mode the
      * exception is delivered through kvm_inject_realmode_interrupt();
      * otherwise it is programmed into the VM-Entry interruption-information
      * field as a soft or hard exception.
      */
1910 void vmx_inject_exception(struct kvm_vcpu *vcpu)
1911 {
1912 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
1913 	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1914 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1915
1916 	/*
1917 	 * When injecting a #DB, single-stepping is enabled in RFLAGS, and STI
1918 	 * or MOV-SS blocking is active, set vmcs.PENDING_DBG_EXCEPTIONS.BS to
1919 	 * prevent a false positive from VM-Entry consistency check. VM-Entry
1920 	 * asserts that a single-step #DB _must_ be pending in this scenario,
1921 	 * as the previous instruction cannot have toggled RFLAGS.TF 0=>1
1922 	 * (because STI and POP/MOV don't modify RFLAGS), therefore the one
1923 	 * instruction delay when activating single-step breakpoints must have
1924 	 * already expired. However, the CPU isn't smart enough to peek at
1925 	 * vmcs.VM_ENTRY_INTR_INFO_FIELD and so doesn't realize that yes, there
1926 	 * is indeed a #DB pending/imminent.
1927 	 */
1928 	if (ex->vector == DB_VECTOR &&
1929 	    (vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
1930 	    vmx_get_interrupt_shadow(vcpu))
1931 		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
1932 			    vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
1933
1934 	kvm_deliver_exception_payload(vcpu, ex);
1935
1936 	if (ex->has_error_code) {
1937 		/*
1938 		 * Despite the error code being architecturally defined as 32
1939 		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1940 		 * VMX don't actually support setting bits 31:16. Hardware
1941 		 * will (should) never provide a bogus error code, but AMD CPUs
1942 		 * do generate error codes with bits 31:16 set, and so KVM's
1943 		 * ABI lets userspace shove in arbitrary 32-bit values. Drop
1944 		 * the upper bits to avoid VM-Fail, losing information that
1945 		 * doesn't really exist is preferable to killing the VM.
1946 		 */
1947 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1948 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1949 	}
1950
     	/*
     	 * vm86 mode: deliver via kvm_inject_realmode_interrupt(); for soft
     	 * exceptions the instruction length is passed as inc_eip.
     	 */
1951 	if (vmx->rmode.vm86_active) {
1952 		int inc_eip = 0;
1953 		if (kvm_exception_is_soft(ex->vector))
1954 			inc_eip = vcpu->arch.event_exit_inst_len;
1955 		kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1956 		return;
1957 	}
1958
1959 	WARN_ON_ONCE(vmx->vt.emulation_required);
1960
     	/* Soft exceptions also need the instruction length for VM-Entry. */
1961 	if (kvm_exception_is_soft(ex->vector)) {
1962 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1963 			     vmx->vcpu.arch.event_exit_inst_len);
1964 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1965 	} else
1966 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
1967
1968 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1969
1970 	vmx_clear_hlt(vcpu);
1971 }
1972
     /*
      * Set the load_into_hardware flag of @msr's user-return slot.  Does
      * nothing if the vCPU has no slot for @msr.
      */
1973 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1974 			       bool load_into_hardware)
1975 {
1976 	struct vmx_uret_msr *uret_msr;
1977
1978 	uret_msr = vmx_find_uret_msr(vmx, msr);
1979 	if (!uret_msr)
1980 		return;
1981
1982 	uret_msr->load_into_hardware = load_into_hardware;
1983 }
1984
1985 /*
1986  * Configuring user return MSRs to automatically save, load, and
restore MSRs
1987  * that need to be shoved into hardware when running the guest. Note, omitting
1988  * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1989  * loaded into hardware when running the guest.
1990  */
1991 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1992 {
1993 #ifdef CONFIG_X86_64
1994 	bool load_syscall_msrs;
1995
1996 	/*
1997 	 * The SYSCALL MSRs are only needed on long mode guests, and only
1998 	 * when EFER.SCE is set.
1999 	 */
2000 	load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
2001 			    (vmx->vcpu.arch.efer & EFER_SCE);
2002
2003 	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
2004 	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
2005 	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
2006 #endif
     	/*
     	 * update_transition_efer() returns true only when EFER is to be loaded
     	 * through its user-return slot.
     	 */
2007 	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
2008
     	/* TSC_AUX is loaded only if the guest has RDTSCP or RDPID. */
2009 	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
2010 			   guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
2011 			   guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
2012
2013 	/*
2014 	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
2015 	 * kernel and old userspace. If those guests run on a tsx=off host, do
2016 	 * allow guests to use TSX_CTRL, but don't change the value in hardware
2017 	 * so that TSX remains always disabled.
2018 	 */
2019 	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
2020
2021 	/*
2022 	 * The set of MSRs to load may have changed, reload MSRs before the
2023 	 * next VM-Enter.
2024 	 */
2025 	vmx->guest_uret_msrs_loaded = false;
2026 }
2027
     /*
      * Return vmcs12's TSC offset if vmcs12 enables "use TSC offsetting",
      * otherwise 0.
      */
2028 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
2029 {
2030 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2031
2032 	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
2033 		return vmcs12->tsc_offset;
2034
2035 	return 0;
2036 }
2037
     /*
      * Return vmcs12's TSC multiplier if vmcs12 enables both TSC offsetting and
      * TSC scaling, otherwise KVM's default scaling ratio.
      */
2038 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
2039 {
2040 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2041
2042 	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
2043 	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
2044 		return vmcs12->tsc_multiplier;
2045
2046 	return kvm_caps.default_tsc_scaling_ratio;
2047 }
2048
     /* Write the vCPU's current TSC offset to the VMCS. */
2049 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
2050 {
2051 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2052 }
2053
     /* Write the vCPU's current TSC scaling ratio to the VMCS. */
2054 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
2055 {
2056 	vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2057 }
2058
2059 /*
2060  * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
2061  * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
2062  * backwards compatibility even though KVM doesn't support emulating SMX. And
2063  * because userspace set "VMX in SMX", the guest must also be allowed to set it,
2064  * e.g. if the MSR is left unlocked and the guest does a RMW operation.
2065  */
2066 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
2067 				       FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
2068 				       FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
2069 				       FEAT_CTL_SGX_LC_ENABLED | \
2070 				       FEAT_CTL_SGX_ENABLED | \
2071 				       FEAT_CTL_LMCE_ENABLED)
2072
     /*
      * Check whether @msr->data is an acceptable IA32_FEATURE_CONTROL value.
      * Guest writes are rejected once the MSR is locked and may only set bits
      * in the vCPU's valid-bits mask; host-initiated writes may set any bit in
      * KVM_SUPPORTED_FEATURE_CONTROL.
      */
2073 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
2074 						      struct msr_data *msr)
2075 {
2076 	uint64_t valid_bits;
2077
2078 	/*
2079 	 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
2080 	 * exposed to the guest.
	 */
	WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
		     ~KVM_SUPPORTED_FEATURE_CONTROL);

	/* A guest write is rejected once the lock bit is set in the MSR. */
	if (!msr->host_initiated &&
	    (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
		return false;

	/*
	 * Host-initiated writes may use any KVM-supported bit; guest writes
	 * are limited to the per-vCPU valid bits.
	 */
	if (msr->host_initiated)
		valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
	else
		valid_bits = vmx->msr_ia32_feature_control_valid_bits;

	return !(msr->data & ~valid_bits);
}

/*
 * Read a feature MSR into '*data'.  Only the emulated VMX MSR range is
 * handled: returns 1 if the 'nested' module parameter is off, otherwise the
 * result of vmx_get_vmx_msr() on the global vmcs_config.nested values.  Any
 * other index yields KVM_MSR_RET_UNSUPPORTED.
 */
int vmx_get_feature_msr(u32 msr, u64 *data)
{
	switch (msr) {
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!nested)
			return 1;
		return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
	default:
		return KVM_MSR_RET_UNSUPPORTED;
	}
}

/*
 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 *
 * Most cases that depend on a guest capability are waived for
 * host-initiated accesses (msr_info->host_initiated).  MSRs not handled
 * explicitly are looked up in the user-return MSR list and finally handed
 * to kvm_get_msr_common().
 */
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	u32 index;

	switch (msr_info->index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		msr_info->data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		msr_info->data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
		break;
#endif
	case MSR_EFER:
		/* Bypass the uret lookup in the default case. */
		return kvm_get_msr_common(vcpu, msr_info);
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		goto find_uret_msr;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		msr_info->data = vmx->msr_ia32_umwait_control;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		msr_info->data = to_vmx(vcpu)->spec_ctrl;
		break;
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		/* Host MPX support is required even for host-initiated reads. */
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		/* Guest reads require LMCE to be enabled in IA32_FEAT_CTL. */
		if (!msr_info->host_initiated &&
		    !(vmx->msr_ia32_feature_control &
		      FEAT_CTL_LMCE_ENABLED))
			return 1;
		msr_info->data = vcpu->arch.mcg_ext_ctl;
		break;
	case MSR_IA32_FEAT_CTL:
		msr_info->data = vmx->msr_ia32_feature_control;
		break;
	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
		if (!msr_info->host_initiated &&
		    !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
			return 1;
		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
		break;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		/* No host_initiated exemption: guest VMX is always required. */
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return 1;
		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
				    &msr_info->data))
			return 1;
#ifdef CONFIG_KVM_HYPERV
		/*
		 * Enlightened VMCS v1 doesn't have certain VMCS fields but
		 * instead of just ignoring the features, different Hyper-V
		 * versions are either trying to use them and fail or do some
		 * sanity checking and refuse to boot. Filter all unsupported
		 * features out.
		 */
		if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
			nested_evmcs_filter_control_msr(vcpu, msr_info->index,
							&msr_info->data);
#endif
		break;
	/*
	 * Intel PT (RTIT) MSRs: readable only when
	 * vmx_pt_mode_is_host_guest() is true, and some additionally require
	 * the matching capability in pt_desc.caps.
	 */
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.ctl;
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.status;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!vmx_pt_mode_is_host_guest() ||
			!intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cr3_filtering))
			return 1;
		msr_info->data = vmx->pt_desc.guest.cr3_match;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!vmx_pt_mode_is_host_guest() ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_base;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!vmx_pt_mode_is_host_guest() ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_mask;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		/*
		 * The _A/_B MSRs alternate: even offsets index addr_a[], odd
		 * offsets index addr_b[], with range number index / 2.
		 */
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (!vmx_pt_mode_is_host_guest() ||
		    (index >= 2 * vmx->pt_desc.num_address_ranges))
			return 1;
		if (index % 2)
			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
		else
			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
		break;
	case MSR_IA32_S_CET:
		msr_info->data = vmcs_readl(GUEST_S_CET);
		break;
	case MSR_KVM_INTERNAL_GUEST_SSP:
		msr_info->data = vmcs_readl(GUEST_SSP);
		break;
	case MSR_IA32_INT_SSP_TAB:
		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
		break;
	case MSR_IA32_DEBUGCTLMSR:
		msr_info->data = vmx_guest_debugctl_read();
		break;
	default:
	find_uret_msr:
		/* Prefer the cached user-return MSR value, if the MSR has one. */
		msr = vmx_find_uret_msr(vmx, msr_info->index);
		if (msr) {
			msr_info->data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_info);
	}

	return 0;
}

/*
 * Truncate a SYSENTER_EIP/ESP value written while L2 is active.  On 64-bit
 * kernels the value is cut to 32 bits if the guest CPUID lacks long mode;
 * otherwise it is cast through unsigned long (a no-op on 64-bit kernels).
 */
static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
						    u64 data)
{
#ifdef CONFIG_X86_64
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return (u32)data;
#endif
	return (unsigned long)data;
}

/*
 * Build the mask of IA32_DEBUGCTL bits that may be set.  Each bit needs
 * host support plus either a host-initiated access or the matching guest
 * capability.
 */
u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
	u64 debugctl = 0;

	if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
	    (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
		debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;

	if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
	    (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
		debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;

	if (boot_cpu_has(X86_FEATURE_RTM) &&
	    (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
		debugctl |= DEBUGCTLMSR_RTM_DEBUG;

	return debugctl;
}

/*
 * Return true if 'data' is an acceptable IA32_DEBUGCTL value.  Unsupported
 * BTF and LBR bits are reported via kvm_pr_unimpl_wrmsr() and then
 * tolerated; any other unsupported bit makes the value invalid.
 */
bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
{
	u64 invalid;

	invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
	if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
		kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
		invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
	}
	return !invalid;
}

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	int ret = 0;
	u32 msr_index = msr_info->index;
	u64 data = msr_info->data;
	u32 index;

	switch (msr_index) {
	case MSR_EFER:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		/* The segment cache is cleared before the base is rewritten. */
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_write_guest_kernel_gs_base(vmx, data);
		break;
	case MSR_IA32_XFD:
		ret = kvm_set_msr_common(vcpu, msr_info);
		/*
		 * Always intercepting WRMSR could incur non-negligible
		 * overhead given xfd might be changed frequently in
		 * guest context switch. Disable write interception
		 * upon the first write with a non-zero value (indicating
		 * potential usage on dynamic xfeatures). Also update
		 * exception bitmap to trap #NM for proper virtualization
		 * of guest xfd_err.
		 */
		if (!ret && data) {
			vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
						      MSR_TYPE_RW);
			vcpu->arch.xfd_no_write_intercept = true;
			vmx_update_exception_bitmap(vcpu);
		}
		break;
#endif
	/*
	 * SYSENTER MSRs: while L2 is active the value is also mirrored into
	 * vmcs12 (EIP/ESP are truncated first), then written to the VMCS.
	 */
	case MSR_IA32_SYSENTER_CS:
		if (is_guest_mode(vcpu))
			get_vmcs12(vcpu)->guest_sysenter_cs = data;
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_eip = data;
		}
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_esp = data;
		}
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_DEBUGCTLMSR:
		if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
			return 1;

		/*
		 * Strip unsupported bits that validation tolerated (see
		 * vmx_is_valid_debugctl()) before the value is stored.
		 */
		data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);

		/* Mirror into vmcs12 only if L1 asked to save debug controls. */
		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
						VM_EXIT_SAVE_DEBUG_CONTROLS)
			get_vmcs12(vcpu)->guest_ia32_debugctl = data;

		vmx_guest_debugctl_write(vcpu, data);

		/* Create the guest LBR event on first use of DEBUGCTL.LBR. */
		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
		    (data & DEBUGCTLMSR_LBR))
			intel_pmu_create_guest_lbr_event(vcpu);
		/* Returns directly, skipping the common tail of this function. */
		return 0;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		/* Reject a non-canonical base or any reserved bit. */
		if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
		    (data & MSR_IA32_BNDCFGS_RSVD))
			return 1;

		if (is_guest_mode(vcpu) &&
		    ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
		     (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
			get_vmcs12(vcpu)->guest_bndcfgs = data;

		vmcs_write64(GUEST_BNDCFGS, data);
		break;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
			return 1;

		vmx->msr_ia32_umwait_control = data;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		if (kvm_spec_ctrl_test_value(data))
			return 1;

		vmx->spec_ctrl = data;
		/* A zero value leaves the MSR intercepted. */
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging. We update the vmcs01 here for L1 as well
		 * since it will end up touching the MSR anyway now.
		 */
		vmx_disable_intercept_for_msr(vcpu,
					      MSR_IA32_SPEC_CTRL,
					      MSR_TYPE_RW);
		break;
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		/* Only RTM_DISABLE and CPUID_CLEAR may be set. */
		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
			return 1;
		goto find_uret_msr;
	case MSR_IA32_CR_PAT:
		ret = kvm_set_msr_common(vcpu, msr_info);
		if (ret)
			break;

		if (is_guest_mode(vcpu) &&
		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
			get_vmcs12(vcpu)->guest_ia32_pat = data;

		/* GUEST_IA32_PAT is written only if VM-Entry loads PAT. */
		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
			vmcs_write64(GUEST_IA32_PAT, data);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		/*
		 * Guest writes need LMCE enabled in IA32_FEAT_CTL; no writer
		 * may set bits other than MCG_EXT_CTL_LMCE_EN.
		 */
		if ((!msr_info->host_initiated &&
		     !(to_vmx(vcpu)->msr_ia32_feature_control &
		       FEAT_CTL_LMCE_ENABLED)) ||
		    (data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = data;
		break;
	case MSR_IA32_FEAT_CTL:
		if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
			return 1;

		vmx->msr_ia32_feature_control = data;
		/* A host-initiated write of 0 also calls vmx_leave_nested(). */
		if (msr_info->host_initiated && data == 0)
			vmx_leave_nested(vcpu);

		/* SGX may be enabled/disabled by guest's firmware */
		vmx_write_encls_bitmap(vcpu, NULL);
		break;
	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
		/*
		 * On real hardware, the LE hash MSRs are writable before
		 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
		 * at which point SGX related bits in IA32_FEATURE_CONTROL
		 * become writable.
		 *
		 * KVM does not emulate SGX activation for simplicity, so
		 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
		 * is unlocked. This is technically not architectural
		 * behavior, but it's close enough.
		 */
		if (!msr_info->host_initiated &&
		    (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
		    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
		    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
			return 1;
		vmx->msr_ia32_sgxlepubkeyhash
			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
		break;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!msr_info->host_initiated)
			return 1; /* they are read-only */
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return 1;
		return vmx_set_vmx_msr(vcpu, msr_index, data);
	case MSR_IA32_RTIT_CTL:
		/* Also refused while vmx->nested.vmxon is set. */
		if (!vmx_pt_mode_is_host_guest() ||
			vmx_rtit_ctl_check(vcpu, data) ||
			vmx->nested.vmxon)
			return 1;
		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
		vmx->pt_desc.guest.ctl = data;
		pt_update_intercept_for_msr(vcpu);
		break;
	/*
	 * The remaining Intel PT MSRs are only cached in pt_desc.guest here;
	 * each is gated on pt_can_write_msr() plus per-MSR checks.
	 */
	case MSR_IA32_RTIT_STATUS:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (data & MSR_IA32_RTIT_STATUS_MASK)
			return 1;
		vmx->pt_desc.guest.status = data;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_cr3_filtering))
			return 1;
		vmx->pt_desc.guest.cr3_match = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		if (!pt_output_base_valid(vcpu, data))
			return 1;
		vmx->pt_desc.guest.output_base = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		vmx->pt_desc.guest.output_mask = data;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		if (!pt_can_write_msr(vmx))
			return 1;
		/* Even offsets select addr_a[], odd offsets addr_b[]. */
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (index >= 2 * vmx->pt_desc.num_address_ranges)
			return 1;
		if (is_noncanonical_msr_address(data, vcpu))
			return 1;
		if (index % 2)
			vmx->pt_desc.guest.addr_b[index / 2] = data;
		else
			vmx->pt_desc.guest.addr_a[index / 2] = data;
		break;
	case MSR_IA32_S_CET:
		vmcs_writel(GUEST_S_CET, data);
		break;
	case MSR_KVM_INTERNAL_GUEST_SSP:
		vmcs_writel(GUEST_SSP, data);
		break;
	case MSR_IA32_INT_SSP_TAB:
		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
		break;
	case MSR_IA32_PERF_CAPABILITIES:
		/*
		 * A non-zero LBR format must equal the supported one, and
		 * cpuid_model_is_consistent() must hold.
		 */
		if (data & PERF_CAP_LBR_FMT) {
			if ((data & PERF_CAP_LBR_FMT) !=
			    (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
				return 1;
			if (!cpuid_model_is_consistent(vcpu))
				return 1;
		}
		/*
		 * Likewise for PEBS: the PEBS bits must match what KVM
		 * supports, and the guest needs DS and DTES64.
		 */
		if (data & PERF_CAP_PEBS_FORMAT) {
			if ((data & PERF_CAP_PEBS_MASK) !=
			    (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
				return 1;
			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
				return 1;
			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
				return 1;
			if (!cpuid_model_is_consistent(vcpu))
				return 1;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;

	default:
	find_uret_msr:
		/* User-return MSRs get their own setter; the rest go to common code. */
		msr = vmx_find_uret_msr(vmx, msr_index);
		if (msr)
			ret = vmx_set_guest_uret_msr(vmx, msr, data);
		else
			ret = kvm_set_msr_common(vcpu, msr_info);
	}

	/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
	if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
		vmx_update_fb_clear_dis(vcpu, vmx);

	return ret;
}

/*
 * Refresh KVM's cached copy of guest register 'reg' from the VMCS and mark
 * the register available.  An unhandled register triggers KVM_BUG_ON().
 */
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	unsigned long guest_owned_bits;

	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REG_RIP:
		vcpu->arch.rip = vmcs_readl(GUEST_RIP);
		break;
	case VCPU_REG_PDPTR:
		if (enable_ept)
			ept_save_pdptrs(vcpu);
		break;
	case VCPU_REG_CR0:
		/* Only guest-owned bits are taken from the VMCS value. */
		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

		vcpu->arch.cr0 &= ~guest_owned_bits;
		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
		break;
	case VCPU_REG_CR3:
		/*
		 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
		 * CR3 is loaded into hardware, not the guest's CR3.
		 */
		if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
		break;
	case VCPU_REG_CR4:
		/* Same guest-owned-bits merge as for CR0. */
		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

		vcpu->arch.cr4 &= ~guest_owned_bits;
		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
		break;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
		break;
	}
}

/*
 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
 * directly instead of going through cpu_has(), to ensure KVM is trapping
 * ENCLS whenever it's supported in hardware.  It does not matter whether
 * the host OS supports or has enabled SGX.
 *
 * Returns true if CPUID leaf 0x12 exists and bit 0 of its EAX is set.
 */
static bool cpu_has_sgx(void)
{
	return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}

/*
 * Compute a 32-bit VMX control word from capability MSR 'msr'.  Starts from
 * ctl_min | ctl_opt, clears bits the MSR's high word disallows and sets bits
 * its low word requires.  Stores the result in '*result' and returns 0, or
 * returns -EIO if a required (ctl_min) bit cannot be set.
 */
static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported.
	 */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}

/*
 * 64-bit variant used for controls that have no required bits: returns the
 * subset of 'ctl_opt' that capability MSR 'msr' reports as allowed.
 */
static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
	u64 allowed;

	rdmsrq(msr, allowed);

	return ctl_opt & allowed;
}

/*
 * Check that each VM-Entry/VM-Exit control pair in 'pairs' is either fully
 * enabled or fully disabled in 'entry_controls'/'exit_controls'.
 *
 * For every pair where only one side is set, a one-time warning is printed
 * and both bits are cleared.  NOTE: 'entry_controls' and 'exit_controls' are
 * modified in place, so callers must pass lvalues.  'pairs' must be a real
 * array (ARRAY_SIZE is applied to it).
 *
 * Evaluates to -EIO if any pair was inconsistent and
 * error_on_inconsistent_vmcs_config is set, otherwise 0.
 */
#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls)	\
({										\
	int i, r = 0;								\
										\
	BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls));	\
	BUILD_BUG_ON(sizeof(pairs[0].exit_control)  != sizeof(exit_controls));	\
										\
	for (i = 0; i < ARRAY_SIZE(pairs); i++) {				\
		typeof(entry_controls) n_ctrl = pairs[i].entry_control;		\
		typeof(exit_controls) x_ctrl = pairs[i].exit_control;		\
										\
		if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl))	\
			continue;						\
										\
		pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, "		\
			     "entry = %llx (%llx), exit = %llx (%llx)\n",	\
			     (u64)(entry_controls & n_ctrl), (u64)n_ctrl,	\
			     (u64)(exit_controls & x_ctrl), (u64)x_ctrl);	\
										\
		if (error_on_inconsistent_vmcs_config)				\
			r = -EIO;						\
										\
		entry_controls &= ~n_ctrl;					\
		exit_controls &= ~x_ctrl;					\
	}									\
	r;									\
})

/*
 * Probe the VMX capability MSRs of the current CPU and fill '*vmcs_conf'
 * (and the EPT/VPID words of '*vmx_cap') with the controls KVM will use.
 * Returns 0 on success or -EIO if a required control or property is missing.
 */
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
			     struct vmx_capability *vmx_cap)
{
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
	u32 _cpu_based_2nd_exec_control = 0;
	u64 _cpu_based_3rd_exec_control = 0;
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;
	u64 basic_msr;
	u64 misc_msr;

	/*
	 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
	 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
	 * intercepts writes to PAT and EFER, i.e. never enables those controls.
	 */
	struct {
		u32 entry_control;
		u32 exit_control;
	} const vmcs_entry_exit_pairs[] = {
		{ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,	VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
		{ VM_ENTRY_LOAD_IA32_PAT,		VM_EXIT_LOAD_IA32_PAT },
		{ VM_ENTRY_LOAD_IA32_EFER,		VM_EXIT_LOAD_IA32_EFER },
		{ VM_ENTRY_LOAD_BNDCFGS,		VM_EXIT_CLEAR_BNDCFGS },
		{ VM_ENTRY_LOAD_IA32_RTIT_CTL,		VM_EXIT_CLEAR_IA32_RTIT_CTL },
		{ VM_ENTRY_LOAD_CET_STATE,		VM_EXIT_LOAD_CET_STATE },
	};

	memset(vmcs_conf, 0, sizeof(*vmcs_conf));

	/* Primary processor-based controls; a missing required bit is fatal. */
	if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
				KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
				MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control))
		return -EIO;
	/* Secondary controls are probed only if the primary word enables them. */
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
		if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
					KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
					MSR_IA32_VMX_PROCBASED_CTLS2,
					&_cpu_based_2nd_exec_control))
			return -EIO;
	}
	/* EPT-violation #VE is kept only when CONFIG_KVM_INTEL_PROVE_VE is on. */
	if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;

#ifndef CONFIG_X86_64
	/* On 32-bit kernels, TPR shadow is dropped without APIC-access virtualization. */
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

	/* Without TPR shadow, drop the APIC virtualization controls below. */
	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_2nd_exec_control &= ~(
				SECONDARY_EXEC_APIC_REGISTER_VIRT |
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

	/* The rdmsr_safe() return value is not checked here. */
	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
		&vmx_cap->ept, &vmx_cap->vpid);

	/*
	 * EPT capabilities reported without the "enable EPT" control are
	 * inconsistent: warn, then either fail or discard the capabilities
	 * and the EPT-dependent controls.
	 */
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
	    vmx_cap->ept) {
		pr_warn_once("EPT CAP should not exist if not support "
				"1-setting enable EPT VM-execution control\n");

		if (error_on_inconsistent_vmcs_config)
			return -EIO;

		vmx_cap->ept = 0;
		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
	}
	/* Same consistency check for VPID capabilities. */
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
	    vmx_cap->vpid) {
		pr_warn_once("VPID CAP should not exist if not support "
				"1-setting enable VPID VM-execution control\n");

		if (error_on_inconsistent_vmcs_config)
			return -EIO;

		vmx_cap->vpid = 0;
	}

	/*
	 * Virtualizing MBEC requires advanced vmexit information in order to
	 * distinguish supervisor and user accesses. For simplicity and clarity
	 * disable MBEC entirely if advanced vmexit information is not available,
	 * this way mbec=1 in the kvm_intel module parameters implies availability
	 * to nested guests as well.
	 */
	if (!(vmx_cap->ept & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT))
		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;

	/* ENCLS exiting is kept only if CPUID reports SGX (cpu_has_sgx()). */
	if (!cpu_has_sgx())
		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;

	/* Tertiary controls have no required bits, only optional ones. */
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
		_cpu_based_3rd_exec_control =
			adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
					      MSR_IA32_VMX_PROCBASED_CTLS3);

	if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
				KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
				MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control))
		return -EIO;

	if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
				KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
				MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control))
		return -EIO;

	if (cpu_has_broken_vmx_preemption_timer())
		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	/* Posted interrupts are dropped without virtual interrupt delivery. */
	if (!(_cpu_based_2nd_exec_control &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

	if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
				KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
				MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control))
		return -EIO;

	/* May clear half-supported pairs from both control words in place. */
	if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
				       _vmentry_control, _vmexit_control))
		return -EIO;

	/*
	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
	 * can't be used due to an errata where VM Exit may incorrectly clear
	 * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_NEHALEM_EP:	/* AAK155 */
	case INTEL_NEHALEM:	/* AAP115 */
	case INTEL_WESTMERE:	/* AAT100 */
	case INTEL_WESTMERE_EP:	/* BC86,AAY89,BD102 */
	case INTEL_NEHALEM_EX:	/* BA97 */
		_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
		_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
		pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
			     "does not work properly. Using workaround\n");
		break;
	default:
		break;
	}

	rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
		return -EIO;

#ifdef CONFIG_X86_64
	/*
	 * KVM expects to be able to shove all legal physical addresses into
	 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
	 * 0 for processors that support Intel 64 architecture".
	 */
	if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EIO;
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses.
*/ 2900 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2901 return -EIO; 2902 2903 rdmsrq(MSR_IA32_VMX_MISC, misc_msr); 2904 2905 vmcs_conf->basic = basic_msr; 2906 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2907 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2908 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2909 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2910 vmcs_conf->vmexit_ctrl = _vmexit_control; 2911 vmcs_conf->vmentry_ctrl = _vmentry_control; 2912 vmcs_conf->misc = misc_msr; 2913 2914 #if IS_ENABLED(CONFIG_HYPERV) 2915 if (enlightened_vmcs) 2916 evmcs_sanitize_exec_ctrls(vmcs_conf); 2917 #endif 2918 2919 return 0; 2920 } 2921 2922 static bool __kvm_is_vmx_supported(void) 2923 { 2924 int cpu = smp_processor_id(); 2925 2926 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2927 pr_err("VMX not supported by CPU %d\n", cpu); 2928 return false; 2929 } 2930 2931 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL)) { 2932 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2933 return false; 2934 } 2935 2936 if (!this_cpu_has(X86_FEATURE_VMX)) { 2937 pr_err("VMX not fully enabled on CPU %d. 
Check kernel logs and/or BIOS\n", cpu); 2938 return false; 2939 } 2940 2941 return true; 2942 } 2943 2944 static bool kvm_is_vmx_supported(void) 2945 { 2946 bool supported; 2947 2948 migrate_disable(); 2949 supported = __kvm_is_vmx_supported(); 2950 migrate_enable(); 2951 2952 return supported; 2953 } 2954 2955 int vmx_check_processor_compat(void) 2956 { 2957 int cpu = raw_smp_processor_id(); 2958 struct vmcs_config vmcs_conf; 2959 struct vmx_capability vmx_cap; 2960 2961 if (!__kvm_is_vmx_supported()) 2962 return -EIO; 2963 2964 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2965 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2966 return -EIO; 2967 } 2968 if (nested) 2969 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2970 2971 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2972 u32 *gold = (void *)&vmcs_config; 2973 u32 *mine = (void *)&vmcs_conf; 2974 int i; 2975 2976 BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32)); 2977 2978 pr_err("VMCS config on CPU %d doesn't match reference config:", cpu); 2979 for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) { 2980 if (gold[i] == mine[i]) 2981 continue; 2982 2983 pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x", 2984 i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]); 2985 } 2986 pr_cont("\n"); 2987 return -EIO; 2988 } 2989 return 0; 2990 } 2991 2992 int vmx_enable_virtualization_cpu(void) 2993 { 2994 int cpu = raw_smp_processor_id(); 2995 2996 /* 2997 * This can happen if we hot-added a CPU but failed to allocate 2998 * VP assist page for it. 
2999 */ 3000 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 3001 return -EFAULT; 3002 3003 return x86_virt_get_ref(X86_FEATURE_VMX); 3004 } 3005 3006 static void vmclear_local_loaded_vmcss(void) 3007 { 3008 int cpu = raw_smp_processor_id(); 3009 struct loaded_vmcs *v, *n; 3010 3011 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 3012 loaded_vmcss_on_cpu_link) 3013 __loaded_vmcs_clear(v); 3014 } 3015 3016 void vmx_disable_virtualization_cpu(void) 3017 { 3018 vmclear_local_loaded_vmcss(); 3019 3020 x86_virt_put_ref(X86_FEATURE_VMX); 3021 3022 hv_reset_evmcs(); 3023 } 3024 3025 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 3026 { 3027 int node = cpu_to_node(cpu); 3028 struct page *pages; 3029 struct vmcs *vmcs; 3030 3031 pages = __alloc_pages_node(node, flags, 0); 3032 if (!pages) 3033 return NULL; 3034 vmcs = page_address(pages); 3035 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 3036 3037 /* KVM supports Enlightened VMCS v1 only */ 3038 if (kvm_is_using_evmcs()) 3039 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 3040 else 3041 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3042 3043 if (shadow) 3044 vmcs->hdr.shadow_vmcs = 1; 3045 return vmcs; 3046 } 3047 3048 void free_vmcs(struct vmcs *vmcs) 3049 { 3050 free_page((unsigned long)vmcs); 3051 } 3052 3053 /* 3054 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3055 */ 3056 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3057 { 3058 if (!loaded_vmcs->vmcs) 3059 return; 3060 loaded_vmcs_clear(loaded_vmcs); 3061 free_vmcs(loaded_vmcs->vmcs); 3062 loaded_vmcs->vmcs = NULL; 3063 if (loaded_vmcs->msr_bitmap) 3064 free_page((unsigned long)loaded_vmcs->msr_bitmap); 3065 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 3066 } 3067 3068 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3069 { 3070 loaded_vmcs->vmcs = alloc_vmcs(false); 3071 if (!loaded_vmcs->vmcs) 3072 return -ENOMEM; 3073 3074 
vmcs_clear(loaded_vmcs->vmcs); 3075 3076 loaded_vmcs->shadow_vmcs = NULL; 3077 loaded_vmcs->hv_timer_soft_disabled = false; 3078 loaded_vmcs->cpu = -1; 3079 loaded_vmcs->launched = 0; 3080 3081 if (cpu_has_vmx_msr_bitmap()) { 3082 loaded_vmcs->msr_bitmap = (unsigned long *) 3083 __get_free_page(GFP_KERNEL_ACCOUNT); 3084 if (!loaded_vmcs->msr_bitmap) 3085 goto out_vmcs; 3086 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 3087 } 3088 3089 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 3090 memset(&loaded_vmcs->controls_shadow, 0, 3091 sizeof(struct vmcs_controls_shadow)); 3092 3093 return 0; 3094 3095 out_vmcs: 3096 free_loaded_vmcs(loaded_vmcs); 3097 return -ENOMEM; 3098 } 3099 3100 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3101 struct kvm_segment *save) 3102 { 3103 if (!emulate_invalid_guest_state) { 3104 /* 3105 * CS and SS RPL should be equal during guest entry according 3106 * to VMX spec, but in reality it is not always so. Since vcpu 3107 * is in the middle of the transition from real mode to 3108 * protected mode it is safe to assume that RPL 0 is a good 3109 * default value. 3110 */ 3111 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3112 save->selector &= ~SEGMENT_RPL_MASK; 3113 save->dpl = save->selector & SEGMENT_RPL_MASK; 3114 save->s = 1; 3115 } 3116 __vmx_set_segment(vcpu, save, seg); 3117 } 3118 3119 static void enter_pmode(struct kvm_vcpu *vcpu) 3120 { 3121 unsigned long flags; 3122 struct vcpu_vmx *vmx = to_vmx(vcpu); 3123 3124 /* 3125 * Update real mode segment cache. It may be not up-to-date if segment 3126 * register was written while vcpu was in a guest mode. 
3127 */ 3128 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3129 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3130 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3131 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3132 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3133 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3134 3135 vmx->rmode.vm86_active = 0; 3136 3137 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3138 3139 flags = vmcs_readl(GUEST_RFLAGS); 3140 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3141 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3142 vmcs_writel(GUEST_RFLAGS, flags); 3143 3144 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3145 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3146 3147 vmx_update_exception_bitmap(vcpu); 3148 3149 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3150 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3151 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3152 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3153 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3154 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3155 } 3156 3157 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3158 { 3159 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3160 struct kvm_segment var = *save; 3161 3162 var.dpl = 0x3; 3163 if (seg == VCPU_SREG_CS) 3164 var.type = 0x3; 3165 3166 if (!emulate_invalid_guest_state) { 3167 var.selector = var.base >> 4; 3168 var.base = var.base & 0xffff0; 3169 var.limit = 0xffff; 3170 var.g = 0; 3171 var.db = 0; 3172 var.present = 1; 3173 var.s = 1; 3174 var.l = 0; 3175 var.unusable = 0; 3176 var.type = 0x3; 3177 var.avl = 0; 3178 if (save->base & 0xf) 3179 pr_warn_once("segment 
base is not paragraph aligned " 3180 "when entering protected mode (seg=%d)", seg); 3181 } 3182 3183 vmcs_write16(sf->selector, var.selector); 3184 vmcs_writel(sf->base, var.base); 3185 vmcs_write32(sf->limit, var.limit); 3186 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3187 } 3188 3189 static void enter_rmode(struct kvm_vcpu *vcpu) 3190 { 3191 unsigned long flags; 3192 struct vcpu_vmx *vmx = to_vmx(vcpu); 3193 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3194 3195 /* 3196 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3197 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3198 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3199 * should VM-Fail and KVM should reject userspace attempts to stuff 3200 * CR0.PG=0 when L2 is active. 3201 */ 3202 WARN_ON_ONCE(is_guest_mode(vcpu)); 3203 3204 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3205 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3206 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3207 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3208 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3209 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3210 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3211 3212 vmx->rmode.vm86_active = 1; 3213 3214 vmx_segment_cache_clear(vmx); 3215 3216 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3217 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3218 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3219 3220 flags = vmcs_readl(GUEST_RFLAGS); 3221 vmx->rmode.save_rflags = flags; 3222 3223 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3224 3225 vmcs_writel(GUEST_RFLAGS, flags); 3226 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3227 vmx_update_exception_bitmap(vcpu); 3228 3229 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3230 
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
}

/*
 * Record a new guest EFER value and bring the VM-entry controls in line.
 *
 * On 64-bit builds the "IA-32e mode guest" entry control mirrors EFER.LMA.
 * On 32-bit builds EFER.LMA must never be set; if it is, the VM is flagged
 * via KVM_BUG_ON() and 1 is returned.  Returns 0 otherwise, including when
 * there is no user-return MSR slot for EFER (nothing is updated then).
 */
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Nothing to do if hardware doesn't support EFER. */
	if (!vmx_find_uret_msr(vmx, MSR_EFER))
		return 0;

	vcpu->arch.efer = efer;
#ifdef CONFIG_X86_64
	if (efer & EFER_LMA)
		vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
	else
		vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
#else
	if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
		return 1;
#endif

	vmx_setup_uret_msrs(vmx);
	return 0;
}

#ifdef CONFIG_X86_64

/*
 * Activate long mode: make sure TR has the "busy 64-bit TSS" type (fixing
 * up the access rights if necessary) and then set EFER.LMA.
 */
static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	vmx_segment_cache_clear(to_vmx(vcpu));

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
				     __func__);
		/* Replace only the type field, keep the other AR bits. */
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
			     | VMX_AR_TYPE_BUSY_64_TSS);
	}
	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}

/* Deactivate long mode by clearing EFER.LMA. */
static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}

#endif

/*
 * Flush all TLB entries that may belong to this vCPU: a global INVEPT when
 * EPT is enabled, otherwise an INVVPID (global if supported, else one per
 * VPID used by the vCPU) when VPID is enabled.
 */
void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
	 * the CPU is not required to invalidate guest-physical mappings on
	 * VM-Entry, even if VPID is disabled.
Guest-physical mappings are 3294 * associated with the root EPT structure and not any particular VPID 3295 * (INVVPID also isn't required to invalidate guest-physical mappings). 3296 */ 3297 if (enable_ept) { 3298 ept_sync_global(); 3299 } else if (enable_vpid) { 3300 if (cpu_has_vmx_invvpid_global()) { 3301 vpid_sync_vcpu_global(); 3302 } else { 3303 vpid_sync_vcpu_single(vmx->vpid); 3304 vpid_sync_vcpu_single(vmx->nested.vpid02); 3305 } 3306 } 3307 } 3308 3309 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3310 { 3311 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3312 return nested_get_vpid02(vcpu); 3313 return to_vmx(vcpu)->vpid; 3314 } 3315 3316 static u64 construct_eptp(hpa_t root_hpa) 3317 { 3318 u64 eptp = root_hpa | VMX_EPTP_MT_WB; 3319 struct kvm_mmu_page *root; 3320 3321 if (kvm_mmu_is_dummy_root(root_hpa)) 3322 return eptp | VMX_EPTP_PWL_4; 3323 3324 /* 3325 * EPT roots should always have an associated MMU page. Return a "bad" 3326 * EPTP to induce VM-Fail instead of continuing on in a unknown state. 3327 */ 3328 root = root_to_sp(root_hpa); 3329 if (WARN_ON_ONCE(!root)) 3330 return INVALID_PAGE; 3331 3332 eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3333 3334 if (enable_ept_ad_bits && !root->role.ad_disabled) 3335 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3336 3337 return eptp; 3338 } 3339 3340 static void vmx_flush_tlb_ept_root(hpa_t root_hpa) 3341 { 3342 u64 eptp = construct_eptp(root_hpa); 3343 3344 if (VALID_PAGE(eptp)) 3345 ept_sync_context(eptp); 3346 else 3347 ept_sync_global(); 3348 } 3349 3350 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3351 { 3352 struct kvm_mmu *mmu = vcpu->arch.mmu; 3353 u64 root_hpa = mmu->root.hpa; 3354 3355 /* No flush required if the current context is invalid. 
*/
	if (!VALID_PAGE(root_hpa))
		return;

	/* With EPT flush by root, otherwise flush by the current VPID. */
	if (enable_ept)
		vmx_flush_tlb_ept_root(root_hpa);
	else
		vpid_sync_context(vmx_get_current_vpid(vcpu));
}

/* Flush the translation for guest virtual address @addr in the current VPID. */
void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
	/*
	 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
	 */
	vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}

/* Flush all translations tagged with the vCPU's current VPID. */
void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
	/*
	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
	 * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
	 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
	 * i.e. no explicit INVVPID is necessary.
	 */
	vpid_sync_context(vmx_get_current_vpid(vcpu));
}

/*
 * Copy the four cached PDPTEs into the VMCS.  Nothing is written unless the
 * PDPTR register is marked dirty and the guest uses PAE paging.
 */
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	if (!kvm_register_is_dirty(vcpu, VCPU_REG_PDPTR))
		return;

	if (is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
	}
}

/*
 * Read the four PDPTEs from the VMCS into the vCPU's cache and mark the
 * PDPTR register available.  Calling this without PAE paging is a bug and
 * only triggers a one-time warning.
 */
void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
		return;

	vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
	vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
	vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
	vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);

	kvm_register_mark_available(vcpu, VCPU_REG_PDPTR);
}

/* Both CR3 intercepts (load and store) of the primary execution controls. */
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
			  CPU_BASED_CR3_STORE_EXITING)

/*
 * Extra CR0 validity checks needed by nested VMX: guest-mode rules while L2
 * is active, host-mode rules after VMXON, no restriction otherwise.
 */
bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (is_guest_mode(vcpu))
		return
nested_guest_cr0_valid(vcpu, cr0);

	if (to_vmx(vcpu)->nested.vmxon)
		return nested_host_cr0_valid(vcpu, cr0);

	return true;
}

/*
 * Install a new guest CR0.
 *
 * @cr0 is what the guest sees (CR0_READ_SHADOW and vcpu->arch.cr0); the
 * value actually loaded into GUEST_CR0 additionally has the bits KVM needs
 * forced on/off.  Side effects, in order: real/protected mode emulation
 * switches when unrestricted guest is disabled, long mode entry/exit when
 * CR0.PG toggles with EFER.LME set, CR3 intercept and CR4 updates for
 * EPT without unrestricted guest, and finally a re-evaluation of whether
 * the guest state requires emulation.
 */
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr0, old_cr0_pg;
	u32 tmp;

	/* Sample the old CR0.PG before vcpu->arch.cr0 is overwritten below. */
	old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);

	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
	if (enable_unrestricted_guest)
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
	else {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
		if (!enable_ept)
			hw_cr0 |= X86_CR0_WP;

		/* CR0.PE toggled relative to the current vm86 emulation state. */
		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
			enter_pmode(vcpu);

		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
			enter_rmode(vcpu);
	}

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, hw_cr0);
	vcpu->arch.cr0 = cr0;
	kvm_register_mark_available(vcpu, VCPU_REG_CR0);

#ifdef CONFIG_X86_64
	/* With EFER.LME set, toggling CR0.PG enters or leaves long mode. */
	if (vcpu->arch.efer & EFER_LME) {
		if (!old_cr0_pg && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	if (enable_ept && !enable_unrestricted_guest) {
		/*
		 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
		 * KVM's CR3 is installed.
		 */
		if (!kvm_register_is_available(vcpu, VCPU_REG_CR3))
			vmx_cache_reg(vcpu, VCPU_REG_CR3);

		/*
		 * When running with EPT but not unrestricted guest, KVM must
		 * intercept CR3 accesses when paging is _disabled_.  This is
		 * necessary because restricted guests can't actually run with
		 * paging disabled, and so KVM stuffs its own CR3 in order to
		 * run the guest with identity mapped page tables.
		 *
		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
		 * update, it may be stale with respect to CR3 interception,
		 * e.g. after nested VM-Enter.
		 *
		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
		 * stores to forward them to L1, even if KVM does not need to
		 * intercept them to preserve its identity mapped page tables.
		 */
		if (!(cr0 & X86_CR0_PG)) {
			exec_controls_setbit(vmx, CR3_EXITING_BITS);
		} else if (!is_guest_mode(vcpu)) {
			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
		} else {
			/* L2 is active: take the CR3 intercepts from vmcs12. */
			tmp = exec_controls_get(vmx);
			tmp &= ~CR3_EXITING_BITS;
			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
			exec_controls_set(vmx, tmp);
		}

		/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
		if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

		/*
		 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
		 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
		 */
		if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
			kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
	}

	/* depends on vcpu->arch.cr0 to be set to a new value */
	vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}

/* Deepest EPT paging structure supported by the CPU: 5 levels if available, else 4. */
static int vmx_get_max_ept_level(void)
{
	if (cpu_has_vmx_ept_5levels())
		return 5;
	return 4;
}

/*
 * Point the hardware at a new MMU root.
 *
 * With EPT the root goes into EPT_POINTER and GUEST_CR3 holds a guest
 * value: the identity map address when a restricted guest runs without
 * paging, the cached guest CR3 if it is dirty, or it is left alone if the
 * VMCS copy is already current.  Without EPT, GUEST_CR3 is the root itself
 * combined with the active PCID and LAM bits.
 */
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;

	if (enable_ept) {
		/* @root_level must agree with the root's shadow page, if any. */
		KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
				root_level != root_to_sp(root_hpa)->role.level);
		vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));

		hv_track_root_tdp(vcpu, root_hpa);

		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (kvm_register_is_dirty(vcpu, VCPU_REG_CR3))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs.GUEST_CR3 is already up-to-date. */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);
	} else {
		guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
			    kvm_get_active_cr3_lam_bits(vcpu);
	}

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}

/*
 * VMX specific CR4 validity checks: CR4.VMXE is rejected while in SMM, and
 * after VMXON the value must satisfy the nested CR4 constraints.
 */
bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	/*
	 * We operate under the default treatment of SMM, so VMX cannot be
	 * enabled under SMM.  Note, whether or not VMXE is allowed at all,
	 * i.e. is a reserved bit, is handled by common x86 code.
*/
	if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
		return false;

	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
		return false;

	return true;
}

/*
 * Install a new guest CR4.
 *
 * @cr4 is what the guest sees (CR4_READ_SHADOW and vcpu->arch.cr4).  The
 * value loaded into GUEST_CR4 is derived from it: CR4.MCE follows the host,
 * mode-dependent always-on bits are added, UMIP is stripped when it is
 * emulated through descriptor-table exiting, and paging related bits are
 * adjusted for restricted guests.  CPUID dynamic bits are marked dirty when
 * OSXSAVE or PKE changes.
 */
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr4;

	/*
	 * Pass through host's Machine Check Enable value to hw_cr4, which
	 * is in force while we are in guest mode.  Do not let guests control
	 * this bit, even if host CR4.MCE == 0.
	 */
	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
	if (enable_unrestricted_guest)
		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
	else if (vmx->rmode.vm86_active)
		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
	else
		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

	if (vmx_umip_emulated()) {
		if (cr4 & X86_CR4_UMIP) {
			/* Emulated UMIP: intercept instead of setting the bit. */
			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
			hw_cr4 &= ~X86_CR4_UMIP;
		} else if (!is_guest_mode(vcpu) ||
			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
			/* Keep the intercept if L1 asked for it in vmcs12. */
			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
		}
	}

	vcpu->arch.cr4 = cr4;
	kvm_register_mark_available(vcpu, VCPU_REG_CR4);

	if (!enable_unrestricted_guest) {
		if (enable_ept) {
			if (!is_paging(vcpu)) {
				hw_cr4 &= ~X86_CR4_PAE;
				hw_cr4 |= X86_CR4_PSE;
			} else if (!(cr4 & X86_CR4_PAE)) {
				hw_cr4 &= ~X86_CR4_PAE;
			}
		}

		/*
		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
		 * to be manually disabled when guest switches to non-paging
		 * mode.
		 *
		 * If !enable_unrestricted_guest, the CPU is always running
		 * with CR0.PG=1 and CR4 needs to be modified.
		 * If enable_unrestricted_guest, the CPU automatically
		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
		 */
		if (!is_paging(vcpu))
			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}

	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);

	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		vcpu->arch.cpuid_dynamic_bits_dirty = true;
}

/*
 * Read segment register @seg into @var.
 *
 * While vm86 emulation is active (and @seg is not LDTR) the cached
 * real-mode copy is returned; for segments other than TR, base and selector
 * are refreshed from the VMCS if the selector no longer matches the cache.
 * Otherwise all fields are decoded from the VMCS values.
 */
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 ar;

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		*var = vmx->rmode.segs[seg];
		if (seg == VCPU_SREG_TR
		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
			return;
		var->base = vmx_read_guest_seg_base(vmx, seg);
		var->selector = vmx_read_guest_seg_selector(vmx, seg);
		return;
	}
	var->base = vmx_read_guest_seg_base(vmx, seg);
	var->limit = vmx_read_guest_seg_limit(vmx, seg);
	var->selector = vmx_read_guest_seg_selector(vmx, seg);
	/* Unpack the access rights: bit 16 unusable, 0-3 type, 4 S, 5-6 DPL. */
	ar = vmx_read_guest_seg_ar(vmx, seg);
	var->unusable = (ar >> 16) & 1;
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
	/*
	 * Some userspaces do not preserve unusable property. Since usable
	 * segment has to be present according to VMX spec we can use present
	 * property to amend userspace bug by making unusable segment always
	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
	 * segment as unusable.
3659 */ 3660 var->present = !var->unusable; 3661 var->avl = (ar >> 12) & 1; 3662 var->l = (ar >> 13) & 1; 3663 var->db = (ar >> 14) & 1; 3664 var->g = (ar >> 15) & 1; 3665 } 3666 3667 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3668 { 3669 struct kvm_segment s; 3670 3671 if (to_vmx(vcpu)->rmode.vm86_active) { 3672 vmx_get_segment(vcpu, &s, seg); 3673 return s.base; 3674 } 3675 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3676 } 3677 3678 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3679 { 3680 struct vcpu_vmx *vmx = to_vmx(vcpu); 3681 int ar; 3682 3683 if (unlikely(vmx->rmode.vm86_active)) 3684 return 0; 3685 3686 if (no_cache) 3687 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3688 else 3689 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3690 return VMX_AR_DPL(ar); 3691 } 3692 3693 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3694 { 3695 return __vmx_get_cpl(vcpu, false); 3696 } 3697 3698 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3699 { 3700 return __vmx_get_cpl(vcpu, true); 3701 } 3702 3703 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3704 { 3705 u32 ar; 3706 3707 ar = var->type & 15; 3708 ar |= (var->s & 1) << 4; 3709 ar |= (var->dpl & 3) << 5; 3710 ar |= (var->present & 1) << 7; 3711 ar |= (var->avl & 1) << 12; 3712 ar |= (var->l & 1) << 13; 3713 ar |= (var->db & 1) << 14; 3714 ar |= (var->g & 1) << 15; 3715 ar |= (var->unusable || !var->present) << 16; 3716 3717 return ar; 3718 } 3719 3720 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3721 { 3722 struct vcpu_vmx *vmx = to_vmx(vcpu); 3723 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3724 3725 vmx_segment_cache_clear(vmx); 3726 3727 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3728 vmx->rmode.segs[seg] = *var; 3729 if (seg == VCPU_SREG_TR) 3730 vmcs_write16(sf->selector, var->selector); 3731 else if (var->s) 3732 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3733 return; 3734 } 3735 3736 
vmcs_writel(sf->base, var->base);
	vmcs_write32(sf->limit, var->limit);
	vmcs_write16(sf->selector, var->selector);

	/*
	 *   Fix the "Accessed" bit in AR field of segment registers for older
	 * qemu binaries.
	 *   IA32 arch specifies that at the time of processor reset the
	 * "Accessed" bit in the AR field of segment registers is 1. And qemu
	 * is setting it to 0 in the userland code. This causes invalid guest
	 * state vmexit when "unrestricted guest" mode is turned on.
	 *    Fix for this setup issue in cpu_reset is being pushed in the qemu
	 * tree. Newer qemu binaries with that qemu fix would not need this
	 * kvm hack.
	 */
	if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
		var->type |= 0x1; /* Accessed */

	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}

/*
 * Set segment register @seg from @var, then re-evaluate whether the
 * resulting guest state requires emulation.
 */
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	__vmx_set_segment(vcpu, var, seg);

	to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}

/* Report the D/B (bit 14) and L (bit 13) flags of the CS access rights. */
void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);

	*db = (ar >> 14) & 1;
	*l = (ar >> 13) & 1;
}

/* Read the guest IDTR (limit and base) from the VMCS into @dt. */
void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
	dt->address = vmcs_readl(GUEST_IDTR_BASE);
}

/* Write the guest IDTR (limit and base) from @dt into the VMCS. */
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
	vmcs_writel(GUEST_IDTR_BASE, dt->address);
}

/* Read the guest GDTR (limit and base) from the VMCS into @dt. */
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
	dt->address = vmcs_readl(GUEST_GDTR_BASE);
}

/* Write the guest GDTR (limit and base) from @dt into the VMCS. */
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
	vmcs_writel(GUEST_GDTR_BASE,
dt->address); 3794 } 3795 3796 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3797 { 3798 struct kvm_segment var; 3799 u32 ar; 3800 3801 vmx_get_segment(vcpu, &var, seg); 3802 var.dpl = 0x3; 3803 if (seg == VCPU_SREG_CS) 3804 var.type = 0x3; 3805 ar = vmx_segment_access_rights(&var); 3806 3807 if (var.base != (var.selector << 4)) 3808 return false; 3809 if (var.limit != 0xffff) 3810 return false; 3811 if (ar != 0xf3) 3812 return false; 3813 3814 return true; 3815 } 3816 3817 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3818 { 3819 struct kvm_segment cs; 3820 unsigned int cs_rpl; 3821 3822 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3823 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3824 3825 if (cs.unusable) 3826 return false; 3827 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3828 return false; 3829 if (!cs.s) 3830 return false; 3831 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3832 if (cs.dpl > cs_rpl) 3833 return false; 3834 } else { 3835 if (cs.dpl != cs_rpl) 3836 return false; 3837 } 3838 if (!cs.present) 3839 return false; 3840 3841 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3842 return true; 3843 } 3844 3845 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3846 { 3847 struct kvm_segment ss; 3848 unsigned int ss_rpl; 3849 3850 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3851 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3852 3853 if (ss.unusable) 3854 return true; 3855 if (ss.type != 3 && ss.type != 7) 3856 return false; 3857 if (!ss.s) 3858 return false; 3859 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3860 return false; 3861 if (!ss.present) 3862 return false; 3863 3864 return true; 3865 } 3866 3867 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3868 { 3869 struct kvm_segment var; 3870 unsigned int rpl; 3871 3872 vmx_get_segment(vcpu, &var, seg); 3873 rpl = var.selector & SEGMENT_RPL_MASK; 3874 3875 if (var.unusable) 3876 return true; 3877 if (!var.s) 
3878 return false; 3879 if (!var.present) 3880 return false; 3881 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3882 if (var.dpl < rpl) /* DPL < RPL */ 3883 return false; 3884 } 3885 3886 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3887 * rights flags 3888 */ 3889 return true; 3890 } 3891 3892 static bool tr_valid(struct kvm_vcpu *vcpu) 3893 { 3894 struct kvm_segment tr; 3895 3896 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3897 3898 if (tr.unusable) 3899 return false; 3900 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3901 return false; 3902 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3903 return false; 3904 if (!tr.present) 3905 return false; 3906 3907 return true; 3908 } 3909 3910 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3911 { 3912 struct kvm_segment ldtr; 3913 3914 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3915 3916 if (ldtr.unusable) 3917 return true; 3918 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3919 return false; 3920 if (ldtr.type != 2) 3921 return false; 3922 if (!ldtr.present) 3923 return false; 3924 3925 return true; 3926 } 3927 3928 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3929 { 3930 struct kvm_segment cs, ss; 3931 3932 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3933 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3934 3935 return ((cs.selector & SEGMENT_RPL_MASK) == 3936 (ss.selector & SEGMENT_RPL_MASK)); 3937 } 3938 3939 /* 3940 * Check if guest state is valid. Returns true if valid, false if 3941 * not. 
3942 * We assume that registers are always usable 3943 */ 3944 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3945 { 3946 /* real mode guest state checks */ 3947 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3948 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3949 return false; 3950 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3951 return false; 3952 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3953 return false; 3954 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3955 return false; 3956 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3957 return false; 3958 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3959 return false; 3960 } else { 3961 /* protected mode guest state checks */ 3962 if (!cs_ss_rpl_check(vcpu)) 3963 return false; 3964 if (!code_segment_valid(vcpu)) 3965 return false; 3966 if (!stack_segment_valid(vcpu)) 3967 return false; 3968 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3969 return false; 3970 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3971 return false; 3972 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3973 return false; 3974 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3975 return false; 3976 if (!tr_valid(vcpu)) 3977 return false; 3978 if (!ldtr_valid(vcpu)) 3979 return false; 3980 } 3981 /* TODO: 3982 * - Add checks on RIP 3983 * - Add checks on RFLAGS 3984 */ 3985 3986 return true; 3987 } 3988 3989 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3990 { 3991 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3992 u16 data; 3993 int i; 3994 3995 for (i = 0; i < 3; i++) { 3996 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3997 return -EFAULT; 3998 } 3999 4000 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 4001 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 4002 return -EFAULT; 4003 4004 data = ~0; 4005 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 4006 return -EFAULT; 4007 4008 return 0; 4009 } 4010 4011 static int 
init_rmode_identity_map(struct kvm *kvm) 4012 { 4013 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4014 int i, r = 0; 4015 void __user *uaddr; 4016 u32 tmp; 4017 4018 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 4019 mutex_lock(&kvm->slots_lock); 4020 4021 if (likely(kvm_vmx->ept_identity_pagetable_done)) 4022 goto out; 4023 4024 if (!kvm_vmx->ept_identity_map_addr) 4025 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4026 4027 uaddr = __x86_set_memory_region(kvm, 4028 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 4029 kvm_vmx->ept_identity_map_addr, 4030 PAGE_SIZE); 4031 if (IS_ERR(uaddr)) { 4032 r = PTR_ERR(uaddr); 4033 goto out; 4034 } 4035 4036 /* Set up identity-mapping pagetable for EPT in real mode */ 4037 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 4038 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 4039 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 4040 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 4041 r = -EFAULT; 4042 goto out; 4043 } 4044 } 4045 kvm_vmx->ept_identity_pagetable_done = true; 4046 4047 out: 4048 mutex_unlock(&kvm->slots_lock); 4049 return r; 4050 } 4051 4052 static void seg_setup(int seg) 4053 { 4054 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 4055 unsigned int ar; 4056 4057 vmcs_write16(sf->selector, 0); 4058 vmcs_writel(sf->base, 0); 4059 vmcs_write32(sf->limit, 0xffff); 4060 ar = 0x93; 4061 if (seg == VCPU_SREG_CS) 4062 ar |= 0x08; /* code segment */ 4063 4064 vmcs_write32(sf->ar_bytes, ar); 4065 } 4066 4067 int allocate_vpid(void) 4068 { 4069 int vpid; 4070 4071 if (!enable_vpid) 4072 return 0; 4073 spin_lock(&vmx_vpid_lock); 4074 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4075 if (vpid < VMX_NR_VPIDS) 4076 __set_bit(vpid, vmx_vpid_bitmap); 4077 else 4078 vpid = 0; 4079 spin_unlock(&vmx_vpid_lock); 4080 return vpid; 4081 } 4082 4083 void free_vpid(int vpid) 4084 { 4085 if (!enable_vpid || vpid == 0) 4086 return; 4087 spin_lock(&vmx_vpid_lock); 
__clear_bit(vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
}

/*
 * Bookkeeping that must accompany every change to vmcs01's MSR bitmap: tell
 * Hyper-V (if its enlightened MSR bitmap is in use) that the bitmap is no
 * longer clean, and force the nested MSR bitmap to be recalculated.
 */
static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
	/*
	 * When KVM is a nested hypervisor on top of Hyper-V and uses
	 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
	 * bitmap has changed.
	 */
	if (kvm_is_using_evmcs()) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		if (evmcs->hv_enlightenments_control.msr_bitmap)
			evmcs->hv_clean_fields &=
				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
	}

	vmx->nested.force_msr_bitmap_recalc = true;
}

/*
 * Enable (@set == true) or disable interception of @msr in vmcs01's MSR
 * bitmap for the access types in @type (MSR_TYPE_R and/or MSR_TYPE_W).
 *
 * A request to disable interception is only honoured if the MSR filter
 * allows the access; otherwise the MSR stays intercepted.  Does nothing if
 * the CPU has no MSR bitmap support.
 */
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	vmx_msr_bitmap_l01_changed(vmx);

	if (type & MSR_TYPE_R) {
		if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			vmx_clear_msr_bitmap_read(msr_bitmap, msr);
		else
			vmx_set_msr_bitmap_read(msr_bitmap, msr);
	}

	if (type & MSR_TYPE_W) {
		if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			vmx_clear_msr_bitmap_write(msr_bitmap, msr);
		else
			vmx_set_msr_bitmap_write(msr_bitmap, msr);
	}
}

/*
 * Recompute the x2APIC part of vmcs01's MSR bitmap.
 *
 * The wanted mode (none, x2APIC, x2APIC + APICv) is derived from the
 * secondary execution controls and the APICv state; nothing is touched if
 * it equals the cached x2apic_msr_bitmap_mode.
 */
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
	/*
	 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
	 * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
	 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
	 */
	const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
	const int write_idx = read_idx + (0x800 / sizeof(u64));
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
	u8 mode;

	if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
		return;

	if (cpu_has_secondary_exec_ctrls() &&
	    (secondary_exec_controls_get(vmx) &
	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		mode = MSR_BITMAP_MODE_X2APIC;
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	} else {
		mode = 0;
	}

	/* Skip the rewrite if the bitmap already reflects this mode. */
	if (mode == vmx->x2apic_msr_bitmap_mode)
		return;

	vmx->x2apic_msr_bitmap_mode = mode;

	/*
	 * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
	 * registers (0x840 and above) intercepted, KVM doesn't support them.
	 * Intercept all writes by default and poke holes as needed.  Pass
	 * through reads for all valid registers by default in x2APIC+APICv
	 * mode, only the current timer count needs on-demand emulation by KVM.
	 */
	if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
		msr_bitmap[read_idx] = ~kvm_x2apic_disable_read_intercept_reg_mask(vcpu);
	else
		msr_bitmap[read_idx] = ~0ull;
	msr_bitmap[write_idx] = ~0ull;

	/*
	 * TPR reads and writes can be virtualized even if virtual interrupt
	 * delivery is not in use.
	 */
	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
				  !(mode & MSR_BITMAP_MODE_X2APIC));

	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
		if (enable_ipiv)
			vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
	}
}

/*
 * Update interception of the Intel PT (RTIT) MSRs: they are intercepted
 * while RTIT_CTL.TraceEn is clear in the guest's control value and passed
 * through (subject to the MSR filter) while it is set.
 */
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
	u32 i;

	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
	/* The address range MSRs come in A/B pairs, two MSR indices per range. */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
	}
}

static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
{
	u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
				    VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
	bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intercept = !has_mediated_pmu;
	int i;

	if (!enable_mediated_pmu)
		return;

	if (!cpu_has_save_perf_global_ctrl()) {
		vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;

		if (has_mediated_pmu)
			vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
		else
			vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
	}

	vm_entry_controls_changebit(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 4233 has_mediated_pmu); 4234 4235 vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); 4236 4237 for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 4238 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4239 MSR_TYPE_RW, intercept); 4240 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, 4241 intercept || !fw_writes_is_enabled(vcpu)); 4242 } 4243 for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { 4244 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4245 MSR_TYPE_RW, true); 4246 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, 4247 MSR_TYPE_RW, true); 4248 } 4249 4250 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 4251 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4252 MSR_TYPE_RW, intercept); 4253 for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) 4254 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4255 MSR_TYPE_RW, true); 4256 4257 intercept = kvm_need_perf_global_ctrl_intercept(vcpu); 4258 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, 4259 MSR_TYPE_RW, intercept); 4260 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4261 MSR_TYPE_RW, intercept); 4262 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 4263 MSR_TYPE_RW, intercept); 4264 } 4265 4266 static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4267 { 4268 bool intercept; 4269 4270 if (!cpu_has_vmx_msr_bitmap()) 4271 return; 4272 4273 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 4274 #ifdef CONFIG_X86_64 4275 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 4276 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 4277 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 4278 #endif 4279 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 4280 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 4281 vmx_disable_intercept_for_msr(vcpu, 
MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 4282 if (kvm_cstate_in_guest(vcpu->kvm)) { 4283 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 4284 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 4285 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 4286 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 4287 } 4288 if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 4289 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 4290 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 4291 } 4292 4293 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4294 if (vmx_pt_mode_is_host_guest()) 4295 pt_update_intercept_for_msr(vcpu); 4296 4297 if (vcpu->arch.xfd_no_write_intercept) 4298 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); 4299 4300 vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 4301 !to_vmx(vcpu)->spec_ctrl); 4302 4303 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 4304 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 4305 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 4306 4307 if (cpu_feature_enabled(X86_FEATURE_IBPB)) 4308 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 4309 !guest_has_pred_cmd_msr(vcpu)); 4310 4311 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 4312 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 4313 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4314 4315 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 4316 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4317 4318 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); 4319 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); 4320 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); 4321 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); 4322 } 4323 4324 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) 
{ 4325 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && 4326 !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4327 4328 vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); 4329 vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); 4330 } 4331 4332 vmx_recalc_pmu_msr_intercepts(vcpu); 4333 4334 /* 4335 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be 4336 * filtered by userspace. 4337 */ 4338 } 4339 4340 static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 4341 { 4342 exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING, 4343 kvm_need_rdpmc_intercept(vcpu)); 4344 } 4345 4346 void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) 4347 { 4348 vmx_recalc_instruction_intercepts(vcpu); 4349 vmx_recalc_msr_intercepts(vcpu); 4350 } 4351 4352 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4353 int vector) 4354 { 4355 struct vcpu_vmx *vmx = to_vmx(vcpu); 4356 4357 /* 4358 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4359 * and freed, and must not be accessed outside of vcpu->mutex. The 4360 * vCPU's cached PI NV is valid if and only if posted interrupts 4361 * enabled in its vmcs12, i.e. checking the vector also checks that 4362 * L1 has enabled posted interrupts for L2. 4363 */ 4364 if (is_guest_mode(vcpu) && 4365 vector == vmx->nested.posted_intr_nv) { 4366 /* 4367 * If a posted intr is not recognized by hardware, 4368 * we will accomplish it in the next vmentry. 4369 */ 4370 vmx->nested.pi_pending = true; 4371 kvm_make_request(KVM_REQ_EVENT, vcpu); 4372 4373 /* 4374 * This pairs with the smp_mb_*() after setting vcpu->mode in 4375 * vcpu_enter_guest() to guarantee the vCPU sees the event 4376 * request if triggering a posted interrupt "fails" because 4377 * vcpu->mode != IN_GUEST_MODE. 
The extra barrier is needed as 4378 * the smb_wmb() in kvm_make_request() only ensures everything 4379 * done before making the request is visible when the request 4380 * is visible, it doesn't ensure ordering between the store to 4381 * vcpu->requests and the load from vcpu->mode. 4382 */ 4383 smp_mb__after_atomic(); 4384 4385 /* the PIR and ON have been set by L1. */ 4386 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4387 return 0; 4388 } 4389 return -1; 4390 } 4391 /* 4392 * Send interrupt to vcpu via posted interrupt way. 4393 * 1. If target vcpu is running(non-root mode), send posted interrupt 4394 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4395 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4396 * interrupt from PIR in next vmentry. 4397 */ 4398 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4399 { 4400 struct vcpu_vt *vt = to_vt(vcpu); 4401 int r; 4402 4403 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4404 if (!r) 4405 return 0; 4406 4407 /* Note, this is called iff the local APIC is in-kernel. */ 4408 if (!vcpu->arch.apic->apicv_active) 4409 return -1; 4410 4411 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); 4412 return 0; 4413 } 4414 4415 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4416 int trig_mode, int vector) 4417 { 4418 struct kvm_vcpu *vcpu = apic->vcpu; 4419 4420 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4421 kvm_lapic_set_irr(vector, apic); 4422 kvm_make_request(KVM_REQ_EVENT, vcpu); 4423 kvm_vcpu_kick(vcpu); 4424 } else { 4425 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4426 trig_mode, vector); 4427 } 4428 } 4429 4430 /* 4431 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4432 * will not change in the lifetime of the guest. 4433 * Note that host-state that does change is set elsewhere. 
E.g., host-state 4434 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4435 */ 4436 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4437 { 4438 u32 low32, high32; 4439 unsigned long tmpl; 4440 unsigned long cr0, cr3, cr4; 4441 4442 cr0 = read_cr0(); 4443 WARN_ON(cr0 & X86_CR0_TS); 4444 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4445 4446 /* 4447 * Save the most likely value for this task's CR3 in the VMCS. 4448 * We can't use __get_current_cr3_fast() because we're not atomic. 4449 */ 4450 cr3 = __read_cr3(); 4451 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4452 vmx->loaded_vmcs->host_state.cr3 = cr3; 4453 4454 /* Save the most likely value for this task's CR4 in the VMCS. */ 4455 cr4 = cr4_read_shadow(); 4456 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4457 vmx->loaded_vmcs->host_state.cr4 = cr4; 4458 4459 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4460 #ifdef CONFIG_X86_64 4461 /* 4462 * Load null selectors, so we can avoid reloading them in 4463 * vmx_prepare_switch_to_host(), in case userspace uses 4464 * the null selectors too (the expected case). 4465 */ 4466 vmcs_write16(HOST_DS_SELECTOR, 0); 4467 vmcs_write16(HOST_ES_SELECTOR, 0); 4468 #else 4469 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4470 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4471 #endif 4472 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4473 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4474 4475 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4476 4477 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4478 4479 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4480 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4481 4482 /* 4483 * SYSENTER is used for 32-bit system calls on either 32-bit or 4484 * 64-bit kernels. 
It is always zero If neither is allowed, otherwise 4485 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4486 * have already done so!). 4487 */ 4488 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4489 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4490 4491 rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); 4492 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4493 4494 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4495 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4496 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4497 } 4498 4499 if (cpu_has_load_ia32_efer()) 4500 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4501 4502 /* 4503 * Supervisor shadow stack is not enabled on host side, i.e., 4504 * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM 4505 * description(RDSSP instruction), SSP is not readable in CPL0, 4506 * so resetting the two registers to 0s at VM-Exit does no harm 4507 * to kernel execution. When execution flow exits to userspace, 4508 * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter 4509 * 3 and 4 for details. 4510 */ 4511 if (enable_cet) { 4512 vmcs_writel(HOST_S_CET, kvm_host.s_cet); 4513 vmcs_writel(HOST_SSP, 0); 4514 vmcs_writel(HOST_INTR_SSP_TABLE, 0); 4515 } 4516 4517 /* 4518 * When running a guest with a mediated PMU, guest state is resident in 4519 * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host 4520 * activity doesn't bleed into the guest counters. When running with 4521 * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every 4522 * entry/exit to merge guest and host PMU usage. 
4523 */ 4524 if (enable_mediated_pmu) 4525 vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0); 4526 } 4527 4528 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4529 { 4530 struct kvm_vcpu *vcpu = &vmx->vcpu; 4531 4532 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4533 ~vcpu->arch.cr4_guest_rsvd_bits; 4534 if (!enable_ept) { 4535 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4536 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4537 } 4538 if (is_guest_mode(&vmx->vcpu)) 4539 vcpu->arch.cr4_guest_owned_bits &= 4540 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4541 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4542 } 4543 4544 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4545 { 4546 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4547 4548 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4549 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4550 4551 if (!enable_vnmi) 4552 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4553 4554 if (!enable_preemption_timer) 4555 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4556 4557 return pin_based_exec_ctrl; 4558 } 4559 4560 static u32 vmx_get_initial_vmentry_ctrl(void) 4561 { 4562 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4563 4564 if (vmx_pt_mode_is_system()) 4565 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4566 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4567 4568 if (!enable_cet) 4569 vmentry_ctrl &= ~VM_ENTRY_LOAD_CET_STATE; 4570 4571 /* 4572 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 
4573 */ 4574 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4575 VM_ENTRY_LOAD_IA32_EFER | 4576 VM_ENTRY_IA32E_MODE); 4577 4578 return vmentry_ctrl; 4579 } 4580 4581 static u32 vmx_get_initial_vmexit_ctrl(void) 4582 { 4583 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4584 4585 if (!enable_cet) 4586 vmexit_ctrl &= ~VM_EXIT_LOAD_CET_STATE; 4587 4588 /* 4589 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4590 * nested virtualization and thus allowed to be set in vmcs12. 4591 */ 4592 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4593 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4594 4595 if (vmx_pt_mode_is_system()) 4596 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4597 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4598 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4599 return vmexit_ctrl & 4600 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER | 4601 VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL); 4602 } 4603 4604 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4605 { 4606 struct vcpu_vmx *vmx = to_vmx(vcpu); 4607 4608 guard(vmx_vmcs01)(vcpu); 4609 4610 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4611 4612 secondary_exec_controls_changebit(vmx, 4613 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4614 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, 4615 kvm_vcpu_apicv_active(vcpu)); 4616 if (enable_ipiv) 4617 tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, 4618 kvm_vcpu_apicv_active(vcpu)); 4619 4620 vmx_update_msr_bitmap_x2apic(vcpu); 4621 } 4622 4623 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4624 { 4625 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4626 4627 /* 4628 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4629 * vmcs12 and propagated to vmcs02 when set in vmcs12. 
4630 */ 4631 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4632 CPU_BASED_USE_IO_BITMAPS | 4633 CPU_BASED_MONITOR_TRAP_FLAG | 4634 CPU_BASED_PAUSE_EXITING); 4635 4636 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4637 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4638 CPU_BASED_NMI_WINDOW_EXITING); 4639 4640 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4641 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4642 4643 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4644 exec_control &= ~CPU_BASED_TPR_SHADOW; 4645 4646 #ifdef CONFIG_X86_64 4647 if (exec_control & CPU_BASED_TPR_SHADOW) 4648 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4649 CPU_BASED_CR8_STORE_EXITING); 4650 else 4651 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4652 CPU_BASED_CR8_LOAD_EXITING; 4653 #endif 4654 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4655 if (enable_ept) 4656 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4657 CPU_BASED_CR3_STORE_EXITING | 4658 CPU_BASED_INVLPG_EXITING); 4659 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4660 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4661 CPU_BASED_MONITOR_EXITING); 4662 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4663 exec_control &= ~CPU_BASED_HLT_EXITING; 4664 return exec_control; 4665 } 4666 4667 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4668 { 4669 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4670 4671 /* 4672 * IPI virtualization relies on APICv. Disable IPI virtualization if 4673 * APICv is inhibited. 4674 */ 4675 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4676 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4677 4678 return exec_control; 4679 } 4680 4681 /* 4682 * Adjust a single secondary execution control bit to intercept/allow an 4683 * instruction in the guest. This is usually done based on whether or not a 4684 * feature has been exposed to the guest in order to correctly emulate faults. 
4685 */ 4686 static inline void 4687 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4688 u32 control, bool enabled, bool exiting) 4689 { 4690 /* 4691 * If the control is for an opt-in feature, clear the control if the 4692 * feature is not exposed to the guest, i.e. not enabled. If the 4693 * control is opt-out, i.e. an exiting control, clear the control if 4694 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4695 * disabled for the associated instruction. Note, the caller is 4696 * responsible presetting exec_control to set all supported bits. 4697 */ 4698 if (enabled == exiting) 4699 *exec_control &= ~control; 4700 4701 /* 4702 * Update the nested MSR settings so that a nested VMM can/can't set 4703 * controls for features that are/aren't exposed to the guest. 4704 */ 4705 if (nested && 4706 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4707 /* 4708 * All features that can be added or removed to VMX MSRs must 4709 * be supported in the first place for nested virtualization. 4710 */ 4711 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4712 enabled = false; 4713 4714 if (enabled) 4715 vmx->nested.msrs.secondary_ctls_high |= control; 4716 else 4717 vmx->nested.msrs.secondary_ctls_high &= ~control; 4718 } 4719 } 4720 4721 /* 4722 * Wrapper macro for the common case of adjusting a secondary execution control 4723 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4724 * verifies that the control is actually supported by KVM and hardware. 
4725 */ 4726 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4727 ({ \ 4728 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4729 bool __enabled; \ 4730 \ 4731 if (cpu_has_vmx_##name()) { \ 4732 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ 4733 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4734 __enabled, exiting); \ 4735 } \ 4736 }) 4737 4738 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4739 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4740 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4741 4742 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4743 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4744 4745 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4746 { 4747 struct kvm_vcpu *vcpu = &vmx->vcpu; 4748 4749 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4750 4751 if (vmx_pt_mode_is_system()) 4752 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4753 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4754 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4755 if (vmx->vpid == 0) 4756 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4757 if (!enable_ept) { 4758 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4759 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4760 enable_unrestricted_guest = 0; 4761 } 4762 if (!enable_unrestricted_guest) 4763 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4764 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4765 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4766 if (!kvm_vcpu_apicv_active(vcpu)) 4767 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4768 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4769 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4770 4771 /* 4772 * KVM doesn't support VMFUNC for L1, but the control is set in 
KVM's 4773 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4774 */ 4775 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4776 4777 if (!enable_mbec) 4778 exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC; 4779 4780 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4781 * in vmx_set_cr4. */ 4782 exec_control &= ~SECONDARY_EXEC_DESC; 4783 4784 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4785 (handle_vmptrld). 4786 We can NOT enable shadow_vmcs here because we don't have yet 4787 a current VMCS12 4788 */ 4789 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4790 4791 /* 4792 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4793 * it needs to be set here when dirty logging is already active, e.g. 4794 * if this vCPU was created after dirty logging was enabled. 4795 */ 4796 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4797 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4798 4799 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4800 4801 /* 4802 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4803 * feature is exposed to the guest. This creates a virtualization hole 4804 * if both are supported in hardware but only one is exposed to the 4805 * guest, but letting the guest execute RDTSCP or RDPID when either one 4806 * is advertised is preferable to emulating the advertised instruction 4807 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 
4808 */ 4809 if (cpu_has_vmx_rdtscp()) { 4810 bool rdpid_or_rdtscp_enabled = 4811 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4812 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4813 4814 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4815 SECONDARY_EXEC_ENABLE_RDTSCP, 4816 rdpid_or_rdtscp_enabled, false); 4817 } 4818 4819 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4820 4821 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4822 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4823 4824 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4825 ENABLE_USR_WAIT_PAUSE, false); 4826 4827 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4828 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4829 4830 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4831 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4832 4833 return exec_control; 4834 } 4835 4836 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4837 { 4838 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4839 } 4840 4841 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4842 { 4843 struct page *pages; 4844 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4845 4846 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4847 return 0; 4848 4849 if (kvm_vmx->pid_table) 4850 return 0; 4851 4852 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4853 vmx_get_pid_table_order(kvm)); 4854 if (!pages) 4855 return -ENOMEM; 4856 4857 kvm_vmx->pid_table = (void *)page_address(pages); 4858 return 0; 4859 } 4860 4861 int vmx_vcpu_precreate(struct kvm *kvm) 4862 { 4863 return vmx_alloc_ipiv_pid_table(kvm); 4864 } 4865 4866 #define VMX_XSS_EXIT_BITMAP 0 4867 4868 static void init_vmcs(struct vcpu_vmx *vmx) 4869 { 4870 struct kvm *kvm = vmx->vcpu.kvm; 4871 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4872 4873 if (nested) 4874 nested_vmx_set_vmcs_shadowing_bitmap(); 4875 4876 if (cpu_has_vmx_msr_bitmap()) 4877 vmcs_write64(MSR_BITMAP, 
__pa(vmx->vmcs01.msr_bitmap)); 4878 4879 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4880 4881 /* Control */ 4882 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4883 4884 exec_controls_set(vmx, vmx_exec_control(vmx)); 4885 4886 if (cpu_has_secondary_exec_ctrls()) { 4887 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4888 if (vmx->ve_info) 4889 vmcs_write64(VE_INFORMATION_ADDRESS, 4890 __pa(vmx->ve_info)); 4891 } 4892 4893 if (cpu_has_tertiary_exec_ctrls()) 4894 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4895 4896 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4897 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4898 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4899 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4900 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4901 4902 vmcs_write16(GUEST_INTR_STATUS, 0); 4903 4904 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4905 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); 4906 } 4907 4908 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4909 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4910 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4911 } 4912 4913 if (!kvm_pause_in_guest(kvm)) { 4914 vmcs_write32(PLE_GAP, ple_gap); 4915 vmx->ple_window = ple_window; 4916 vmx->ple_window_dirty = true; 4917 } 4918 4919 if (kvm_notify_vmexit_enabled(kvm)) 4920 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4921 4922 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4923 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4924 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4925 4926 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4927 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4928 vmx_set_constant_host_state(vmx); 4929 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4930 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4931 4932 if (cpu_has_vmx_vmfunc()) 4933 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4934 4935 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4936 
vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 4937 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4938 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4939 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4940 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4941 4942 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4943 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4944 4945 vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); 4946 4947 /* 22.2.1, 20.8.1 */ 4948 vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); 4949 4950 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4951 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4952 4953 set_cr4_guest_host_mask(vmx); 4954 4955 if (vmx->vpid != 0) 4956 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4957 4958 if (cpu_has_vmx_xsaves()) 4959 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4960 4961 if (enable_pml) { 4962 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4963 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 4964 } 4965 4966 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4967 4968 if (vmx_pt_mode_is_host_guest()) { 4969 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4970 /* Bit[6~0] are forced to 1, writes are ignored. 
*/ 4971 vmx->pt_desc.guest.output_mask = 0x7F; 4972 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4973 } 4974 4975 vmcs_write32(GUEST_SYSENTER_CS, 0); 4976 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4977 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4978 4979 vmx_guest_debugctl_write(&vmx->vcpu, 0); 4980 4981 if (cpu_has_vmx_tpr_shadow()) { 4982 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4983 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4984 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4985 __pa(vmx->vcpu.arch.apic->regs)); 4986 vmcs_write32(TPR_THRESHOLD, 0); 4987 } 4988 4989 vmx_setup_uret_msrs(vmx); 4990 } 4991 4992 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4993 { 4994 struct vcpu_vmx *vmx = to_vmx(vcpu); 4995 4996 init_vmcs(vmx); 4997 4998 if (nested && 4999 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 5000 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 5001 5002 vcpu_setup_sgx_lepubkeyhash(vcpu); 5003 5004 vmx->nested.posted_intr_nv = -1; 5005 vmx->nested.vmxon_ptr = INVALID_GPA; 5006 vmx->nested.current_vmptr = INVALID_GPA; 5007 5008 #ifdef CONFIG_KVM_HYPERV 5009 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 5010 #endif 5011 5012 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 5013 vcpu->arch.microcode_version = 0x100000000ULL; 5014 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 5015 5016 /* 5017 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 5018 * or POSTED_INTR_WAKEUP_VECTOR. 
5019 */ 5020 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 5021 __pi_set_sn(&vmx->vt.pi_desc); 5022 } 5023 5024 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 5025 { 5026 struct vcpu_vmx *vmx = to_vmx(vcpu); 5027 5028 if (!init_event) 5029 __vmx_vcpu_reset(vcpu); 5030 5031 vmx->rmode.vm86_active = 0; 5032 vmx->spec_ctrl = 0; 5033 5034 vmx->msr_ia32_umwait_control = 0; 5035 5036 vmx->hv_deadline_tsc = -1; 5037 kvm_set_cr8(vcpu, 0); 5038 5039 seg_setup(VCPU_SREG_CS); 5040 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 5041 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 5042 5043 seg_setup(VCPU_SREG_DS); 5044 seg_setup(VCPU_SREG_ES); 5045 seg_setup(VCPU_SREG_FS); 5046 seg_setup(VCPU_SREG_GS); 5047 seg_setup(VCPU_SREG_SS); 5048 5049 vmcs_write16(GUEST_TR_SELECTOR, 0); 5050 vmcs_writel(GUEST_TR_BASE, 0); 5051 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 5052 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 5053 5054 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 5055 vmcs_writel(GUEST_LDTR_BASE, 0); 5056 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 5057 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 5058 5059 vmcs_writel(GUEST_GDTR_BASE, 0); 5060 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 5061 5062 vmcs_writel(GUEST_IDTR_BASE, 0); 5063 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 5064 5065 vmx_segment_cache_clear(vmx); 5066 kvm_register_mark_available(vcpu, VCPU_REG_SEGMENTS); 5067 5068 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 5069 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 5070 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 5071 if (kvm_mpx_supported()) 5072 vmcs_write64(GUEST_BNDCFGS, 0); 5073 5074 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 5075 5076 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 5077 vmcs_writel(GUEST_SSP, 0); 5078 vmcs_writel(GUEST_INTR_SSP_TABLE, 0); 5079 } 5080 if (kvm_cpu_cap_has(X86_FEATURE_IBT) || 5081 kvm_cpu_cap_has(X86_FEATURE_SHSTK)) 5082 vmcs_writel(GUEST_S_CET, 0); 5083 5084 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5085 5086 
vpid_sync_context(vmx->vpid); 5087 5088 vmx_update_fb_clear_dis(vcpu, vmx); 5089 } 5090 5091 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 5092 { 5093 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5094 } 5095 5096 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 5097 { 5098 if (!enable_vnmi || 5099 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 5100 vmx_enable_irq_window(vcpu); 5101 return; 5102 } 5103 5104 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5105 } 5106 5107 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 5108 { 5109 struct vcpu_vmx *vmx = to_vmx(vcpu); 5110 uint32_t intr; 5111 int irq = vcpu->arch.interrupt.nr; 5112 5113 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 5114 5115 ++vcpu->stat.irq_injections; 5116 if (vmx->rmode.vm86_active) { 5117 int inc_eip = 0; 5118 if (vcpu->arch.interrupt.soft) 5119 inc_eip = vcpu->arch.event_exit_inst_len; 5120 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 5121 return; 5122 } 5123 intr = irq | INTR_INFO_VALID_MASK; 5124 if (vcpu->arch.interrupt.soft) { 5125 intr |= INTR_TYPE_SOFT_INTR; 5126 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 5127 vmx->vcpu.arch.event_exit_inst_len); 5128 } else 5129 intr |= INTR_TYPE_EXT_INTR; 5130 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 5131 5132 vmx_clear_hlt(vcpu); 5133 } 5134 5135 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 5136 { 5137 struct vcpu_vmx *vmx = to_vmx(vcpu); 5138 5139 if (!enable_vnmi) { 5140 /* 5141 * Tracking the NMI-blocked state in software is built upon 5142 * finding the next open IRQ window. This, in turn, depends on 5143 * well-behaving guests: They have to keep IRQs disabled at 5144 * least as long as the NMI handler runs. Otherwise we may 5145 * cause NMI nesting, maybe breaking the guest. But as this is 5146 * highly unlikely, we can live with the residual risk. 
5147 */ 5148 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 5149 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5150 } 5151 5152 ++vcpu->stat.nmi_injections; 5153 vmx->loaded_vmcs->nmi_known_unmasked = false; 5154 5155 if (vmx->rmode.vm86_active) { 5156 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 5157 return; 5158 } 5159 5160 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 5161 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 5162 5163 vmx_clear_hlt(vcpu); 5164 } 5165 5166 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 5167 { 5168 struct vcpu_vmx *vmx = to_vmx(vcpu); 5169 bool masked; 5170 5171 if (!enable_vnmi) 5172 return vmx->loaded_vmcs->soft_vnmi_blocked; 5173 if (vmx->loaded_vmcs->nmi_known_unmasked) 5174 return false; 5175 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 5176 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5177 return masked; 5178 } 5179 5180 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5181 { 5182 struct vcpu_vmx *vmx = to_vmx(vcpu); 5183 5184 if (!enable_vnmi) { 5185 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 5186 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5187 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5188 } 5189 } else { 5190 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5191 if (masked) 5192 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5193 GUEST_INTR_STATE_NMI); 5194 else 5195 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5196 GUEST_INTR_STATE_NMI); 5197 } 5198 } 5199 5200 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5201 { 5202 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5203 return false; 5204 5205 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5206 return true; 5207 5208 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5209 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5210 GUEST_INTR_STATE_NMI)); 5211 } 5212 5213 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5214 { 5215 if (vcpu->arch.nested_run_pending) 5216 return -EBUSY; 5217 5218 /* An 
NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5219 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5220 return -EBUSY; 5221 5222 return !vmx_nmi_blocked(vcpu); 5223 } 5224 5225 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5226 { 5227 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5228 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5229 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5230 } 5231 5232 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5233 { 5234 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5235 return false; 5236 5237 return __vmx_interrupt_blocked(vcpu); 5238 } 5239 5240 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5241 { 5242 if (vcpu->arch.nested_run_pending) 5243 return -EBUSY; 5244 5245 /* 5246 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5247 * e.g. if the IRQ arrived asynchronously after checking nested events. 5248 */ 5249 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5250 return -EBUSY; 5251 5252 return !vmx_interrupt_blocked(vcpu); 5253 } 5254 5255 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5256 { 5257 void __user *ret; 5258 5259 if (enable_unrestricted_guest) 5260 return 0; 5261 5262 mutex_lock(&kvm->slots_lock); 5263 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5264 PAGE_SIZE * 3); 5265 mutex_unlock(&kvm->slots_lock); 5266 5267 if (IS_ERR(ret)) 5268 return PTR_ERR(ret); 5269 5270 to_kvm_vmx(kvm)->tss_addr = addr; 5271 5272 return init_rmode_tss(kvm, ret); 5273 } 5274 5275 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5276 { 5277 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5278 return 0; 5279 } 5280 5281 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5282 { 5283 switch (vec) { 5284 case BP_VECTOR: 5285 /* 5286 * Update instruction length as we may reinject the exception 5287 * from user space while in guest debugging mode. 
5288 */ 5289 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5290 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5291 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5292 return false; 5293 fallthrough; 5294 case DB_VECTOR: 5295 return !(vcpu->guest_debug & 5296 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5297 case DE_VECTOR: 5298 case OF_VECTOR: 5299 case BR_VECTOR: 5300 case UD_VECTOR: 5301 case DF_VECTOR: 5302 case SS_VECTOR: 5303 case GP_VECTOR: 5304 case MF_VECTOR: 5305 return true; 5306 } 5307 return false; 5308 } 5309 5310 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5311 int vec, u32 err_code) 5312 { 5313 /* 5314 * Instruction with address size override prefix opcode 0x67 5315 * Cause the #SS fault with 0 error code in VM86 mode. 5316 */ 5317 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5318 if (kvm_emulate_instruction(vcpu, 0)) { 5319 if (vcpu->arch.halt_request) { 5320 vcpu->arch.halt_request = 0; 5321 return kvm_emulate_halt_noskip(vcpu); 5322 } 5323 return 1; 5324 } 5325 return 0; 5326 } 5327 5328 /* 5329 * Forward all other exceptions that are valid in real mode. 5330 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5331 * the required debugging infrastructure rework. 5332 */ 5333 kvm_queue_exception(vcpu, vec); 5334 return 1; 5335 } 5336 5337 static int handle_machine_check(struct kvm_vcpu *vcpu) 5338 { 5339 /* handled by vmx_vcpu_run() */ 5340 return 1; 5341 } 5342 5343 /* 5344 * If the host has split lock detection disabled, then #AC is 5345 * unconditionally injected into the guest, which is the pre split lock 5346 * detection behaviour. 
5347 * 5348 * If the host has split lock detection enabled then #AC is 5349 * only injected into the guest when: 5350 * - Guest CPL == 3 (user mode) 5351 * - Guest has #AC detection enabled in CR0 5352 * - Guest EFLAGS has AC bit set 5353 */ 5354 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5355 { 5356 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5357 return true; 5358 5359 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5360 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5361 } 5362 5363 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) 5364 { 5365 return vcpu->arch.guest_fpu.fpstate->xfd && 5366 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5367 } 5368 5369 static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code) 5370 { 5371 unsigned long cr2 = vmx_get_exit_qual(vcpu); 5372 5373 if (vcpu->arch.apf.host_apf_flags) 5374 goto handle_pf; 5375 5376 /* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */ 5377 WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr); 5378 5379 /* 5380 * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX 5381 * flag set in the error code (SGX1 hardware generates #GP(0)). EPCM 5382 * violations have nothing to do with shadow paging and can never be 5383 * resolved by KVM; always reflect them into the guest. 5384 */ 5385 if (error_code & PFERR_SGX_MASK) { 5386 WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) || 5387 !cpu_feature_enabled(X86_FEATURE_SGX2)); 5388 5389 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) 5390 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5391 else 5392 kvm_inject_gp(vcpu, 0); 5393 return 1; 5394 } 5395 5396 /* 5397 * If EPT is enabled, fixup and inject the #PF. KVM intercepts #PFs 5398 * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due 5399 * to the GPA being legal with respect to host.MAXPHYADDR). 
5400 */ 5401 if (enable_ept) { 5402 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5403 return 1; 5404 } 5405 5406 handle_pf: 5407 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5408 } 5409 5410 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5411 { 5412 struct vcpu_vmx *vmx = to_vmx(vcpu); 5413 struct kvm_run *kvm_run = vcpu->run; 5414 u32 intr_info, ex_no, error_code; 5415 unsigned long dr6; 5416 u32 vect_info; 5417 5418 vect_info = vmx->idt_vectoring_info; 5419 intr_info = vmx_get_intr_info(vcpu); 5420 5421 /* 5422 * Machine checks are handled by handle_exception_irqoff(), or by 5423 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5424 * vmx_vcpu_enter_exit(). 5425 */ 5426 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5427 return 1; 5428 5429 /* 5430 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5431 * This ensures the nested_vmx check is not skipped so vmexit can 5432 * be reflected to L1 (when it intercepts #NM) before reaching this 5433 * point. 5434 */ 5435 if (is_nm_fault(intr_info)) { 5436 kvm_queue_exception_p(vcpu, NM_VECTOR, 5437 is_xfd_nm_fault(vcpu) ? 
vcpu->arch.guest_fpu.xfd_err : 0); 5438 return 1; 5439 } 5440 5441 if (is_invalid_opcode(intr_info)) 5442 return handle_ud(vcpu); 5443 5444 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5445 struct vmx_ve_information *ve_info = vmx->ve_info; 5446 5447 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5448 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5449 dump_vmcs(vcpu); 5450 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5451 return 1; 5452 } 5453 5454 error_code = 0; 5455 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5456 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5457 5458 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5459 WARN_ON_ONCE(!enable_vmware_backdoor); 5460 5461 /* 5462 * VMware backdoor emulation on #GP interception only handles 5463 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5464 * error code on #GP. 5465 */ 5466 if (error_code) { 5467 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5468 return 1; 5469 } 5470 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5471 } 5472 5473 /* 5474 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5475 * MMIO, it is better to report an internal error. 5476 * See the comments in vmx_handle_exit. 
5477 */ 5478 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5479 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5480 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5481 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5482 vcpu->run->internal.ndata = 4; 5483 vcpu->run->internal.data[0] = vect_info; 5484 vcpu->run->internal.data[1] = intr_info; 5485 vcpu->run->internal.data[2] = error_code; 5486 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5487 return 0; 5488 } 5489 5490 if (is_page_fault(intr_info)) 5491 return vmx_handle_page_fault(vcpu, error_code); 5492 5493 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5494 5495 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5496 return handle_rmode_exception(vcpu, ex_no, error_code); 5497 5498 switch (ex_no) { 5499 case DB_VECTOR: 5500 dr6 = vmx_get_exit_qual(vcpu); 5501 if (!(vcpu->guest_debug & 5502 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5503 /* 5504 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5505 * instruction. ICEBP generates a trap-like #DB, but 5506 * despite its interception control being tied to #DB, 5507 * is an instruction intercept, i.e. the VM-Exit occurs 5508 * on the ICEBP itself. Use the inner "skip" helper to 5509 * avoid single-step #DB and MTF updates, as ICEBP is 5510 * higher priority. Note, skipping ICEBP still clears 5511 * STI and MOVSS blocking. 5512 */ 5513 if (is_icebp(intr_info)) 5514 WARN_ON(!skip_emulated_instruction(vcpu)); 5515 5516 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5517 return 1; 5518 } 5519 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5520 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5521 fallthrough; 5522 case BP_VECTOR: 5523 /* 5524 * Update instruction length as we may reinject #BP from 5525 * user space while in guest debugging mode. Reading it for 5526 * #DB as well causes no harm, it is not used in that case. 
/*
 * Write the VMCALL instruction into the first three bytes of @hypercall.
 * @vcpu is not used.
 */
void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/* Byte sequence patched in for VMCALL. */
	static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };

	memcpy(hypercall, vmcall_insn, sizeof(vmcall_insn));
}
*/ 5602 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5603 { 5604 if (is_guest_mode(vcpu)) { 5605 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5606 unsigned long orig_val = val; 5607 5608 /* 5609 * We get here when L2 changed cr0 in a way that did not change 5610 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5611 * but did change L0 shadowed bits. So we first calculate the 5612 * effective cr0 value that L1 would like to write into the 5613 * hardware. It consists of the L2-owned bits from the new 5614 * value combined with the L1-owned bits from L1's guest_cr0. 5615 */ 5616 val = (val & ~vmcs12->cr0_guest_host_mask) | 5617 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5618 5619 if (kvm_set_cr0(vcpu, val)) 5620 return 1; 5621 vmcs_writel(CR0_READ_SHADOW, orig_val); 5622 return 0; 5623 } else { 5624 return kvm_set_cr0(vcpu, val); 5625 } 5626 } 5627 5628 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5629 { 5630 if (is_guest_mode(vcpu)) { 5631 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5632 unsigned long orig_val = val; 5633 5634 /* analogously to handle_set_cr0 */ 5635 val = (val & ~vmcs12->cr4_guest_host_mask) | 5636 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5637 if (kvm_set_cr4(vcpu, val)) 5638 return 1; 5639 vmcs_writel(CR4_READ_SHADOW, orig_val); 5640 return 0; 5641 } else 5642 return kvm_set_cr4(vcpu, val); 5643 } 5644 5645 static int handle_desc(struct kvm_vcpu *vcpu) 5646 { 5647 /* 5648 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5649 * and other code needs to be updated if UMIP can be guest owned. 
/*
 * Handle a control-register access VM-Exit.
 *
 * The exit qualification is decoded as: bits 3:0 the control register
 * number, bits 5:4 the access type (0 = mov to cr, 1 = mov from cr,
 * 2 = clts, 3 = lmsw) and bits 11:8 the general-purpose register operand.
 *
 * Handled combinations return from inside the switch.  Anything else falls
 * through to the bottom, where exit_reason is zeroed, the access is logged
 * via vcpu_unimpl() and 0 is returned.
 */
static int handle_cr(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification, val;
	int cr;
	int reg;
	int err;
	int ret;

	exit_qualification = vmx_get_exit_qual(vcpu);
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		val = kvm_register_read(vcpu, reg);
		trace_kvm_cr_write(cr, val);
		switch (cr) {
		case 0:
			err = handle_set_cr0(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 3:
			/* A CR3 write exit is not expected with unrestricted guest. */
			WARN_ON_ONCE(enable_unrestricted_guest);

			err = kvm_set_cr3(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 4:
			err = handle_set_cr4(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 8: {
				u8 cr8_prev = kvm_get_cr8(vcpu);
				u8 cr8 = (u8)val;
				err = kvm_set_cr8(vcpu, cr8);
				ret = kvm_complete_insn_gp(vcpu, err);
				/*
				 * KVM_EXIT_SET_TPR is only reported without an
				 * in-kernel local APIC, and only when the new
				 * CR8 value is lower than the previous one.
				 */
				if (lapic_in_kernel(vcpu))
					return ret;
				if (cr8_prev <= cr8)
					return ret;
				/*
				 * TODO: we might be squashing a
				 * KVM_GUESTDBG_SINGLESTEP-triggered
				 * KVM_EXIT_DEBUG here.
				 */
				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
				return 0;
			}
		}
		break;
	case 2: /* clts */
		/* Treated as a KVM bug: the guest always owns CR0.TS. */
		KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
		return -EIO;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			WARN_ON_ONCE(enable_unrestricted_guest);

			val = kvm_read_cr3(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
			return kvm_skip_emulated_instruction(vcpu);
		case 8:
			val = kvm_get_cr8(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
			return kvm_skip_emulated_instruction(vcpu);
		}
		break;
	case 3: /* lmsw */
		/* Only the low four bits of the LMSW source operand are used. */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
		kvm_lmsw(vcpu, val);

		return kvm_skip_emulated_instruction(vcpu);
	default:
		break;
	}
	/* Unhandled access type / register combination. */
	vcpu->run->exit_reason = 0;
	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
	       (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}
/*
 * Copy the hardware debug registers DR0-DR3 and DR6, plus GUEST_DR7 from the
 * VMCS, into vcpu->arch, then clear KVM_DEBUGREG_WONT_EXIT and turn MOV-DR
 * exiting back on.
 */
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	get_debugreg(vcpu->arch.dr6, 6);
	/* DR7 is taken from the VMCS guest-state field, not from hardware. */
	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

	/*
	 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
	 * a stale dr6 from the guest.
	 */
	set_debugreg(DR6_RESERVED, 6);
}

/* Write @val to the GUEST_DR7 field of the VMCS. */
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
	vmcs_writel(GUEST_DR7, val);
}

/* TPR-below-threshold exit: call kvm_apic_update_ppr() and return 1. */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
	kvm_apic_update_ppr(vcpu);
	return 1;
}

/*
 * Interrupt-window exit: turn interrupt-window exiting off again, raise
 * KVM_REQ_EVENT, count the exit and return 1.
 */
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	++vcpu->stat.irq_window_exits;
	return 1;
}

/*
 * INVLPG exit: pass the exit qualification to kvm_mmu_invlpg(), then skip
 * the instruction.
 */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

	kvm_mmu_invlpg(vcpu, exit_qualification);
	return kvm_skip_emulated_instruction(vcpu);
}
/*
 * EOI-induced VM-Exit: hand the vector, taken from the low byte of the exit
 * qualification, to kvm_apic_set_eoi_accelerated().
 */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	int vector = vmx_get_exit_qual(vcpu) & 0xff;

	/* The exit is trap-like, so the instruction pointer needs no adjustment. */
	kvm_apic_set_eoi_accelerated(vcpu, vector);

	return 1;
}
/*
 * Handle a task-switch VM-Exit by handing it to kvm_task_switch().
 *
 * The low 16 bits of the exit qualification supply the TSS selector and
 * bits 31:30 the reason for the switch.  When the switch went through a task
 * gate while an event was being delivered (IDT-vectoring info valid), that
 * event's pending state is dropped first.
 */
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	bool has_error_code = false;
	u32 error_code = 0;
	u16 tss_selector;
	int reason, type, idt_v, idt_index;

	/* Valid bit, vector and event type of the IDT-vectoring information. */
	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

	exit_qualification = vmx_get_exit_qual(vcpu);

	reason = (u32)exit_qualification >> 30;
	if (reason == TASK_SWITCH_GATE && idt_v) {
		switch (type) {
		case INTR_TYPE_NMI_INTR:
			vcpu->arch.nmi_injected = false;
			vmx_set_nmi_mask(vcpu, true);
			break;
		case INTR_TYPE_EXT_INTR:
		case INTR_TYPE_SOFT_INTR:
			kvm_clear_interrupt_queue(vcpu);
			break;
		case INTR_TYPE_HARD_EXCEPTION:
			/* Keep the error code so it can be passed to kvm_task_switch(). */
			if (vmx->idt_vectoring_info &
			    VECTORING_INFO_DELIVER_CODE_MASK) {
				has_error_code = true;
				error_code =
					vmcs_read32(IDT_VECTORING_ERROR_CODE);
			}
			fallthrough;
		case INTR_TYPE_SOFT_EXCEPTION:
			kvm_clear_exception_queue(vcpu);
			break;
		default:
			break;
		}
	}
	tss_selector = exit_qualification;

	/*
	 * Skip the instruction unless the switch happened during delivery of
	 * a hardware exception, external interrupt or NMI.
	 */
	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
		       type != INTR_TYPE_EXT_INTR &&
		       type != INTR_TYPE_NMI_INTR))
		WARN_ON(!skip_emulated_instruction(vcpu));

	/*
	 * TODO: What about debug traps on tss switch?
	 *       Are we supposed to inject them and update dr6?
	 */
	return kvm_task_switch(vcpu, tss_selector,
			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
			       reason, has_error_code, error_code);
}

/*
 * Handle an EPT-violation VM-Exit: read the faulting guest-physical address
 * from the VMCS and pass it, with the exit qualification, to
 * __vmx_handle_ept_violation(), unless the GPA is illegal for the vCPU, in
 * which case the instruction is emulated.
 */
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	gpa_t gpa;

	/*
	 * EPT violation happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 * There are errata that may cause this bit to not be set:
	 * AAK134, BY25.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    enable_vnmi &&
	    (exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	trace_kvm_page_fault(vcpu, gpa, exit_qualification);

	/*
	 * Check that the GPA doesn't exceed physical memory limits, as that is
	 * a guest page fault.  We have to emulate the instruction here, because
	 * if the illegal address is that of a paging structure, then
	 * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
	 * would also use advanced VM-exit information for EPT violations to
	 * reconstruct the page fault error code.
	 */
	if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
		return kvm_emulate_instruction(vcpu, 0);

	return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
5994 */ 5995 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5996 if (!is_guest_mode(vcpu) && 5997 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5998 trace_kvm_fast_mmio(gpa); 5999 return kvm_skip_emulated_instruction(vcpu); 6000 } 6001 6002 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 6003 } 6004 6005 static int handle_nmi_window(struct kvm_vcpu *vcpu) 6006 { 6007 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 6008 return -EIO; 6009 6010 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 6011 ++vcpu->stat.nmi_window_exits; 6012 kvm_make_request(KVM_REQ_EVENT, vcpu); 6013 6014 return 1; 6015 } 6016 6017 /* 6018 * Returns true if emulation is required (due to the vCPU having invalid state 6019 * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the 6020 * current vCPU state. 6021 */ 6022 static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) 6023 { 6024 struct vcpu_vmx *vmx = to_vmx(vcpu); 6025 6026 if (!vmx->vt.emulation_required) 6027 return false; 6028 6029 /* 6030 * It is architecturally impossible for emulation to be required when a 6031 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if 6032 * guest state is invalid and unrestricted guest is disabled, i.e. KVM 6033 * should synthesize VM-Fail instead emulation L2 code. This path is 6034 * only reachable if userspace modifies L2 guest state after KVM has 6035 * performed the nested VM-Enter consistency checks. 6036 */ 6037 if (vcpu->arch.nested_run_pending) 6038 return true; 6039 6040 /* 6041 * KVM only supports emulating exceptions if the vCPU is in Real Mode. 6042 * If emulation is required, KVM can't perform a successful VM-Enter to 6043 * inject the exception. 
/*
 * Emulate guest instructions while vmx->vt.emulation_required is set, for at
 * most 130 instructions per call.
 *
 * Returns 0 when emulation fails or the required emulation cannot be
 * handled, the result of kvm_emulate_halt_noskip() when an emulated
 * instruction requested a halt, and 1 in the remaining cases (loop finished,
 * event request pending, or guest-mode work pending).  An open interrupt
 * window that had been requested is handed to handle_interrupt_window().
 */
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intr_window_requested;
	unsigned count = 130;

	/* Sampled once, before the loop. */
	intr_window_requested = exec_controls_get(vmx) &
				CPU_BASED_INTR_WINDOW_EXITING;

	while (vmx->vt.emulation_required && count-- != 0) {
		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
			return handle_interrupt_window(&vmx->vcpu);

		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
			return 1;

		/*
		 * Ensure that any updates to kvm->buses[] observed by the
		 * previous instruction (emulated or otherwise) are also
		 * visible to the instruction KVM is about to emulate.
		 */
		smp_rmb();

		if (!kvm_emulate_instruction(vcpu, 0))
			return 0;

		if (vmx_unhandleable_emulation_required(vcpu)) {
			kvm_prepare_emulation_failure_exit(vcpu);
			return 0;
		}

		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}

		/*
		 * Note, return 1 and not 0, vcpu_run() will invoke
		 * xfer_to_guest_mode() which will create a proper return
		 * code.
		 */
		if (__xfer_to_guest_mode_work_pending())
			return 1;
	}

	return 1;
}

/*
 * Pre-run check: when emulation is required but cannot be handled, prepare
 * an emulation-failure exit and return 0; otherwise return 1.
 */
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (vmx_unhandleable_emulation_required(vcpu)) {
		kvm_prepare_emulation_failure_exit(vcpu);
		return 0;
	}

	return 1;
}
6110 */ 6111 static int handle_pause(struct kvm_vcpu *vcpu) 6112 { 6113 if (!kvm_pause_in_guest(vcpu->kvm)) 6114 grow_ple_window(vcpu); 6115 6116 /* 6117 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 6118 * VM-execution control is ignored if CPL > 0. OTOH, KVM 6119 * never set PAUSE_EXITING and just set PLE if supported, 6120 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 6121 */ 6122 kvm_vcpu_on_spin(vcpu, true); 6123 return kvm_skip_emulated_instruction(vcpu); 6124 } 6125 6126 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 6127 { 6128 return 1; 6129 } 6130 6131 static int handle_invpcid(struct kvm_vcpu *vcpu) 6132 { 6133 u32 vmx_instruction_info; 6134 unsigned long type; 6135 gva_t gva; 6136 struct { 6137 u64 pcid; 6138 u64 gla; 6139 } operand; 6140 int gpr_index; 6141 6142 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 6143 kvm_queue_exception(vcpu, UD_VECTOR); 6144 return 1; 6145 } 6146 6147 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6148 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6149 type = kvm_register_read(vcpu, gpr_index); 6150 6151 /* According to the Intel instruction reference, the memory operand 6152 * is read even if it isn't needed (e.g., for type==all) 6153 */ 6154 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6155 vmx_instruction_info, false, 6156 sizeof(operand), &gva)) 6157 return 1; 6158 6159 return kvm_handle_invpcid(vcpu, type, gva); 6160 } 6161 6162 static int handle_pml_full(struct kvm_vcpu *vcpu) 6163 { 6164 unsigned long exit_qualification; 6165 6166 trace_kvm_pml_full(vcpu->vcpu_id); 6167 6168 exit_qualification = vmx_get_exit_qual(vcpu); 6169 6170 /* 6171 * PML buffer FULL happened while executing iret from NMI, 6172 * "blocked by NMI" bit has to be set before next VM entry. 
6173 */ 6174 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 6175 enable_vnmi && 6176 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 6177 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6178 GUEST_INTR_STATE_NMI); 6179 6180 /* 6181 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 6182 * here.., and there's no userspace involvement needed for PML. 6183 */ 6184 return 1; 6185 } 6186 6187 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 6188 bool force_immediate_exit) 6189 { 6190 struct vcpu_vmx *vmx = to_vmx(vcpu); 6191 6192 /* 6193 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 6194 * due to the timer expiring while it was "soft" disabled, just eat the 6195 * exit and re-enter the guest. 6196 */ 6197 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 6198 return EXIT_FASTPATH_REENTER_GUEST; 6199 6200 /* 6201 * If the timer expired because KVM used it to force an immediate exit, 6202 * then mission accomplished. 6203 */ 6204 if (force_immediate_exit) 6205 return EXIT_FASTPATH_EXIT_HANDLED; 6206 6207 /* 6208 * If L2 is active, go down the slow path as emulating the guest timer 6209 * expiration likely requires synthesizing a nested VM-Exit. 6210 */ 6211 if (is_guest_mode(vcpu)) 6212 return EXIT_FASTPATH_NONE; 6213 6214 kvm_lapic_expired_hv_timer(vcpu); 6215 return EXIT_FASTPATH_REENTER_GUEST; 6216 } 6217 6218 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6219 { 6220 /* 6221 * This non-fastpath handler is reached if and only if the preemption 6222 * timer was being used to emulate a guest timer while L2 is active. 6223 * All other scenarios are supposed to be handled in the fastpath. 6224 */ 6225 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6226 kvm_lapic_expired_hv_timer(vcpu); 6227 return 1; 6228 } 6229 6230 /* 6231 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6232 * are overwritten by nested_vmx_hardware_setup() when nested=1. 
6233 */ 6234 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6235 { 6236 kvm_queue_exception(vcpu, UD_VECTOR); 6237 return 1; 6238 } 6239 6240 static int handle_tdx_instruction(struct kvm_vcpu *vcpu) 6241 { 6242 kvm_queue_exception(vcpu, UD_VECTOR); 6243 return 1; 6244 } 6245 6246 #ifndef CONFIG_X86_SGX_KVM 6247 static int handle_encls(struct kvm_vcpu *vcpu) 6248 { 6249 /* 6250 * SGX virtualization is disabled. There is no software enable bit for 6251 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6252 * the guest from executing ENCLS (when SGX is supported by hardware). 6253 */ 6254 kvm_queue_exception(vcpu, UD_VECTOR); 6255 return 1; 6256 } 6257 #endif /* CONFIG_X86_SGX_KVM */ 6258 6259 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6260 { 6261 /* 6262 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6263 * VM-Exits. Unconditionally set the flag here and leave the handling to 6264 * vmx_handle_exit(). 6265 */ 6266 to_vt(vcpu)->exit_reason.bus_lock_detected = true; 6267 return 1; 6268 } 6269 6270 static int handle_notify(struct kvm_vcpu *vcpu) 6271 { 6272 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6273 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6274 6275 ++vcpu->stat.notify_window_exits; 6276 6277 /* 6278 * Notify VM exit happened while executing iret from NMI, 6279 * "blocked by NMI" bit has to be set before next VM entry. 6280 */ 6281 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6282 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6283 GUEST_INTR_STATE_NMI); 6284 6285 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6286 context_invalid) { 6287 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6288 vcpu->run->notify.flags = context_invalid ? 
6289 KVM_NOTIFY_CONTEXT_INVALID : 0; 6290 return 0; 6291 } 6292 6293 return 1; 6294 } 6295 6296 static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) 6297 { 6298 return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); 6299 } 6300 6301 static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) 6302 { 6303 return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 6304 vmx_get_msr_imm_reg(vcpu)); 6305 } 6306 6307 static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) 6308 { 6309 return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 6310 vmx_get_msr_imm_reg(vcpu)); 6311 } 6312 6313 /* 6314 * The exit handlers return 1 if the exit was handled fully and guest execution 6315 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6316 * to be done to userspace and return 0. 6317 */ 6318 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6319 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6320 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6321 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6322 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6323 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6324 [EXIT_REASON_CR_ACCESS] = handle_cr, 6325 [EXIT_REASON_DR_ACCESS] = handle_dr, 6326 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6327 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6328 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6329 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6330 [EXIT_REASON_HLT] = kvm_emulate_halt, 6331 [EXIT_REASON_INVD] = kvm_emulate_invd, 6332 [EXIT_REASON_INVLPG] = handle_invlpg, 6333 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6334 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6335 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6336 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6337 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6338 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6339 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6340 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 
6341 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6342 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6343 [EXIT_REASON_VMON] = handle_vmx_instruction, 6344 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6345 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6346 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6347 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6348 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6349 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6350 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6351 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6352 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6353 [EXIT_REASON_LDTR_TR] = handle_desc, 6354 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6355 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6356 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6357 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6358 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6359 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6360 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6361 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6362 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6363 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6364 [EXIT_REASON_PML_FULL] = handle_pml_full, 6365 [EXIT_REASON_INVPCID] = handle_invpcid, 6366 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6367 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6368 [EXIT_REASON_ENCLS] = handle_encls, 6369 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6370 [EXIT_REASON_NOTIFY] = handle_notify, 6371 [EXIT_REASON_SEAMCALL] = handle_tdx_instruction, 6372 [EXIT_REASON_TDCALL] = handle_tdx_instruction, 6373 [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, 6374 [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, 6375 }; 6376 6377 static const int kvm_vmx_max_exit_handlers = 6378 ARRAY_SIZE(kvm_vmx_exit_handlers); 6379 6380 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6381 u64 *info1, u64 *info2, u32 
*intr_info, u32 *error_code) 6382 { 6383 struct vcpu_vmx *vmx = to_vmx(vcpu); 6384 6385 *reason = vmx->vt.exit_reason.full; 6386 *info1 = vmx_get_exit_qual(vcpu); 6387 if (!(vmx->vt.exit_reason.failed_vmentry)) { 6388 *info2 = vmx->idt_vectoring_info; 6389 *intr_info = vmx_get_intr_info(vcpu); 6390 if (is_exception_with_error_code(*intr_info)) 6391 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6392 else 6393 *error_code = 0; 6394 } else { 6395 *info2 = 0; 6396 *intr_info = 0; 6397 *error_code = 0; 6398 } 6399 } 6400 6401 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) 6402 { 6403 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 6404 if (is_exception_with_error_code(*intr_info)) 6405 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); 6406 else 6407 *error_code = 0; 6408 } 6409 6410 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6411 { 6412 if (vmx->pml_pg) { 6413 __free_page(vmx->pml_pg); 6414 vmx->pml_pg = NULL; 6415 } 6416 } 6417 6418 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6419 { 6420 struct vcpu_vmx *vmx = to_vmx(vcpu); 6421 u16 pml_idx, pml_tail_index; 6422 u64 *pml_buf; 6423 int i; 6424 6425 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6426 6427 /* Do nothing if PML buffer is empty */ 6428 if (pml_idx == PML_HEAD_INDEX) 6429 return; 6430 /* 6431 * PML index always points to the next available PML buffer entity 6432 * unless PML log has just overflowed. 6433 */ 6434 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; 6435 6436 /* 6437 * PML log is written backwards: the CPU first writes the entry 511 6438 * then the entry 510, and so on. 6439 * 6440 * Read the entries in the same order they were written, to ensure that 6441 * the dirty ring is filled in the same order the CPU wrote them. 
6442 */ 6443 pml_buf = page_address(vmx->pml_pg); 6444 6445 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { 6446 u64 gpa; 6447 6448 gpa = pml_buf[i]; 6449 WARN_ON(gpa & (PAGE_SIZE - 1)); 6450 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6451 } 6452 6453 /* reset PML index */ 6454 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 6455 } 6456 6457 static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 6458 { 6459 struct vcpu_vmx *vmx = to_vmx(vcpu); 6460 6461 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map); 6462 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); 6463 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); 6464 } 6465 6466 static void vmx_dump_sel(char *name, uint32_t sel) 6467 { 6468 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6469 name, vmcs_read16(sel), 6470 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6471 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6472 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6473 } 6474 6475 static void vmx_dump_dtsel(char *name, uint32_t limit) 6476 { 6477 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6478 name, vmcs_read32(limit), 6479 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6480 } 6481 6482 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6483 { 6484 unsigned int i; 6485 struct vmx_msr_entry *e; 6486 6487 pr_err("MSR %s:\n", name); 6488 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6489 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6490 } 6491 6492 void dump_vmcs(struct kvm_vcpu *vcpu) 6493 { 6494 struct vcpu_vmx *vmx = to_vmx(vcpu); 6495 u32 vmentry_ctl, vmexit_ctl; 6496 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6497 u64 tertiary_exec_control; 6498 unsigned long cr4; 6499 int efer_slot; 6500 6501 if (!dump_invalid_vmcs) { 6502 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6503 return; 6504 } 6505 
6506 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6507 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6508 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6509 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6510 cr4 = vmcs_readl(GUEST_CR4); 6511 6512 if (cpu_has_secondary_exec_ctrls()) 6513 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6514 else 6515 secondary_exec_control = 0; 6516 6517 if (cpu_has_tertiary_exec_ctrls()) 6518 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6519 else 6520 tertiary_exec_control = 0; 6521 6522 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6523 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6524 pr_err("*** Guest State ***\n"); 6525 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6526 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6527 vmcs_readl(CR0_GUEST_HOST_MASK)); 6528 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6529 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6530 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6531 if (cpu_has_vmx_ept()) { 6532 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6533 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6534 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 6535 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6536 } 6537 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6538 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6539 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6540 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6541 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6542 vmcs_readl(GUEST_SYSENTER_ESP), 6543 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6544 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6545 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6546 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6547 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6548 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6549 vmx_dump_sel("GS: ", 
GUEST_GS_SELECTOR); 6550 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6551 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6552 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6553 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6554 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6555 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6556 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6557 else if (efer_slot >= 0) 6558 pr_err("EFER= 0x%016llx (autoload)\n", 6559 vmx->msr_autoload.guest.val[efer_slot].value); 6560 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6561 pr_err("EFER= 0x%016llx (effective)\n", 6562 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6563 else 6564 pr_err("EFER= 0x%016llx (effective)\n", 6565 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6566 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6567 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6568 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6569 vmcs_read64(GUEST_IA32_DEBUGCTL), 6570 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6571 if (cpu_has_load_perf_global_ctrl() && 6572 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6573 pr_err("PerfGlobCtl = 0x%016llx\n", 6574 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6575 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6576 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6577 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6578 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6579 vmcs_read32(GUEST_ACTIVITY_STATE)); 6580 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6581 pr_err("InterruptStatus = %04x\n", 6582 vmcs_read16(GUEST_INTR_STATUS)); 6583 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6584 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6585 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6586 vmx_dump_msrs("autostore", &vmx->msr_autostore); 6587 6588 if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) 6589 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", 6590 
vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), 6591 vmcs_readl(GUEST_INTR_SSP_TABLE)); 6592 pr_err("*** Host State ***\n"); 6593 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6594 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6595 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6596 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6597 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6598 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6599 vmcs_read16(HOST_TR_SELECTOR)); 6600 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6601 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6602 vmcs_readl(HOST_TR_BASE)); 6603 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6604 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6605 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6606 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6607 vmcs_readl(HOST_CR4)); 6608 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6609 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6610 vmcs_read32(HOST_IA32_SYSENTER_CS), 6611 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6612 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6613 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6614 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6615 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6616 if (cpu_has_load_perf_global_ctrl() && 6617 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6618 pr_err("PerfGlobCtl = 0x%016llx\n", 6619 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6620 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6621 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6622 if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) 6623 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", 6624 vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), 6625 vmcs_readl(HOST_INTR_SSP_TABLE)); 6626 6627 pr_err("*** Control State ***\n"); 6628 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6629 cpu_based_exec_ctrl, secondary_exec_control, 
tertiary_exec_control); 6630 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6631 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6632 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6633 vmcs_read32(EXCEPTION_BITMAP), 6634 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6635 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6636 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6637 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6638 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6639 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6640 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6641 vmcs_read32(VM_EXIT_INTR_INFO), 6642 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6643 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6644 pr_err(" reason=%08x qualification=%016lx\n", 6645 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6646 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6647 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6648 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6649 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6650 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6651 pr_err("TSC Multiplier = 0x%016llx\n", 6652 vmcs_read64(TSC_MULTIPLIER)); 6653 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6654 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6655 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6656 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6657 } 6658 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6659 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6660 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6661 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6662 } 6663 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6664 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6665 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6666 pr_err("EPT pointer = 0x%016llx\n", 
vmcs_read64(EPT_POINTER)); 6667 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6668 pr_err("PLE Gap=%08x Window=%08x\n", 6669 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6670 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6671 pr_err("Virtual processor ID = 0x%04x\n", 6672 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6673 if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { 6674 struct vmx_ve_information *ve_info = vmx->ve_info; 6675 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); 6676 6677 /* 6678 * If KVM is dumping the VMCS, then something has gone wrong 6679 * already. Derefencing an address from the VMCS, which could 6680 * very well be corrupted, is a terrible idea. The virtual 6681 * address is known so use it. 6682 */ 6683 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, 6684 ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); 6685 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", 6686 ve_info->exit_reason, ve_info->delivery, 6687 ve_info->exit_qualification, 6688 ve_info->guest_linear_address, 6689 ve_info->guest_physical_address, ve_info->eptp_index); 6690 } 6691 } 6692 6693 /* 6694 * The guest has exited. See if we can fix it or if we need userspace 6695 * assistance. 6696 */ 6697 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6698 { 6699 struct vcpu_vmx *vmx = to_vmx(vcpu); 6700 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 6701 u32 vectoring_info = vmx->idt_vectoring_info; 6702 u16 exit_handler_index; 6703 6704 /* 6705 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 6706 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 6707 * querying dirty_bitmap, we only need to kick all vcpus out of guest 6708 * mode as if vcpus is in root mode, the PML buffer must has been 6709 * flushed already. Note, PML is never enabled in hardware while 6710 * running L2. 
6711 */ 6712 if (enable_pml && !is_guest_mode(vcpu)) 6713 vmx_flush_pml_buffer(vcpu); 6714 6715 if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) 6716 return 0; 6717 6718 /* 6719 * KVM should never reach this point with a pending nested VM-Enter. 6720 * More specifically, short-circuiting VM-Entry to emulate L2 due to 6721 * invalid guest state should never happen as that means KVM knowingly 6722 * allowed a nested VM-Enter with an invalid vmcs12. More below. 6723 */ 6724 if (KVM_BUG_ON(vcpu->arch.nested_run_pending, vcpu->kvm)) 6725 return -EIO; 6726 6727 if (is_guest_mode(vcpu)) { 6728 /* 6729 * PML is never enabled when running L2, bail immediately if a 6730 * PML full exit occurs as something is horribly wrong. 6731 */ 6732 if (exit_reason.basic == EXIT_REASON_PML_FULL) 6733 goto unexpected_vmexit; 6734 6735 /* 6736 * The host physical addresses of some pages of guest memory 6737 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 6738 * Page). The CPU may write to these pages via their host 6739 * physical address while L2 is running, bypassing any 6740 * address-translation-based dirty tracking (e.g. EPT write 6741 * protection). 6742 * 6743 * Mark them dirty on every exit from L2 to prevent them from 6744 * getting out of sync with dirty tracking. 6745 */ 6746 nested_vmx_mark_all_vmcs12_pages_dirty(vcpu); 6747 6748 /* 6749 * Synthesize a triple fault if L2 state is invalid. In normal 6750 * operation, nested VM-Enter rejects any attempt to enter L2 6751 * with invalid state. However, those checks are skipped if 6752 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If 6753 * L2 state is invalid, it means either L1 modified SMRAM state 6754 * or userspace provided bad state. Synthesize TRIPLE_FAULT as 6755 * doing so is architecturally allowed in the RSM case, and is 6756 * the least awful solution for the userspace case without 6757 * risking false positives. 
6758 */ 6759 if (vmx->vt.emulation_required) { 6760 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 6761 return 1; 6762 } 6763 6764 if (nested_vmx_reflect_vmexit(vcpu)) 6765 return 1; 6766 } 6767 6768 /* If guest state is invalid, start emulating. L2 is handled above. */ 6769 if (vmx->vt.emulation_required) 6770 return handle_invalid_guest_state(vcpu); 6771 6772 if (exit_reason.failed_vmentry) { 6773 dump_vmcs(vcpu); 6774 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6775 vcpu->run->fail_entry.hardware_entry_failure_reason 6776 = exit_reason.full; 6777 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6778 return 0; 6779 } 6780 6781 if (unlikely(vmx->fail)) { 6782 dump_vmcs(vcpu); 6783 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6784 vcpu->run->fail_entry.hardware_entry_failure_reason 6785 = vmcs_read32(VM_INSTRUCTION_ERROR); 6786 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6787 return 0; 6788 } 6789 6790 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6791 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && 6792 exit_reason.basic != EXIT_REASON_EPT_VIOLATION && 6793 exit_reason.basic != EXIT_REASON_PML_FULL && 6794 exit_reason.basic != EXIT_REASON_APIC_ACCESS && 6795 exit_reason.basic != EXIT_REASON_TASK_SWITCH && 6796 exit_reason.basic != EXIT_REASON_NOTIFY && 6797 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { 6798 kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); 6799 return 0; 6800 } 6801 6802 if (unlikely(!enable_vnmi && 6803 vmx->loaded_vmcs->soft_vnmi_blocked)) { 6804 if (!vmx_interrupt_blocked(vcpu)) { 6805 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6806 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && 6807 vcpu->arch.nmi_pending) { 6808 /* 6809 * This CPU don't support us in finding the end of an 6810 * NMI-blocked window if the guest runs with IRQs 6811 * disabled. So we pull the trigger after 1 s of 6812 * futile waiting, but inform the user about this. 
6813 */ 6814 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6815 "state on VCPU %d after 1 s timeout\n", 6816 __func__, vcpu->vcpu_id); 6817 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6818 } 6819 } 6820 6821 if (exit_fastpath != EXIT_FASTPATH_NONE) 6822 return 1; 6823 6824 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6825 goto unexpected_vmexit; 6826 #ifdef CONFIG_MITIGATION_RETPOLINE 6827 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6828 return kvm_emulate_wrmsr(vcpu); 6829 else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6830 return handle_wrmsr_imm(vcpu); 6831 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6832 return handle_preemption_timer(vcpu); 6833 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6834 return handle_interrupt_window(vcpu); 6835 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6836 return handle_external_interrupt(vcpu); 6837 else if (exit_reason.basic == EXIT_REASON_HLT) 6838 return kvm_emulate_halt(vcpu); 6839 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6840 return handle_ept_misconfig(vcpu); 6841 #endif 6842 6843 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6844 kvm_vmx_max_exit_handlers); 6845 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6846 goto unexpected_vmexit; 6847 6848 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6849 6850 unexpected_vmexit: 6851 dump_vmcs(vcpu); 6852 kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); 6853 return 0; 6854 } 6855 6856 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6857 { 6858 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6859 6860 /* 6861 * Exit to user space when bus lock detected to inform that there is 6862 * a bus lock in guest. 
6863 */ 6864 if (vmx_get_exit_reason(vcpu).bus_lock_detected) { 6865 if (ret > 0) 6866 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6867 6868 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6869 return 0; 6870 } 6871 return ret; 6872 } 6873 6874 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6875 { 6876 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6877 int tpr_threshold; 6878 6879 if (is_guest_mode(vcpu) && 6880 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6881 return; 6882 6883 guard(vmx_vmcs01)(vcpu); 6884 6885 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6886 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6887 } 6888 6889 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6890 { 6891 struct vcpu_vmx *vmx = to_vmx(vcpu); 6892 u32 sec_exec_control; 6893 6894 if (!lapic_in_kernel(vcpu)) 6895 return; 6896 6897 if (!flexpriority_enabled && 6898 !cpu_has_vmx_virtualize_x2apic_mode()) 6899 return; 6900 6901 guard(vmx_vmcs01)(vcpu); 6902 6903 sec_exec_control = secondary_exec_controls_get(vmx); 6904 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6905 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6906 6907 switch (kvm_get_apic_mode(vcpu)) { 6908 case LAPIC_MODE_INVALID: 6909 WARN_ONCE(true, "Invalid local APIC state"); 6910 break; 6911 case LAPIC_MODE_DISABLED: 6912 break; 6913 case LAPIC_MODE_XAPIC: 6914 if (flexpriority_enabled) { 6915 sec_exec_control |= 6916 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6917 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6918 6919 /* 6920 * Flush the TLB, reloading the APIC access page will 6921 * only do so if its physical address has changed, but 6922 * the guest may have inserted a non-APIC mapping into 6923 * the TLB while the APIC access page was disabled. 6924 * 6925 * If L2 is active, immediately flush L1's TLB instead 6926 * of requesting a flush of the current TLB, because 6927 * the current TLB context is L2's. 
6928 */ 6929 if (!is_guest_mode(vcpu)) 6930 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6931 else if (!enable_ept) 6932 vpid_sync_context(vmx->vpid); 6933 else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa)) 6934 vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa); 6935 } 6936 break; 6937 case LAPIC_MODE_X2APIC: 6938 if (cpu_has_vmx_virtualize_x2apic_mode()) 6939 sec_exec_control |= 6940 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6941 break; 6942 } 6943 secondary_exec_controls_set(vmx, sec_exec_control); 6944 6945 vmx_update_msr_bitmap_x2apic(vcpu); 6946 } 6947 6948 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6949 { 6950 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6951 struct kvm *kvm = vcpu->kvm; 6952 struct kvm_memslots *slots = kvm_memslots(kvm); 6953 struct kvm_memory_slot *slot; 6954 struct page *refcounted_page; 6955 unsigned long mmu_seq; 6956 kvm_pfn_t pfn; 6957 bool writable; 6958 6959 /* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */ 6960 guard(vmx_vmcs01)(vcpu); 6961 6962 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6963 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6964 return; 6965 6966 /* 6967 * Explicitly grab the memslot using KVM's internal slot ID to ensure 6968 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 6969 * be impossible for userspace to create a memslot for the APIC when 6970 * APICv is enabled, but paranoia won't hurt in this case. 6971 */ 6972 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6973 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6974 return; 6975 6976 /* 6977 * Ensure that the mmu_notifier sequence count is read before KVM 6978 * retrieves the pfn from the primary MMU. Note, the memslot is 6979 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6980 * in kvm_mmu_invalidate_end(). 6981 */ 6982 mmu_seq = kvm->mmu_invalidate_seq; 6983 smp_rmb(); 6984 6985 /* 6986 * No need to retry if the memslot does not exist or is invalid. 
KVM 6987 * controls the APIC-access page memslot, and only deletes the memslot 6988 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6989 */ 6990 pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); 6991 if (is_error_noslot_pfn(pfn)) 6992 return; 6993 6994 read_lock(&vcpu->kvm->mmu_lock); 6995 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) 6996 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6997 else 6998 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6999 7000 /* 7001 * Do not pin the APIC access page in memory so that it can be freely 7002 * migrated, the MMU notifier will call us again if it is migrated or 7003 * swapped out. KVM backs the memslot with anonymous memory, the pfn 7004 * should always point at a refcounted page (if the pfn is valid). 7005 */ 7006 if (!WARN_ON_ONCE(!refcounted_page)) 7007 kvm_release_page_clean(refcounted_page); 7008 7009 /* 7010 * No need for a manual TLB flush at this point, KVM has already done a 7011 * flush if there were SPTEs pointing at the previous page. 7012 */ 7013 read_unlock(&vcpu->kvm->mmu_lock); 7014 } 7015 7016 void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 7017 { 7018 u16 status; 7019 u8 old; 7020 7021 if (max_isr == -1) 7022 max_isr = 0; 7023 7024 /* 7025 * Always update SVI in vmcs01, as SVI is only relevant for L2 if and 7026 * only if Virtual Interrupt Delivery is enabled in vmcs12, and if VID 7027 * is enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC. 
*/
	guard(vmx_vmcs01)(vcpu);

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = status >> 8;
	if (max_isr != old) {
		/* Keep the low byte, replace the high byte. */
		status &= 0xff;
		status |= max_isr << 8;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

/*
 * Store @vector in the low byte of GUEST_INTR_STATUS (the RVI portion, per
 * the comment in vmx_sync_pir_to_irr()), writing the field only when the
 * value changes.  A @vector of -1 is stored as 0.
 */
static void vmx_set_rvi(int vector)
{
	u16 status;
	u8 old;

	if (vector == -1)
		vector = 0;

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = (u8)status & 0xff;
	if ((u8)vector != old) {
		/* Keep the high byte, replace the low byte. */
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

/*
 * Sync posted interrupts into the IRR and return max_irr as reported by
 * kvm_apic_update_irr() (PID.ON was set) or kvm_lapic_find_highest_irr()
 * (PID.ON was clear).  Returns -EIO, via KVM_BUG_ON(), if enable_apicv is
 * false.
 *
 * Depending on the vCPU state this also updates RVI or requests
 * KVM_REQ_EVENT; see the block comment in the body for the rules.
 */
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	bool max_irr_is_from_pir;
	int max_irr;

	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
		return -EIO;

	if (pi_test_on(&vt->pi_desc)) {
		pi_clear_on(&vt->pi_desc);
		/*
		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
		max_irr_is_from_pir = kvm_apic_update_irr(vcpu, vt->pi_desc.pir,
							  &max_irr);
	} else {
		max_irr = kvm_lapic_find_highest_irr(vcpu);
		max_irr_is_from_pir = false;
	}

	/*
	 * If APICv is enabled and L2 is not active, then update the Requesting
	 * Virtual Interrupt (RVI) portion of vmcs01.GUEST_INTR_STATUS with the
	 * highest priority IRR to deliver the IRQ via Virtual Interrupt
	 * Delivery.  Note, this is required even if the highest priority IRQ
	 * was already pending in the IRR, as RVI isn't updated in lockstep with
	 * the IRR (unlike apic->irr_pending).
	 *
	 * For the cases where Virtual Interrupt Delivery can't be used:
	 *
	 * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
	 *    wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
	 *    VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
	 *    into L2, but KVM doesn't use virtual interrupt delivery to inject
	 *    interrupts into L2, and so KVM_REQ_EVENT is again needed.
	 *
	 * 2) If APICv is disabled for this vCPU, assigned devices may still
	 *    attempt to post interrupts.  The posted interrupt vector will cause
	 *    a VM-Exit and the subsequent entry will call sync_pir_to_irr.
	 *
	 * In both cases, set KVM_REQ_EVENT if and only if the highest priority
	 * pending IRQ came from the PIR, as setting KVM_REQ_EVENT if any IRQ
	 * is pending may put the vCPU into an infinite loop, e.g. if the IRQ
	 * is blocked, then it will stay pending until an IRQ window is opened.
	 *
	 * Note!  It's possible that one or more IRQs were moved from the PIR
	 * to the IRR _without_ max_irr_is_from_pir being true!  I.e. if there
	 * was a higher priority IRQ already pending in the IRR.  Not setting
	 * KVM_REQ_EVENT in this case is intentional and safe.  If APICv is
	 * inactive, or L2 is running with exit-on-interrupt off (in vmcs12),
	 * i.e. without nested virtual interrupt delivery, then there's no need
	 * to request an IRQ window as the lower priority IRQ only needs to be
	 * delivered when the higher priority IRQ is dismissed from the ISR,
	 * i.e. on the next EOI, and EOIs are always intercepted if APICv is
	 * disabled or if L2 is running without nested VID.  If L2 is running
	 * exit-on-interrupt on (in vmcs12), then the higher priority IRQ will
	 * trigger a nested VM-Exit, at which point KVM will re-evaluate L1's
	 * pending IRQs.
	 */
	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
		vmx_set_rvi(max_irr);
	else if (max_irr_is_from_pir)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return max_irr;
}

/*
 * Write the four 64-bit words of @eoi_exit_bitmap to EOI_EXIT_BITMAP0..3.
 * Does nothing if APICv is not active for the vCPU.
 */
void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}

/*
 * Capture MSR_IA32_XFD_ERR into the guest FPU state when is_xfd_nm_fault()
 * says the #NM is XFD-related.
 */
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
	/*
	 * Save xfd_err to guest_fpu before interrupt is enabled, so the
	 * MSR value is not clobbered by the host activity before the guest
	 * has chance to consume it.
	 *
	 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
	 * interception may have been caused by L1 interception.  Per the SDM,
	 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
	 *
	 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
	 * unlike CR2 and DR6, the value is not a payload that is attached to
	 * the #NM exception.
7152 */ 7153 if (is_xfd_nm_fault(vcpu)) 7154 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 7155 } 7156 7157 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 7158 { 7159 /* if exit due to PF check for async PF */ 7160 if (is_page_fault(intr_info)) 7161 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 7162 /* if exit due to NM, handle before interrupts are enabled */ 7163 else if (is_nm_fault(intr_info)) 7164 handle_nm_fault_irqoff(vcpu); 7165 /* Handle machine checks before interrupts are enabled */ 7166 else if (is_machine_check(intr_info)) 7167 kvm_machine_check(); 7168 } 7169 7170 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 7171 u32 intr_info) 7172 { 7173 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 7174 7175 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 7176 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 7177 return; 7178 7179 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7180 x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7181 kvm_after_interrupt(vcpu); 7182 7183 vcpu->arch.at_instruction_boundary = true; 7184 } 7185 7186 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7187 { 7188 if (to_vt(vcpu)->emulation_required) 7189 return; 7190 7191 switch (vmx_get_exit_reason(vcpu).basic) { 7192 case EXIT_REASON_EXTERNAL_INTERRUPT: 7193 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7194 break; 7195 case EXIT_REASON_EXCEPTION_NMI: 7196 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7197 break; 7198 case EXIT_REASON_MCE_DURING_VMENTRY: 7199 kvm_machine_check(); 7200 break; 7201 default: 7202 break; 7203 } 7204 } 7205 7206 /* 7207 * The kvm parameter can be NULL (module initialization, or invocation before 7208 * VM creation). Be sure to check the kvm parameter before using it. 
*/
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
	switch (index) {
	case MSR_IA32_SMBASE:
		/* Never reported when KVM is built without CONFIG_KVM_SMM. */
		if (!IS_ENABLED(CONFIG_KVM_SMM))
			return false;
		/*
		 * We cannot do SMM unless we can run the guest in big
		 * real mode.
		 */
		return enable_unrestricted_guest || emulate_invalid_guest_state;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		/* The emulated VMX MSR range follows the "nested" setting. */
		return nested;
	case MSR_AMD64_VIRT_SPEC_CTRL:
	case MSR_AMD64_TSC_RATIO:
		/* This is AMD only.  */
		return false;
	default:
		return true;
	}
}

/*
 * Post-exit NMI-blocking bookkeeping.
 *
 * With enable_vnmi: nothing to do if NMIs are already known to be unmasked.
 * Otherwise either set GUEST_INTR_STATE_NMI again (exit interrupt info is
 * valid with UNBLOCK_NMI set, the vector is not #DF, and the IDT-vectoring
 * info is not valid), or refresh nmi_known_unmasked from
 * GUEST_INTERRUPTIBILITY_INFO.
 *
 * Without enable_vnmi: if soft_vnmi_blocked is set, add the time elapsed
 * since entry_time to vnmi_blocked_time.
 */
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;

	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	if (enable_vnmi) {
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;

		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
		 */
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
}

/*
 * Drop KVM's record of any injected NMI, exception or interrupt and, if
 * @idt_vectoring_info has its valid bit set, re-queue the event it describes
 * and request KVM_REQ_EVENT.
 *
 * @instr_len_field and @error_code_field are the VMCS field encodings to
 * read the instruction length (soft exceptions/interrupts) and the error
 * code (when DELIVER_CODE is set) from.
 */
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	/* Always start from a clean slate, even if nothing is re-queued. */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
*/
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_HARD_EXCEPTION: {
		u32 error_code = 0;

		/* The error code field is read only when DELIVER_CODE is set. */
		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
			error_code = vmcs_read32(error_code_field);

		kvm_requeue_exception(vcpu, vector,
				      idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
				      error_code);
		break;
	}
	case INTR_TYPE_SOFT_INTR:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_EXT_INTR:
		/* Third argument tells the queue whether this is a soft interrupt. */
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}

/*
 * Re-queue the event described by vmx->idt_vectoring_info, taking the
 * instruction length from VM_EXIT_INSTRUCTION_LEN and the error code from
 * IDT_VECTORING_ERROR_CODE.
 */
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

/*
 * Re-queue the event currently programmed in VM_ENTRY_INTR_INFO_FIELD (using
 * the VM-entry instruction length and error code fields), then zero
 * VM_ENTRY_INTR_INFO_FIELD.
 */
void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

/*
 * Walk the MSR list returned by perf_guest_get_msrs(): entries whose host and
 * guest values match are passed to clear_atomic_switch_msr(), the rest to
 * add_atomic_switch_msr().  Skipped entirely when the vCPU has a mediated
 * PMU, or when perf returns no list.
 */
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;
	struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);

	if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu))
		return;

	/* Recomputed from scratch on every call. */
	pmu->host_cross_mapped_mask = 0;
	if (pmu->pebs_enable & pmu->global_ctrl)
		intel_pmu_cross_mapped_check(pmu);

	/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL.
*/
	msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
	if (!msrs)
		return;

	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					      msrs[i].host);
}

/*
 * Refresh pmu->global_ctrl when writes to MSR_CORE_PERF_GLOBAL_CTRL are not
 * intercepted.  If cpu_has_save_perf_global_ctrl() is false, the value comes
 * from the MSR's msr_autostore slot and is also written to
 * GUEST_IA32_PERF_GLOBAL_CTRL; otherwise it is read from that VMCS field.
 */
static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
		return;

	if (!cpu_has_save_perf_global_ctrl()) {
		int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore,
						       MSR_CORE_PERF_GLOBAL_CTRL);

		/* A missing autostore slot is unexpected; warn once and bail. */
		if (WARN_ON_ONCE(slot < 0))
			return;

		pmu->global_ctrl = vmx->msr_autostore.val[slot].value;
		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl);
		return;
	}

	pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
}

/*
 * Program VMX_PREEMPTION_TIMER_VALUE:
 *  - 0 when @force_immediate_exit is set;
 *  - otherwise, if hv_deadline_tsc != -1, the TSC distance to the deadline
 *    shifted right by cpu_preemption_timer_multi (0 if the deadline is not
 *    in the future);
 *  - otherwise -1, written only if hv_timer_soft_disabled is not already
 *    set.
 * hv_timer_soft_disabled is updated to match.
 */
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (force_immediate_exit) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (vmx->hv_deadline_tsc != -1) {
		tscl = rdtsc();
		if (vmx->hv_deadline_tsc > tscl)
			/* set_hv_timer ensures the delta fits in 32-bits */
			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
				cpu_preemption_timer_multi);
		else
			delta_tsc = 0;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
	}
}

/*
 * Write HOST_RSP only when @host_rsp differs from the value cached in
 * loaded_vmcs->host_state.rsp, and update the cache.
 */
void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
		vmx->loaded_vmcs->host_state.rsp = host_rsp;
		vmcs_writel(HOST_RSP, host_rsp);
	}
}

/*
 * Dispatch the current exit reason to its fastpath helper, if it has one.
 * Returns the helper's result, or EXIT_FASTPATH_NONE when there is no
 * fastpath for the reason (and, while L2 is active, for everything except
 * preemption-timer exits).
 */
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
					     bool force_immediate_exit)
{
	/*
	 * If L2 is active, some VMX preemption timer exits can be handled in
	 * the fastpath even, all other exits must use the slow path.
	 */
	if (is_guest_mode(vcpu) &&
	    vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
		return EXIT_FASTPATH_NONE;

	switch (vmx_get_exit_reason(vcpu).basic) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_wrmsr(vcpu);
	case EXIT_REASON_MSR_WRITE_IMM:
		return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
						 vmx_get_msr_imm_reg(vcpu));
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
	case EXIT_REASON_HLT:
		return handle_fastpath_hlt(vcpu);
	case EXIT_REASON_INVD:
		return handle_fastpath_invd(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}

/*
 * If the exit reason is EXCEPTION_NMI and the interrupt info describes an
 * NMI, pass NMI_VECTOR to x86_entry_from_kvm(), bracketed by
 * kvm_before_interrupt()/kvm_after_interrupt().  Otherwise do nothing.
 */
noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
{
	if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
	    !is_nmi(vmx_get_intr_info(vcpu)))
		return;

	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
	x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
	kvm_after_interrupt(vcpu);
}

/*
 * The noinstr core of a guest run: everything between
 * guest_state_enter_irqoff() and guest_state_exit_irqoff().
 *
 * On return vmx->fail holds the result of __vmx_vcpu_run().  If it is
 * nonzero the exit reason is set to the 0xdead sentinel and nothing else is
 * read from the VMCS; otherwise the exit reason and (unless the exit is a
 * failed VM-Entry) the IDT-vectoring info are captured, and a pending NMI
 * exit is handled via vmx_handle_nmi().
 */
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					unsigned int flags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	guest_state_enter_irqoff();

	vmx_l1d_flush(vcpu);

	vmx_disable_fb_clear(vmx);

	/* Write CR2 only if it doesn't already hold the guest's value. */
	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, flags);

	vcpu->arch.cr2 = native_read_cr2();
	kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET);

	/* Start from "no vectoring info"; it is filled in below if valid. */
	vmx->idt_vectoring_info = 0;

	vmx_enable_fb_clear(vmx);

	if (unlikely(vmx->fail)) {
		vmx->vt.exit_reason.full = 0xdead;
		goto out;
	}

	vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
	if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
		vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	vmx_handle_nmi(vcpu);

out:
	guest_state_exit_irqoff();
}

/*
 * Prepare the VMCS/host state, run the guest via vmx_vcpu_enter_exit(), and
 * do the immediate post-exit bookkeeping.
 *
 * Returns EXIT_FASTPATH_NONE if emulation is required (the guest is not
 * entered), if VM-Enter failed (vmx->fail), or if the exit is a failed
 * VM-Entry; otherwise returns the result of vmx_exit_handlers_fastpath().
 */
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/*
	 * Don't enter VMX if guest state is invalid, let the exit handler
	 * start emulation until we arrive back to a valid state.  Synthesize a
	 * consistency check VM-Exit due to invalid guest state and bail.
	 */
	if (unlikely(vmx->vt.emulation_required)) {
		vmx->fail = 0;

		vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
		vmx->vt.exit_reason.failed_vmentry = 1;
		kvm_register_mark_available(vcpu, VCPU_REG_EXIT_INFO_1);
		vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
		kvm_register_mark_available(vcpu, VCPU_REG_EXIT_INFO_2);
		vmx->vt.exit_intr_info = 0;
		return EXIT_FASTPATH_NONE;
	}

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* Write PLE_WINDOW only if it was changed since the last write. */
	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
*/
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

	/* Write back RSP/RIP only if they are marked dirty. */
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (kvm_register_is_dirty(vcpu, VCPU_REG_RIP))
		vmcs_writel(GUEST_RIP, vcpu->arch.rip);
	kvm_reset_dirty_registers(vcpu);

	if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
		set_debugreg(vcpu->arch.dr6, 6);

	if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
		vmx_reload_guest_debugctl(vcpu);

	/*
	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
	 * it switches back to the current->mm, which can occur in KVM context
	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
	 * toggles a static key while handling a VM-Exit.
	 */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	/* Same caching scheme for HOST_CR4. */
	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	pt_guest_enter(vmx);

	atomic_switch_perf_msrs(vmx);
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	/*
	 * With the preemption timer, vmx_update_hv_timer() handles
	 * force_immediate_exit; without it, fall back to a reschedule IPI
	 * aimed at vcpu->cpu.
	 */
	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu, force_immediate_exit);
	else if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	kvm_wait_lapic_expire(vcpu);

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_enter_flags(vmx));

	/* All fields are clean at this point */
	if (kvm_is_using_evmcs()) {
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
	}

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vcpu->arch.host_debugctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restore segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	pt_guest_exit(vmx);

	if (is_guest_mode(vcpu)) {
		/*
		 * Track VMLAUNCH/VMRESUME that have made past guest state
		 * checking.
		 */
		if (vcpu->arch.nested_run_pending &&
		    !vmx_get_exit_reason(vcpu).failed_vmentry)
			++vcpu->stat.nested_run;

		vcpu->arch.nested_run_pending = 0;
	}

	/* VM-Enter itself failed: nothing further to process here. */
	if (unlikely(vmx->fail))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
		return EXIT_FASTPATH_NONE;

	vmx->loaded_vmcs->launched = 1;

	vmx_refresh_guest_perf_global_control(vcpu);

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}

/*
 * Release the per-vCPU VMX resources: the PML buffer (if PML is enabled),
 * the VPID, nested state, the loaded VMCS and the ve_info page.
 */
void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
	free_page((unsigned long)vmx->ve_info);
}

/*
 * Allocate and initialize the per-vCPU VMX state.  Returns 0 on success or a
 * negative errno; on failure, whatever was allocated so far is released via
 * the free_vmcs/free_pml/free_vpid labels at the end of the function.
 */
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vmx_uret_msr *tsx_ctrl;
	struct vcpu_vmx *vmx;
	int i, err;

	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest), etc.
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	/* Default every uret MSR mask to all ones. */
	for (i = 0; i < kvm_nr_uret_msrs; ++i)
		vmx->guest_uret_msrs[i].mask = -1ull;
	if (boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
* Keep the host value unchanged to avoid changing CPUID bits
		 * under the host kernel's feet.
		 */
		tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (tsx_ctrl)
			tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	/*
	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
	 * feature only for vmcs01, KVM currently isn't equipped to realize any
	 * performance benefits from enabling it for vmcs02.
	 */
	if (kvm_is_using_evmcs() &&
	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		evmcs->hv_enlightenments_control.msr_bitmap = 1;
	}

	vmx->loaded_vmcs = &vmx->vmcs01;

	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = kvm_alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	/* The calls above may have overwritten err; reset it for alloc_page(). */
	err = -ENOMEM;
	if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct page *page;

		BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);

		/* ve_info must be page aligned. */
		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!page)
			goto free_vmcs;

		vmx->ve_info = page_to_virt(page);
	}

	/*
	 * Publish this vCPU's PI descriptor in the VM's PID table, indexed by
	 * vcpu_id, with the VALID bit set.
	 */
	if (vmx_can_use_ipiv(vcpu))
		WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
			   __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);

	return 0;

	/* Unwind in reverse order of acquisition. */
free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}

/* Warning texts printed (once) by vmx_vm_init(). */
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

/*
 * Per-VM VMX initialization: disable PAUSE exits when ple_gap is 0, emit the
 * one-time L1TF warnings where applicable, and set the dirty-log size when
 * PML is enabled.  Always returns 0.
 */
int vmx_vm_init(struct kvm *kvm)
{
	if (!ple_gap)
		kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);

	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
		case L1TF_MITIGATION_FLUSH_NOWARN:
			/* 'I explicitly don't care' is set */
			break;
		case L1TF_MITIGATION_AUTO:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
		case L1TF_MITIGATION_FULL:
			/*
			 * Warn upon starting the first VM in a potentially
			 * insecure environment.
7793 */ 7794 if (sched_smt_active()) 7795 pr_warn_once(L1TF_MSG_SMT); 7796 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7797 pr_warn_once(L1TF_MSG_L1D); 7798 break; 7799 case L1TF_MITIGATION_FULL_FORCE: 7800 /* Flush is enforced */ 7801 break; 7802 } 7803 } 7804 7805 if (enable_pml) 7806 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; 7807 return 0; 7808 } 7809 7810 static inline bool vmx_ignore_guest_pat(struct kvm *kvm) 7811 { 7812 /* 7813 * Non-coherent DMA devices need the guest to flush CPU properly. 7814 * In that case it is not possible to map all guest RAM as WB, so 7815 * always trust guest PAT. 7816 */ 7817 return !kvm_arch_has_noncoherent_dma(kvm) && 7818 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); 7819 } 7820 7821 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7822 { 7823 /* 7824 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7825 * with cacheable accesses will result in Machine Checks. 7826 */ 7827 if (is_mmio) 7828 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7829 7830 /* Force WB if ignoring guest PAT */ 7831 if (vmx_ignore_guest_pat(vcpu->kvm)) 7832 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7833 7834 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7835 } 7836 7837 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7838 { 7839 /* 7840 * These bits in the secondary execution controls field 7841 * are dynamic, the others are mostly based on the hypervisor 7842 * architecture and the guest's CPUID. Do not touch the 7843 * dynamic bits. 
*/
	u32 mask =
		SECONDARY_EXEC_SHADOW_VMCS |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC;

	u32 cur_ctl = secondary_exec_controls_get(vmx);

	/* Masked bits come from the current value, the rest from new_ctl. */
	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

/*
 * OR _cr4_mask into cr4_fixed1 if the current "entry" exists and has any of
 * _cpuid_mask set in register _reg.  Relies on the local variable "entry",
 * which is re-pointed at a different CPUID leaf before each group below.
 */
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
} while (0)

	/* CPUID leaf 0x1. */
	entry = kvm_find_cpuid_entry(vcpu, 0x1);
	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

	/* CPUID leaf 0x7, index 0. */
	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
	/* CR4.CET is allowed if either SHSTK or IBT is present. */
	cr4_fixed1_update(X86_CR4_CET,        ecx, feature_bit(SHSTK));
	cr4_fixed1_update(X86_CR4_CET,        edx, feature_bit(IBT));

	/* CPUID leaf 0x7, index 1. */
	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
	cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));

#undef cr4_fixed1_update
}

/*
 * Rebuild the vCPU's Intel PT descriptor from the guest's CPUID leaf 0x14:
 * cache the raw register values in pt_desc.caps, derive the number of address
 * ranges, and compute pt_desc.ctl_bitmask by starting from "everything except
 * the unconditional bits" and clearing the bits that each reported capability
 * allows.  Returns early, leaving the remaining fields untouched, if any of
 * the PT_CPUID_LEAVES sub-leaves is missing.
 */
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
		if (!best)
			return;
		/* Each sub-leaf occupies PT_CPUID_REGS_NUM consecutive slots. */
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
			RTIT_CTL_BRANCH_EN);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
	 * will inject an #GP
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
					      RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
							RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* unmask address range configure area */
	/* One 4-bit field per address range, starting at bit 32. */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}

/*
 * Re-derive the VMX state that depends on the guest's CPUID: uret MSRs,
 * secondary execution controls, IA32_FEATURE_CONTROL valid bits, nested CR4
 * fixed-1 bits, Intel PT configuration, TSX_CTRL, the CR4 guest/host mask,
 * the ENCLS bitmap and the exception bitmap.
 */
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * XSAVES is effectively enabled if and only if XSAVE is also exposed
	 * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
	 * set if and only if XSAVE is supported.
7982 */ 7983 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) 7984 guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); 7985 7986 vmx_setup_uret_msrs(vmx); 7987 7988 if (cpu_has_secondary_exec_ctrls()) 7989 vmcs_set_secondary_exec_control(vmx, 7990 vmx_secondary_exec_control(vmx)); 7991 7992 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7993 vmx->msr_ia32_feature_control_valid_bits |= 7994 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7995 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7996 else 7997 vmx->msr_ia32_feature_control_valid_bits &= 7998 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7999 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 8000 8001 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 8002 nested_vmx_cr_fixed1_bits_update(vcpu); 8003 8004 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 8005 guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) 8006 update_intel_pt_cfg(vcpu); 8007 8008 if (boot_cpu_has(X86_FEATURE_RTM)) { 8009 struct vmx_uret_msr *msr; 8010 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 8011 if (msr) { 8012 bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); 8013 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); 8014 } 8015 } 8016 8017 set_cr4_guest_host_mask(vmx); 8018 8019 vmx_write_encls_bitmap(vcpu, NULL); 8020 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) 8021 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 8022 else 8023 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 8024 8025 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 8026 vmx->msr_ia32_feature_control_valid_bits |= 8027 FEAT_CTL_SGX_LC_ENABLED; 8028 else 8029 vmx->msr_ia32_feature_control_valid_bits &= 8030 ~FEAT_CTL_SGX_LC_ENABLED; 8031 8032 /* Refresh #PF interception to account for MAXPHYADDR changes. 
*/ 8033 vmx_update_exception_bitmap(vcpu); 8034 } 8035 8036 static __init u64 vmx_get_perf_capabilities(void) 8037 { 8038 u64 perf_cap = PERF_CAP_FW_WRITES; 8039 u64 host_perf_cap = 0; 8040 8041 if (!enable_pmu) 8042 return 0; 8043 8044 if (boot_cpu_has(X86_FEATURE_PDCM)) 8045 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 8046 8047 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) && 8048 !enable_mediated_pmu) { 8049 x86_perf_get_lbr(&vmx_lbr_caps); 8050 8051 /* 8052 * KVM requires LBR callstack support, as the overhead due to 8053 * context switching LBRs without said support is too high. 8054 * See intel_pmu_create_guest_lbr_event() for more info. 8055 */ 8056 if (!vmx_lbr_caps.has_callstack) 8057 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 8058 else if (vmx_lbr_caps.nr) 8059 perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; 8060 } 8061 8062 if (vmx_pebs_supported()) { 8063 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 8064 8065 /* 8066 * Disallow adaptive PEBS as it is functionally broken, can be 8067 * used by the guest to read *host* LBRs, and can be used to 8068 * bypass userspace event filters. To correctly and safely 8069 * support adaptive PEBS, KVM needs to: 8070 * 8071 * 1. Account for the ADAPTIVE flag when (re)programming fixed 8072 * counters. 8073 * 8074 * 2. Gain support from perf (or take direct control of counter 8075 * programming) to support events without adaptive PEBS 8076 * enabled for the hardware counter. 8077 * 8078 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 8079 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 8080 * 8081 * 4. Document which PMU events are effectively exposed to the 8082 * guest via adaptive PEBS, and make adaptive PEBS mutually 8083 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
8084 */ 8085 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 8086 } 8087 8088 return perf_cap; 8089 } 8090 8091 static __init void vmx_set_cpu_caps(void) 8092 { 8093 kvm_initialize_cpu_caps(); 8094 8095 /* CPUID 0x1 */ 8096 if (nested) 8097 kvm_cpu_cap_set(X86_FEATURE_VMX); 8098 8099 /* CPUID 0x7 */ 8100 if (kvm_mpx_supported()) 8101 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 8102 if (!cpu_has_vmx_invpcid()) 8103 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 8104 if (vmx_pt_mode_is_host_guest()) 8105 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 8106 if (vmx_pebs_supported()) { 8107 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 8108 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 8109 } 8110 8111 if (!enable_pmu) 8112 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 8113 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 8114 8115 if (!enable_sgx) { 8116 kvm_cpu_cap_clear(X86_FEATURE_SGX); 8117 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 8118 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 8119 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 8120 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 8121 } 8122 8123 if (vmx_umip_emulated()) 8124 kvm_cpu_cap_set(X86_FEATURE_UMIP); 8125 8126 /* CPUID 0xD.1 */ 8127 if (!cpu_has_vmx_xsaves()) 8128 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 8129 8130 /* CPUID 0x80000001 and 0x7 (RDPID) */ 8131 if (!cpu_has_vmx_rdtscp()) { 8132 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 8133 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 8134 } 8135 8136 if (cpu_has_vmx_waitpkg()) 8137 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 8138 8139 /* 8140 * Disable CET if unrestricted_guest is unsupported as KVM doesn't 8141 * enforce CET HW behaviors in emulator. On platforms with 8142 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code 8143 * fails, so disable CET in this case too. 
8144 */ 8145 if (!enable_cet || !enable_unrestricted_guest || 8146 !cpu_has_vmx_basic_no_hw_errcode_cc()) { 8147 kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 8148 kvm_cpu_cap_clear(X86_FEATURE_IBT); 8149 } 8150 8151 kvm_setup_xss_caps(); 8152 kvm_finalize_cpu_caps(); 8153 } 8154 8155 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, 8156 struct x86_instruction_info *info, 8157 unsigned long *exit_qualification) 8158 { 8159 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8160 unsigned short port; 8161 int size; 8162 bool imm; 8163 8164 /* 8165 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 8166 * VM-exits depend on the 'unconditional IO exiting' VM-execution 8167 * control. 8168 * 8169 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 8170 */ 8171 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8172 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 8173 8174 if (info->intercept == x86_intercept_in || 8175 info->intercept == x86_intercept_ins) { 8176 port = info->src_val; 8177 size = info->dst_bytes; 8178 imm = info->src_type == OP_IMM; 8179 } else { 8180 port = info->dst_val; 8181 size = info->src_bytes; 8182 imm = info->dst_type == OP_IMM; 8183 } 8184 8185 8186 *exit_qualification = ((unsigned long)port << 16) | (size - 1); 8187 8188 if (info->intercept == x86_intercept_ins || 8189 info->intercept == x86_intercept_outs) 8190 *exit_qualification |= BIT(4); 8191 8192 if (info->rep_prefix) 8193 *exit_qualification |= BIT(5); 8194 8195 if (imm) 8196 *exit_qualification |= BIT(6); 8197 8198 return nested_vmx_check_io_bitmaps(vcpu, port, size); 8199 } 8200 8201 int vmx_check_intercept(struct kvm_vcpu *vcpu, 8202 struct x86_instruction_info *info, 8203 enum x86_intercept_stage stage, 8204 struct x86_exception *exception) 8205 { 8206 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8207 unsigned long exit_qualification = 0; 8208 u32 vm_exit_reason; 8209 u64 exit_insn_len; 8210 8211 switch (info->intercept) { 8212 case 
x86_intercept_rdpid: 8213 /* 8214 * RDPID causes #UD if not enabled through secondary execution 8215 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to 8216 * TSC_AUX is NOT subject to interception, i.e. checking only 8217 * the dedicated execution control is architecturally correct. 8218 */ 8219 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8220 exception->vector = UD_VECTOR; 8221 exception->error_code_valid = false; 8222 return X86EMUL_PROPAGATE_FAULT; 8223 } 8224 return X86EMUL_CONTINUE; 8225 8226 case x86_intercept_in: 8227 case x86_intercept_ins: 8228 case x86_intercept_out: 8229 case x86_intercept_outs: 8230 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 8231 return X86EMUL_CONTINUE; 8232 8233 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 8234 break; 8235 8236 case x86_intercept_lgdt: 8237 case x86_intercept_lidt: 8238 case x86_intercept_lldt: 8239 case x86_intercept_ltr: 8240 case x86_intercept_sgdt: 8241 case x86_intercept_sidt: 8242 case x86_intercept_sldt: 8243 case x86_intercept_str: 8244 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8245 return X86EMUL_CONTINUE; 8246 8247 if (info->intercept == x86_intercept_lldt || 8248 info->intercept == x86_intercept_ltr || 8249 info->intercept == x86_intercept_sldt || 8250 info->intercept == x86_intercept_str) 8251 vm_exit_reason = EXIT_REASON_LDTR_TR; 8252 else 8253 vm_exit_reason = EXIT_REASON_GDTR_IDTR; 8254 /* 8255 * FIXME: Decode the ModR/M to generate the correct exit 8256 * qualification for memory operands. 8257 */ 8258 break; 8259 8260 case x86_intercept_hlt: 8261 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 8262 return X86EMUL_CONTINUE; 8263 8264 vm_exit_reason = EXIT_REASON_HLT; 8265 break; 8266 8267 case x86_intercept_pause: 8268 /* 8269 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8270 * with vanilla NOPs in the emulator. Apply the interception 8271 * check only to actual PAUSE instructions. 
Don't check 8272 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8273 * exit, i.e. KVM is within its rights to allow L2 to execute 8274 * the PAUSE. 8275 */ 8276 if ((info->rep_prefix != REPE_PREFIX) || 8277 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) 8278 return X86EMUL_CONTINUE; 8279 8280 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; 8281 break; 8282 8283 /* TODO: check more intercepts... */ 8284 default: 8285 return X86EMUL_UNHANDLEABLE; 8286 } 8287 8288 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); 8289 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) 8290 return X86EMUL_UNHANDLEABLE; 8291 8292 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, 8293 exit_insn_len); 8294 return X86EMUL_INTERCEPTED; 8295 } 8296 8297 #ifdef CONFIG_X86_64 8298 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8299 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8300 u64 divisor, u64 *result) 8301 { 8302 u64 low = a << shift, high = a >> (64 - shift); 8303 8304 /* To avoid the overflow on divq */ 8305 if (high >= divisor) 8306 return 1; 8307 8308 /* Low hold the result, high hold rem which is discarded */ 8309 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8310 "rm" (divisor), "0" (low), "1" (high)); 8311 *result = low; 8312 8313 return 0; 8314 } 8315 8316 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8317 bool *expired) 8318 { 8319 struct vcpu_vmx *vmx; 8320 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8321 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8322 8323 vmx = to_vmx(vcpu); 8324 tscl = rdtsc(); 8325 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8326 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8327 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8328 ktimer->timer_advance_ns); 8329 8330 if (delta_tsc > lapic_timer_advance_cycles) 8331 delta_tsc -= lapic_timer_advance_cycles; 8332 else 8333 delta_tsc = 0; 8334 8335 /* 
Convert to host delta tsc if tsc scaling is enabled */ 8336 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8337 delta_tsc && u64_shl_div_u64(delta_tsc, 8338 kvm_caps.tsc_scaling_ratio_frac_bits, 8339 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8340 return -ERANGE; 8341 8342 /* 8343 * If the delta tsc can't fit in the 32 bit after the multi shift, 8344 * we can't use the preemption timer. 8345 * It's possible that it fits on later vmentries, but checking 8346 * on every vmentry is costly so we just use an hrtimer. 8347 */ 8348 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8349 return -ERANGE; 8350 8351 vmx->hv_deadline_tsc = tscl + delta_tsc; 8352 *expired = !delta_tsc; 8353 return 0; 8354 } 8355 8356 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8357 { 8358 to_vmx(vcpu)->hv_deadline_tsc = -1; 8359 } 8360 #endif 8361 8362 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8363 { 8364 struct vcpu_vmx *vmx = to_vmx(vcpu); 8365 8366 if (WARN_ON_ONCE(!enable_pml)) 8367 return; 8368 8369 guard(vmx_vmcs01)(vcpu); 8370 8371 /* 8372 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8373 * code, but in that case another update request will be made and so 8374 * the guest will never run with a stale PML value. 
8375 */ 8376 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8377 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8378 else 8379 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8380 } 8381 8382 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8383 { 8384 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8385 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8386 FEAT_CTL_LMCE_ENABLED; 8387 else 8388 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8389 ~FEAT_CTL_LMCE_ENABLED; 8390 } 8391 8392 #ifdef CONFIG_KVM_SMM 8393 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8394 { 8395 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8396 if (vcpu->arch.nested_run_pending) 8397 return -EBUSY; 8398 return !is_smm(vcpu); 8399 } 8400 8401 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8402 { 8403 struct vcpu_vmx *vmx = to_vmx(vcpu); 8404 8405 /* 8406 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8407 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8408 * SMI and RSM only modify state that is saved and restored via SMRAM. 8409 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8410 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 8411 */ 8412 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8413 if (vmx->nested.smm.guest_mode) 8414 nested_vmx_vmexit(vcpu, -1, 0, 0); 8415 8416 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8417 vmx->nested.vmxon = false; 8418 vmx_clear_hlt(vcpu); 8419 return 0; 8420 } 8421 8422 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8423 { 8424 struct vcpu_vmx *vmx = to_vmx(vcpu); 8425 int ret; 8426 8427 if (vmx->nested.smm.vmxon) { 8428 vmx->nested.vmxon = true; 8429 vmx->nested.smm.vmxon = false; 8430 } 8431 8432 if (vmx->nested.smm.guest_mode) { 8433 /* Triple fault if the state is invalid. 
*/ 8434 if (nested_vmx_check_restored_vmcs12(vcpu) < 0) 8435 return 1; 8436 8437 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8438 if (ret != NVMX_VMENTRY_SUCCESS) 8439 return 1; 8440 8441 vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; 8442 vmx->nested.smm.guest_mode = false; 8443 } 8444 return 0; 8445 } 8446 8447 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8448 { 8449 /* RSM will cause a vmexit anyway. */ 8450 } 8451 #endif 8452 8453 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8454 { 8455 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8456 } 8457 8458 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8459 { 8460 if (is_guest_mode(vcpu)) { 8461 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8462 8463 if (hrtimer_try_to_cancel(timer) == 1) 8464 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8465 } 8466 } 8467 8468 void vmx_hardware_unsetup(void) 8469 { 8470 kvm_set_posted_intr_wakeup_handler(NULL); 8471 8472 if (nested) 8473 nested_vmx_hardware_unsetup(); 8474 } 8475 8476 void vmx_vm_destroy(struct kvm *kvm) 8477 { 8478 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8479 8480 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8481 } 8482 8483 /* 8484 * Note, the SDM states that the linear address is masked *after* the modified 8485 * canonicality check, whereas KVM masks (untags) the address and then performs 8486 * a "normal" canonicality check. Functionally, the two methods are identical, 8487 * and when the masking occurs relative to the canonicality check isn't visible 8488 * to software, i.e. KVM's behavior doesn't violate the SDM. 
8489 */ 8490 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8491 { 8492 int lam_bit; 8493 unsigned long cr3_bits; 8494 8495 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8496 return gva; 8497 8498 if (!is_64_bit_mode(vcpu)) 8499 return gva; 8500 8501 /* 8502 * Bit 63 determines if the address should be treated as user address 8503 * or a supervisor address. 8504 */ 8505 if (!(gva & BIT_ULL(63))) { 8506 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8507 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8508 return gva; 8509 8510 /* LAM_U48 is ignored if LAM_U57 is set. */ 8511 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8512 } else { 8513 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8514 return gva; 8515 8516 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8517 } 8518 8519 /* 8520 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8521 * Bit 63 is retained from the raw virtual address so that untagging 8522 * doesn't change a user access to a supervisor access, and vice versa. 8523 */ 8524 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8525 } 8526 8527 static unsigned int vmx_handle_intel_pt_intr(void) 8528 { 8529 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8530 8531 /* '0' on failure so that the !PT case can use a RET0 static call. */ 8532 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8533 return 0; 8534 8535 kvm_make_request(KVM_REQ_PMI, vcpu); 8536 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8537 (unsigned long *)&vcpu->arch.pmu.global_status); 8538 return 1; 8539 } 8540 8541 static __init void vmx_setup_user_return_msrs(void) 8542 { 8543 8544 /* 8545 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8546 * will emulate SYSCALL in legacy mode if the vendor string in guest 8547 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" 
To 8548 * support this emulation, MSR_STAR is included in the list for i386, 8549 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8550 * into hardware and is here purely for emulation purposes. 8551 */ 8552 const u32 vmx_uret_msrs_list[] = { 8553 #ifdef CONFIG_X86_64 8554 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8555 #endif 8556 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8557 MSR_IA32_TSX_CTRL, 8558 }; 8559 int i; 8560 8561 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8562 8563 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8564 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8565 } 8566 8567 static void __init vmx_setup_me_spte_mask(void) 8568 { 8569 u64 me_mask = 0; 8570 8571 /* 8572 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8573 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8574 * boot_cpu_data.x86_phys_bits holds the actual physical address 8575 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8576 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8577 */ 8578 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8579 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8580 kvm_host.maxphyaddr - 1); 8581 8582 /* 8583 * Unlike SME, host kernel doesn't support setting up any 8584 * MKTME KeyID on Intel platforms. No memory encryption 8585 * bits should be included into the SPTE. 
8586 */ 8587 kvm_mmu_set_me_spte_mask(0, me_mask); 8588 } 8589 8590 __init int vmx_hardware_setup(void) 8591 { 8592 unsigned long host_bndcfgs; 8593 struct desc_ptr dt; 8594 int r; 8595 8596 store_idt(&dt); 8597 host_idt_base = dt.address; 8598 8599 vmx_setup_user_return_msrs(); 8600 8601 if (boot_cpu_has(X86_FEATURE_MPX)) { 8602 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); 8603 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8604 } 8605 8606 if (!cpu_has_vmx_mpx()) 8607 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8608 XFEATURE_MASK_BNDCSR); 8609 8610 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8611 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8612 enable_vpid = 0; 8613 8614 if (!cpu_has_vmx_ept() || 8615 !cpu_has_vmx_ept_4levels() || 8616 !cpu_has_vmx_ept_mt_wb() || 8617 !cpu_has_vmx_invept_global()) 8618 enable_ept = 0; 8619 8620 if (!cpu_has_load_cet_ctrl()) 8621 enable_cet = 0; 8622 8623 /* NX support is required for shadow paging. */ 8624 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8625 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8626 return -EOPNOTSUPP; 8627 } 8628 8629 /* 8630 * Shadow paging doesn't have a (further) performance penalty 8631 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8632 * by default 8633 */ 8634 if (!enable_ept) 8635 allow_smaller_maxphyaddr = true; 8636 8637 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8638 enable_ept_ad_bits = 0; 8639 if (!cpu_has_ept_mbec() || !enable_ept) 8640 enable_mbec = 0; 8641 8642 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8643 enable_unrestricted_guest = 0; 8644 8645 if (!cpu_has_vmx_flexpriority()) 8646 flexpriority_enabled = 0; 8647 8648 if (!cpu_has_virtual_nmis()) 8649 enable_vnmi = 0; 8650 8651 #ifdef CONFIG_X86_SGX_KVM 8652 if (!cpu_has_vmx_encls_vmexit()) 8653 enable_sgx = false; 8654 #endif 8655 8656 /* 8657 * set_apic_access_page_addr() is used to reload apic access 8658 * page upon invalidation. 
No need to do anything if not 8659 * using the APIC_ACCESS_ADDR VMCS field. 8660 */ 8661 if (!flexpriority_enabled) 8662 vt_x86_ops.set_apic_access_page_addr = NULL; 8663 8664 if (!cpu_has_vmx_tpr_shadow()) 8665 vt_x86_ops.update_cr8_intercept = NULL; 8666 8667 #if IS_ENABLED(CONFIG_HYPERV) 8668 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8669 && enable_ept) { 8670 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8671 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8672 } 8673 #endif 8674 8675 if (!cpu_has_vmx_ple()) { 8676 ple_gap = 0; 8677 ple_window = 0; 8678 ple_window_grow = 0; 8679 ple_window_max = 0; 8680 ple_window_shrink = 0; 8681 } 8682 8683 if (!cpu_has_vmx_apicv()) 8684 enable_apicv = 0; 8685 if (!enable_apicv) 8686 vt_x86_ops.sync_pir_to_irr = NULL; 8687 8688 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8689 enable_ipiv = false; 8690 8691 if (cpu_has_vmx_tsc_scaling()) 8692 kvm_caps.has_tsc_control = true; 8693 8694 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8695 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8696 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8697 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8698 8699 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8700 8701 if (enable_ept) 8702 kvm_mmu_set_ept_masks(enable_ept_ad_bits); 8703 else 8704 vt_x86_ops.get_mt_mask = NULL; 8705 8706 /* 8707 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8708 * bits to shadow_zero_check. 8709 */ 8710 vmx_setup_me_spte_mask(); 8711 8712 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8713 ept_caps_to_lpage_level(vmx_capability.ept)); 8714 8715 /* 8716 * Only enable PML when hardware supports PML feature, and both EPT 8717 * and EPT A/D bit features are enabled -- PML depends on them to work. 
8718 */ 8719 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8720 enable_pml = 0; 8721 8722 if (!cpu_has_vmx_preemption_timer()) 8723 enable_preemption_timer = false; 8724 8725 if (enable_preemption_timer) { 8726 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8727 8728 cpu_preemption_timer_multi = 8729 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8730 8731 if (tsc_khz) 8732 use_timer_freq = (u64)tsc_khz * 1000; 8733 use_timer_freq >>= cpu_preemption_timer_multi; 8734 8735 /* 8736 * KVM "disables" the preemption timer by setting it to its max 8737 * value. Don't use the timer if it might cause spurious exits 8738 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8739 */ 8740 if (use_timer_freq > 0xffffffffu / 10) 8741 enable_preemption_timer = false; 8742 } 8743 8744 if (!enable_preemption_timer) { 8745 vt_x86_ops.set_hv_timer = NULL; 8746 vt_x86_ops.cancel_hv_timer = NULL; 8747 } 8748 8749 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8750 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8751 8752 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8753 return -EINVAL; 8754 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8755 pt_mode = PT_MODE_SYSTEM; 8756 if (pt_mode == PT_MODE_HOST_GUEST) 8757 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8758 else 8759 vt_init_ops.handle_intel_pt_intr = NULL; 8760 8761 setup_default_sgx_lepubkeyhash(); 8762 8763 vmx_set_cpu_caps(); 8764 8765 /* 8766 * Configure nested capabilities after core CPU capabilities so that 8767 * nested support can be conditional on base support, e.g. so that KVM 8768 * can hide/show features based on kvm_cpu_cap_has(). 8769 */ 8770 if (nested) { 8771 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8772 if (r) 8773 return r; 8774 } 8775 8776 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8777 8778 /* 8779 * On Intel CPUs that lack self-snoop feature, letting the guest control 8780 * memory types may result in unexpected behavior. 
So always ignore guest 8781 * PAT on those CPUs and map VM as writeback, not allowing userspace to 8782 * disable the quirk. 8783 * 8784 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is 8785 * supported, UC is slow enough to cause issues with some older guests (e.g. 8786 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to 8787 * map the video RAM, causing wayland desktop to fail to get started 8788 * correctly). To avoid breaking those older guests that rely on KVM to force 8789 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the 8790 * safer (for performance) default behavior. 8791 * 8792 * On top of this, non-coherent DMA devices need the guest to flush CPU 8793 * caches properly. This also requires honoring guest PAT, and is forced 8794 * independent of the quirk in vmx_ignore_guest_pat(). 8795 */ 8796 if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) 8797 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8798 8799 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8800 8801 return 0; 8802 } 8803 8804 void vmx_exit(void) 8805 { 8806 allow_smaller_maxphyaddr = false; 8807 8808 vmx_cleanup_l1d_flush(); 8809 8810 kvm_x86_vendor_exit(); 8811 } 8812 8813 int __init vmx_init(void) 8814 { 8815 int r, cpu; 8816 8817 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); 8818 8819 if (!kvm_is_vmx_supported()) 8820 return -EOPNOTSUPP; 8821 8822 /* 8823 * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, 8824 * i.e. there's nothing to unwind if a later step fails. 8825 */ 8826 hv_init_evmcs(); 8827 8828 /* 8829 * Parse the VMCS config and VMX capabilities before anything else, so 8830 * that the information is available to all setup flows. 8831 */ 8832 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8833 return -EIO; 8834 8835 r = kvm_x86_vendor_init(&vt_init_ops); 8836 if (r) 8837 return r; 8838 8839 /* Must be called after common x86 init so enable_ept is setup. 
*/ 8840 r = vmx_setup_l1d_flush(); 8841 if (r) 8842 goto err_l1d_flush; 8843 8844 for_each_possible_cpu(cpu) { 8845 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8846 8847 pi_init_cpu(cpu); 8848 } 8849 8850 vmx_check_vmcs12_offsets(); 8851 8852 return 0; 8853 8854 err_l1d_flush: 8855 kvm_x86_vendor_exit(); 8856 return r; 8857 } 8858