1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 32 #include <asm/apic.h> 33 #include <asm/asm.h> 34 #include <asm/cpu.h> 35 #include <asm/cpu_device_id.h> 36 #include <asm/debugreg.h> 37 #include <asm/desc.h> 38 #include <asm/fpu/api.h> 39 #include <asm/fpu/xstate.h> 40 #include <asm/fred.h> 41 #include <asm/idtentry.h> 42 #include <asm/io.h> 43 #include <asm/irq_remapping.h> 44 #include <asm/reboot.h> 45 #include <asm/perf_event.h> 46 #include <asm/mmu_context.h> 47 #include <asm/mshyperv.h> 48 #include <asm/msr.h> 49 #include <asm/mwait.h> 50 #include <asm/spec-ctrl.h> 51 #include <asm/vmx.h> 52 53 #include <trace/events/ipi.h> 54 55 #include "capabilities.h" 56 #include "common.h" 57 #include "cpuid.h" 58 #include "hyperv.h" 59 #include "kvm_onhyperv.h" 60 #include "irq.h" 61 #include "kvm_cache_regs.h" 62 #include "lapic.h" 63 #include "mmu.h" 64 #include "nested.h" 65 #include "pmu.h" 66 #include "sgx.h" 67 #include "trace.h" 68 #include "vmcs.h" 69 #include "vmcs12.h" 70 #include "vmx.h" 71 #include "x86.h" 72 #include "x86_ops.h" 73 #include "smm.h" 74 #include "vmx_onhyperv.h" 75 #include "posted_intr.h" 76 77 #include "mmu/spte.h" 78 79 MODULE_AUTHOR("Qumranet"); 80 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); 81 MODULE_LICENSE("GPL"); 82 83 #ifdef MODULE 84 static const struct x86_cpu_id vmx_cpu_id[] = { 85 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 86 {} 87 }; 88 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 89 #endif 90 91 bool __read_mostly enable_vpid = 1; 92 module_param_named(vpid, enable_vpid, bool, 0444); 93 94 static bool __read_mostly enable_vnmi = 1; 95 module_param_named(vnmi, enable_vnmi, bool, 0444); 96 97 bool __read_mostly flexpriority_enabled = 1; 98 module_param_named(flexpriority, flexpriority_enabled, bool, 0444); 99 100 bool __read_mostly enable_ept = 1; 101 module_param_named(ept, enable_ept, bool, 0444); 102 103 bool __read_mostly enable_unrestricted_guest = 1; 104 module_param_named(unrestricted_guest, 105 enable_unrestricted_guest, bool, 0444); 106 107 bool __read_mostly enable_ept_ad_bits = 1; 108 module_param_named(eptad, enable_ept_ad_bits, bool, 0444); 109 110 static bool __read_mostly emulate_invalid_guest_state = true; 111 module_param(emulate_invalid_guest_state, bool, 0444); 112 113 static bool __read_mostly fasteoi = 1; 114 module_param(fasteoi, bool, 0444); 115 116 module_param(enable_apicv, bool, 0444); 117 module_param(enable_ipiv, bool, 0444); 118 119 module_param(enable_device_posted_irqs, bool, 0444); 120 121 /* 122 * If nested=1, nested virtualization is supported, i.e., guests may use 123 * VMX and be a hypervisor for its own guests. 
If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

module_param(enable_mediated_pmu, bool, 0444);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop.  Also indicates whether PLE is
 *             enabled.  According to testing, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop.  Tests indicate that most spinlocks are held
 *             for less than 2^12 cycles.
 * Time is measured with a counter that runs at the same rate as the TSC;
 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow.
*/ 197 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 198 module_param(ple_window_max, uint, 0444); 199 200 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 201 int __read_mostly pt_mode = PT_MODE_SYSTEM; 202 #ifdef CONFIG_BROKEN 203 module_param(pt_mode, int, S_IRUGO); 204 #endif 205 206 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 207 208 #ifdef CONFIG_CPU_MITIGATIONS 209 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 210 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 211 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 212 213 /* Storage for pre module init parameter parsing */ 214 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 215 216 static const struct { 217 const char *option; 218 bool for_parse; 219 } vmentry_l1d_param[] = { 220 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 221 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 222 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 223 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 224 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 225 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 226 }; 227 228 #define L1D_CACHE_ORDER 4 229 static void *vmx_l1d_flush_pages; 230 231 static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 232 { 233 struct page *page; 234 unsigned int i; 235 236 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 237 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 238 return 0; 239 } 240 241 if (!enable_ept) { 242 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 243 return 0; 244 } 245 246 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 247 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 248 return 0; 249 } 250 251 /* If set to auto use the default l1tf mitigation method */ 252 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 253 switch (l1tf_mitigation) { 254 case L1TF_MITIGATION_OFF: 255 l1tf = VMENTER_L1D_FLUSH_NEVER; 256 break; 257 case L1TF_MITIGATION_AUTO: 258 case L1TF_MITIGATION_FLUSH_NOWARN: 259 case L1TF_MITIGATION_FLUSH: 260 case L1TF_MITIGATION_FLUSH_NOSMT: 261 l1tf = VMENTER_L1D_FLUSH_COND; 262 break; 263 case L1TF_MITIGATION_FULL: 264 case L1TF_MITIGATION_FULL_FORCE: 265 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 266 break; 267 } 268 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 269 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 270 } 271 272 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 273 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 274 /* 275 * This allocation for vmx_l1d_flush_pages is not tied to a VM 276 * lifetime and so should not be charged to a memcg. 277 */ 278 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 279 if (!page) 280 return -ENOMEM; 281 vmx_l1d_flush_pages = page_address(page); 282 283 /* 284 * Initialize each page with a different pattern in 285 * order to protect against KSM in the nested 286 * virtualization case. 
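		 * If the pages were identical, KSM running in the L0 hypervisor
		 * could merge them, and the software flush in vmx_l1d_flush()
		 * would then touch far fewer distinct physical cache lines than
		 * intended.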
287 */ 288 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 289 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 290 PAGE_SIZE); 291 } 292 } 293 294 l1tf_vmx_mitigation = l1tf; 295 296 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 297 static_branch_enable(&vmx_l1d_should_flush); 298 else 299 static_branch_disable(&vmx_l1d_should_flush); 300 301 if (l1tf == VMENTER_L1D_FLUSH_COND) 302 static_branch_enable(&vmx_l1d_flush_cond); 303 else 304 static_branch_disable(&vmx_l1d_flush_cond); 305 return 0; 306 } 307 308 static int vmx_setup_l1d_flush(void) 309 { 310 /* 311 * Hand the parameter mitigation value in which was stored in the pre 312 * module init parser. If no parameter was given, it will contain 313 * 'auto' which will be turned into the default 'cond' mitigation mode. 314 */ 315 return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); 316 } 317 318 static void vmx_cleanup_l1d_flush(void) 319 { 320 if (vmx_l1d_flush_pages) { 321 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 322 vmx_l1d_flush_pages = NULL; 323 } 324 /* Restore state so sysfs ignores VMX */ 325 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 326 } 327 328 static int vmentry_l1d_flush_parse(const char *s) 329 { 330 unsigned int i; 331 332 if (s) { 333 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 334 if (vmentry_l1d_param[i].for_parse && 335 sysfs_streq(s, vmentry_l1d_param[i].option)) 336 return i; 337 } 338 } 339 return -EINVAL; 340 } 341 342 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 343 { 344 int l1tf, ret; 345 346 l1tf = vmentry_l1d_flush_parse(s); 347 if (l1tf < 0) 348 return l1tf; 349 350 if (!boot_cpu_has(X86_BUG_L1TF)) 351 return 0; 352 353 /* 354 * Has vmx_init() run already? If not then this is the pre init 355 * parameter parsing. In that case just store the value and let 356 * vmx_init() do the proper setup after enable_ept has been 357 * established. 358 */ 359 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 360 vmentry_l1d_flush_param = l1tf; 361 return 0; 362 } 363 364 mutex_lock(&vmx_l1d_flush_mutex); 365 ret = __vmx_setup_l1d_flush(l1tf); 366 mutex_unlock(&vmx_l1d_flush_mutex); 367 return ret; 368 } 369 370 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 371 { 372 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 373 return sysfs_emit(s, "???\n"); 374 375 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 376 } 377 378 /* 379 * Software based L1D cache flush which is used when microcode providing 380 * the cache control MSR is not loaded. 381 * 382 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 383 * flush it is required to read in 64 KiB because the replacement algorithm 384 * is not exactly LRU. This could be sized at runtime via topology 385 * information but as all relevant affected CPUs have 32KiB L1D cache size 386 * there is no point in doing so. 387 */ 388 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 389 { 390 int size = PAGE_SIZE << L1D_CACHE_ORDER; 391 392 if (!static_branch_unlikely(&vmx_l1d_should_flush)) 393 return; 394 395 /* 396 * This code is only executed when the flush mode is 'cond' or 397 * 'always' 398 */ 399 if (static_branch_likely(&vmx_l1d_flush_cond)) { 400 /* 401 * Clear the per-cpu flush bit, it gets set again if the vCPU 402 * is reloaded, i.e. if the vCPU is scheduled out or if KVM 403 * exits to userspace, or if KVM reaches one of the unsafe 404 * VMEXIT handlers, e.g. 
if KVM calls into the emulator, 405 * or from the interrupt handlers. 406 */ 407 if (!kvm_get_cpu_l1tf_flush_l1d()) 408 return; 409 kvm_clear_cpu_l1tf_flush_l1d(); 410 } 411 412 vcpu->stat.l1d_flush++; 413 414 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 415 native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 416 return; 417 } 418 419 asm volatile( 420 /* First ensure the pages are in the TLB */ 421 "xorl %%eax, %%eax\n" 422 ".Lpopulate_tlb:\n\t" 423 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 424 "addl $4096, %%eax\n\t" 425 "cmpl %%eax, %[size]\n\t" 426 "jne .Lpopulate_tlb\n\t" 427 "xorl %%eax, %%eax\n\t" 428 "cpuid\n\t" 429 /* Now fill the cache */ 430 "xorl %%eax, %%eax\n" 431 ".Lfill_cache:\n" 432 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 433 "addl $64, %%eax\n\t" 434 "cmpl %%eax, %[size]\n\t" 435 "jne .Lfill_cache\n\t" 436 "lfence\n" 437 :: [flush_pages] "r" (vmx_l1d_flush_pages), 438 [size] "r" (size) 439 : "eax", "ebx", "ecx", "edx"); 440 } 441 442 #else /* CONFIG_CPU_MITIGATIONS*/ 443 static int vmx_setup_l1d_flush(void) 444 { 445 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; 446 return 0; 447 } 448 static void vmx_cleanup_l1d_flush(void) 449 { 450 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 451 } 452 static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) 453 { 454 455 } 456 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 457 { 458 pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); 459 return 0; 460 } 461 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 462 { 463 return sysfs_emit(s, "never\n"); 464 } 465 #endif 466 467 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 468 .set = vmentry_l1d_flush_set, 469 .get = vmentry_l1d_flush_get, 470 }; 471 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 472 473 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 474 { 475 u64 msr; 476 477 if (!vmx->disable_fb_clear) 478 return; 479 480 msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); 481 msr |= FB_CLEAR_DIS; 482 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); 483 /* Cache the MSR value to avoid reading it later */ 484 vmx->msr_ia32_mcu_opt_ctrl = msr; 485 } 486 487 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 488 { 489 if (!vmx->disable_fb_clear) 490 return; 491 492 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 493 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 494 } 495 496 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 497 { 498 /* 499 * Disable VERW's behavior of clearing CPU buffers for the guest if the 500 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 501 * the mitigation. Disabling the clearing behavior provides a 502 * performance boost for guests that aren't aware that manually clearing 503 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 504 * and VM-Exit. 505 */ 506 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 507 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 508 !boot_cpu_has_bug(X86_BUG_MDS) && 509 !boot_cpu_has_bug(X86_BUG_TAA); 510 511 /* 512 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 513 * at VMEntry. Skip the MSR read/write when a guest has no use case to 514 * execute VERW. 
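	 * Both cases below leave the clearing behavior enabled: either the
	 * guest relies on VERW (it enumerates FB_CLEAR), or it enumerates all
	 * of the *_NO bits and therefore has no reason to execute VERW at all.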
515 */ 516 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 517 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 518 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 519 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 520 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 521 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 522 vmx->disable_fb_clear = false; 523 } 524 525 static u32 vmx_segment_access_rights(struct kvm_segment *var); 526 527 void vmx_vmexit(void); 528 529 #define vmx_insn_failed(fmt...) \ 530 do { \ 531 WARN_ONCE(1, fmt); \ 532 pr_warn_ratelimited(fmt); \ 533 } while (0) 534 535 noinline void vmread_error(unsigned long field) 536 { 537 vmx_insn_failed("vmread failed: field=%lx\n", field); 538 } 539 540 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 541 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 542 { 543 if (fault) { 544 kvm_spurious_fault(); 545 } else { 546 instrumentation_begin(); 547 vmread_error(field); 548 instrumentation_end(); 549 } 550 } 551 #endif 552 553 noinline void vmwrite_error(unsigned long field, unsigned long value) 554 { 555 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 556 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 557 } 558 559 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 560 { 561 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 562 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 563 } 564 565 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 566 { 567 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 568 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 569 } 570 571 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 572 { 573 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 574 ext, vpid, gva); 575 } 576 577 noinline void invept_error(unsigned long ext, u64 eptp) 578 { 579 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 580 } 581 582 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 583 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 584 /* 585 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 586 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
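 * vmx_emergency_disable_virtualization_cpu() walks this list on reboot/crash,
 * and __loaded_vmcs_clear() unlinks a VMCS when it migrates to another pCPU.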
587 */ 588 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 589 590 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 591 static DEFINE_SPINLOCK(vmx_vpid_lock); 592 593 struct vmcs_config vmcs_config __ro_after_init; 594 struct vmx_capability vmx_capability __ro_after_init; 595 596 #define VMX_SEGMENT_FIELD(seg) \ 597 [VCPU_SREG_##seg] = { \ 598 .selector = GUEST_##seg##_SELECTOR, \ 599 .base = GUEST_##seg##_BASE, \ 600 .limit = GUEST_##seg##_LIMIT, \ 601 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 602 } 603 604 static const struct kvm_vmx_segment_field { 605 unsigned selector; 606 unsigned base; 607 unsigned limit; 608 unsigned ar_bytes; 609 } kvm_vmx_segment_fields[] = { 610 VMX_SEGMENT_FIELD(CS), 611 VMX_SEGMENT_FIELD(DS), 612 VMX_SEGMENT_FIELD(ES), 613 VMX_SEGMENT_FIELD(FS), 614 VMX_SEGMENT_FIELD(GS), 615 VMX_SEGMENT_FIELD(SS), 616 VMX_SEGMENT_FIELD(TR), 617 VMX_SEGMENT_FIELD(LDTR), 618 }; 619 620 621 static unsigned long host_idt_base; 622 623 #if IS_ENABLED(CONFIG_HYPERV) 624 static bool __read_mostly enlightened_vmcs = true; 625 module_param(enlightened_vmcs, bool, 0444); 626 627 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 628 { 629 struct hv_enlightened_vmcs *evmcs; 630 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 631 632 if (partition_assist_page == INVALID_PAGE) 633 return -ENOMEM; 634 635 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 636 637 evmcs->partition_assist_page = partition_assist_page; 638 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 639 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 640 641 return 0; 642 } 643 644 static __init void hv_init_evmcs(void) 645 { 646 int cpu; 647 648 if (!enlightened_vmcs) 649 return; 650 651 /* 652 * Enlightened VMCS usage should be recommended and the host needs 653 * to support eVMCS v1 or above. 654 */ 655 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 656 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 657 KVM_EVMCS_VERSION) { 658 659 /* Check that we have assist pages on all online CPUs */ 660 for_each_online_cpu(cpu) { 661 if (!hv_get_vp_assist_page(cpu)) { 662 enlightened_vmcs = false; 663 break; 664 } 665 } 666 667 if (enlightened_vmcs) { 668 pr_info("Using Hyper-V Enlightened VMCS\n"); 669 static_branch_enable(&__kvm_is_using_evmcs); 670 } 671 672 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 673 vt_x86_ops.enable_l2_tlb_flush 674 = hv_enable_l2_tlb_flush; 675 } else { 676 enlightened_vmcs = false; 677 } 678 } 679 680 static void hv_reset_evmcs(void) 681 { 682 struct hv_vp_assist_page *vp_ap; 683 684 if (!kvm_is_using_evmcs()) 685 return; 686 687 /* 688 * KVM should enable eVMCS if and only if all CPUs have a VP assist 689 * page, and should reject CPU onlining if eVMCS is enabled the CPU 690 * doesn't have a VP assist page allocated. 691 */ 692 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 693 if (WARN_ON_ONCE(!vp_ap)) 694 return; 695 696 /* 697 * Reset everything to support using non-enlightened VMCS access later 698 * (e.g. when we reload the module with enlightened_vmcs=0) 699 */ 700 vp_ap->nested_control.features.directhypercall = 0; 701 vp_ap->current_nested_vmcs = 0; 702 vp_ap->enlighten_vmentry = 0; 703 } 704 705 #else /* IS_ENABLED(CONFIG_HYPERV) */ 706 static void hv_init_evmcs(void) {} 707 static void hv_reset_evmcs(void) {} 708 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 709 710 /* 711 * Comment's format: document - errata name - stepping - processor name. 
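 * Each value below is a raw CPUID.01H:EAX signature (family/model/stepping),
 * e.g. 0x000206E6 decodes to family 6, model 0x2E, stepping 6 (Xeon 7500).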
712 * Refer from 713 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 714 */ 715 static u32 vmx_preemption_cpu_tfms[] = { 716 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 717 0x000206E6, 718 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 719 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 720 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 721 0x00020652, 722 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 723 0x00020655, 724 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 725 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 726 /* 727 * 320767.pdf - AAP86 - B1 - 728 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 729 */ 730 0x000106E5, 731 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 732 0x000106A0, 733 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 734 0x000106A1, 735 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 736 0x000106A4, 737 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 738 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 739 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 740 0x000106A5, 741 /* Xeon E3-1220 V2 */ 742 0x000306A8, 743 }; 744 745 static inline bool cpu_has_broken_vmx_preemption_timer(void) 746 { 747 u32 eax = cpuid_eax(0x00000001), i; 748 749 /* Clear the reserved bits */ 750 eax &= ~(0x3U << 14 | 0xfU << 28); 751 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 752 if (eax == vmx_preemption_cpu_tfms[i]) 753 return true; 754 755 return false; 756 } 757 758 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 759 { 760 return flexpriority_enabled && lapic_in_kernel(vcpu); 761 } 762 763 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 764 { 765 int i; 766 767 i = kvm_find_user_return_msr(msr); 768 if (i >= 0) 769 return &vmx->guest_uret_msrs[i]; 770 return NULL; 771 } 772 773 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 774 struct vmx_uret_msr *msr, u64 data) 775 { 776 unsigned int slot = msr - vmx->guest_uret_msrs; 777 int ret = 0; 778 779 if (msr->load_into_hardware) { 780 preempt_disable(); 781 ret = kvm_set_user_return_msr(slot, data, msr->mask); 782 preempt_enable(); 783 } 784 if (!ret) 785 msr->data = data; 786 return ret; 787 } 788 789 /* 790 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 791 * 792 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 793 * atomically track post-VMXON state, e.g. this may be called in NMI context. 794 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 795 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 796 * magically in RM, VM86, compat mode, or at CPL>0. 797 */ 798 static int kvm_cpu_vmxoff(void) 799 { 800 asm goto("1: vmxoff\n\t" 801 _ASM_EXTABLE(1b, %l[fault]) 802 ::: "cc", "memory" : fault); 803 804 cr4_clear_bits(X86_CR4_VMXE); 805 return 0; 806 807 fault: 808 cr4_clear_bits(X86_CR4_VMXE); 809 return -EIO; 810 } 811 812 void vmx_emergency_disable_virtualization_cpu(void) 813 { 814 int cpu = raw_smp_processor_id(); 815 struct loaded_vmcs *v; 816 817 kvm_rebooting = true; 818 819 /* 820 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 821 * set in task context. If this races with VMX is disabled by an NMI, 822 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 823 * kvm_rebooting set. 
824 */ 825 if (!(__read_cr4() & X86_CR4_VMXE)) 826 return; 827 828 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 829 loaded_vmcss_on_cpu_link) { 830 vmcs_clear(v->vmcs); 831 if (v->shadow_vmcs) 832 vmcs_clear(v->shadow_vmcs); 833 } 834 835 kvm_cpu_vmxoff(); 836 } 837 838 static void __loaded_vmcs_clear(void *arg) 839 { 840 struct loaded_vmcs *loaded_vmcs = arg; 841 int cpu = raw_smp_processor_id(); 842 843 if (loaded_vmcs->cpu != cpu) 844 return; /* vcpu migration can race with cpu offline */ 845 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 846 per_cpu(current_vmcs, cpu) = NULL; 847 848 vmcs_clear(loaded_vmcs->vmcs); 849 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 850 vmcs_clear(loaded_vmcs->shadow_vmcs); 851 852 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 853 854 /* 855 * Ensure all writes to loaded_vmcs, including deleting it from its 856 * current percpu list, complete before setting loaded_vmcs->cpu to 857 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 858 * and add loaded_vmcs to its percpu list before it's deleted from this 859 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 860 */ 861 smp_wmb(); 862 863 loaded_vmcs->cpu = -1; 864 loaded_vmcs->launched = 0; 865 } 866 867 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 868 { 869 int cpu = loaded_vmcs->cpu; 870 871 if (cpu != -1) 872 smp_call_function_single(cpu, 873 __loaded_vmcs_clear, loaded_vmcs, 1); 874 } 875 876 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 877 unsigned field) 878 { 879 bool ret; 880 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 881 882 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 883 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 884 vmx->segment_cache.bitmask = 0; 885 } 886 ret = vmx->segment_cache.bitmask & mask; 887 vmx->segment_cache.bitmask |= mask; 888 return ret; 889 } 890 891 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 892 { 893 u16 *p = &vmx->segment_cache.seg[seg].selector; 894 895 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 896 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 897 return *p; 898 } 899 900 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 901 { 902 ulong *p = &vmx->segment_cache.seg[seg].base; 903 904 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 905 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 906 return *p; 907 } 908 909 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 910 { 911 u32 *p = &vmx->segment_cache.seg[seg].limit; 912 913 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 914 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 915 return *p; 916 } 917 918 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 919 { 920 u32 *p = &vmx->segment_cache.seg[seg].ar; 921 922 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 923 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 924 return *p; 925 } 926 927 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 928 { 929 u32 eb; 930 931 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 932 (1u << DB_VECTOR) | (1u << AC_VECTOR); 933 /* 934 * #VE isn't used for VMX. To test against unexpected changes 935 * related to #VE for VMX, intercept unexpected #VE and warn on it. 
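	 * The intercept is gated by CONFIG_KVM_INTEL_PROVE_VE, so production
	 * builds are unaffected.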
936 */ 937 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 938 eb |= 1u << VE_VECTOR; 939 /* 940 * Guest access to VMware backdoor ports could legitimately 941 * trigger #GP because of TSS I/O permission bitmap. 942 * We intercept those #GP and allow access to them anyway 943 * as VMware does. 944 */ 945 if (enable_vmware_backdoor) 946 eb |= (1u << GP_VECTOR); 947 if ((vcpu->guest_debug & 948 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 949 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 950 eb |= 1u << BP_VECTOR; 951 if (to_vmx(vcpu)->rmode.vm86_active) 952 eb = ~0; 953 if (!vmx_need_pf_intercept(vcpu)) 954 eb &= ~(1u << PF_VECTOR); 955 956 /* When we are running a nested L2 guest and L1 specified for it a 957 * certain exception bitmap, we must trap the same exceptions and pass 958 * them to L1. When running L2, we will only handle the exceptions 959 * specified above if L1 did not want them. 960 */ 961 if (is_guest_mode(vcpu)) 962 eb |= get_vmcs12(vcpu)->exception_bitmap; 963 else { 964 int mask = 0, match = 0; 965 966 if (enable_ept && (eb & (1u << PF_VECTOR))) { 967 /* 968 * If EPT is enabled, #PF is currently only intercepted 969 * if MAXPHYADDR is smaller on the guest than on the 970 * host. In that case we only care about present, 971 * non-reserved faults. For vmcs02, however, PFEC_MASK 972 * and PFEC_MATCH are set in prepare_vmcs02_rare. 973 */ 974 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 975 match = PFERR_PRESENT_MASK; 976 } 977 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 978 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 979 } 980 981 /* 982 * Disabling xfd interception indicates that dynamic xfeatures 983 * might be used in the guest. Always trap #NM in this case 984 * to save guest xfd_err timely. 985 */ 986 if (vcpu->arch.xfd_no_write_intercept) 987 eb |= (1u << NM_VECTOR); 988 989 vmcs_write32(EXCEPTION_BITMAP, eb); 990 } 991 992 /* 993 * Check if MSR is intercepted for currently loaded MSR bitmap. 994 */ 995 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 996 { 997 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 998 return true; 999 1000 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 1001 } 1002 1003 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 1004 { 1005 unsigned int flags = 0; 1006 1007 if (vmx->loaded_vmcs->launched) 1008 flags |= VMX_RUN_VMRESUME; 1009 1010 /* 1011 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 1012 * to change it directly without causing a vmexit. In that case read 1013 * it after vmexit and store it in vmx->spec_ctrl. 
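	 * Setting VMX_RUN_SAVE_SPEC_CTRL below is what instructs the low-level
	 * VM-Enter/VM-Exit code to perform that post-exit read.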
1014 */ 1015 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 1016 flags |= VMX_RUN_SAVE_SPEC_CTRL; 1017 1018 if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 1019 kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) 1020 flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; 1021 1022 return flags; 1023 } 1024 1025 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1026 unsigned long entry, unsigned long exit) 1027 { 1028 vm_entry_controls_clearbit(vmx, entry); 1029 vm_exit_controls_clearbit(vmx, exit); 1030 } 1031 1032 static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 1033 { 1034 unsigned int i; 1035 1036 for (i = 0; i < m->nr; ++i) { 1037 if (m->val[i].index == msr) 1038 return i; 1039 } 1040 return -ENOENT; 1041 } 1042 1043 static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, 1044 unsigned long vmcs_count_field) 1045 { 1046 int i; 1047 1048 i = vmx_find_loadstore_msr_slot(m, msr); 1049 if (i < 0) 1050 return; 1051 1052 --m->nr; 1053 m->val[i] = m->val[m->nr]; 1054 vmcs_write32(vmcs_count_field, m->nr); 1055 } 1056 1057 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1058 { 1059 struct msr_autoload *m = &vmx->msr_autoload; 1060 1061 switch (msr) { 1062 case MSR_EFER: 1063 if (cpu_has_load_ia32_efer()) { 1064 clear_atomic_switch_msr_special(vmx, 1065 VM_ENTRY_LOAD_IA32_EFER, 1066 VM_EXIT_LOAD_IA32_EFER); 1067 return; 1068 } 1069 break; 1070 case MSR_CORE_PERF_GLOBAL_CTRL: 1071 if (cpu_has_load_perf_global_ctrl()) { 1072 clear_atomic_switch_msr_special(vmx, 1073 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1074 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1075 return; 1076 } 1077 break; 1078 } 1079 1080 vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT); 1081 vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT); 1082 } 1083 1084 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1085 unsigned long entry, unsigned long exit, 1086 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1087 u64 guest_val, u64 host_val) 1088 { 1089 vmcs_write64(guest_val_vmcs, guest_val); 1090 if (host_val_vmcs != HOST_IA32_EFER) 1091 vmcs_write64(host_val_vmcs, host_val); 1092 vm_entry_controls_setbit(vmx, entry); 1093 vm_exit_controls_setbit(vmx, exit); 1094 } 1095 1096 static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value, 1097 unsigned long vmcs_count_field, struct kvm *kvm) 1098 { 1099 int i; 1100 1101 i = vmx_find_loadstore_msr_slot(m, msr); 1102 if (i < 0) { 1103 if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm)) 1104 return; 1105 1106 i = m->nr++; 1107 m->val[i].index = msr; 1108 vmcs_write32(vmcs_count_field, m->nr); 1109 } 1110 m->val[i].value = value; 1111 } 1112 1113 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1114 u64 guest_val, u64 host_val) 1115 { 1116 struct msr_autoload *m = &vmx->msr_autoload; 1117 struct kvm *kvm = vmx->vcpu.kvm; 1118 1119 switch (msr) { 1120 case MSR_EFER: 1121 if (cpu_has_load_ia32_efer()) { 1122 add_atomic_switch_msr_special(vmx, 1123 VM_ENTRY_LOAD_IA32_EFER, 1124 VM_EXIT_LOAD_IA32_EFER, 1125 GUEST_IA32_EFER, 1126 HOST_IA32_EFER, 1127 guest_val, host_val); 1128 return; 1129 } 1130 break; 1131 case MSR_CORE_PERF_GLOBAL_CTRL: 1132 if (cpu_has_load_perf_global_ctrl()) { 1133 add_atomic_switch_msr_special(vmx, 1134 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1135 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1136 GUEST_IA32_PERF_GLOBAL_CTRL, 1137 HOST_IA32_PERF_GLOBAL_CTRL, 1138 guest_val, host_val); 1139 return; 1140 } 1141 break; 1142 case 
MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
	}

	vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm);
	vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm);
}

static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available. */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != kvm_host.efer)
			add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	guest_efer &= ~ignore_bits;
	guest_efer |= kvm_host.efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
{
	vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT,
			 vmx->vcpu.kvm);
}

static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
{
	vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT);
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address.
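	 * I.e. the low 7 bits of the base must be clear and the address must
	 * be a legal GPA for this vCPU, which is exactly what the helper
	 * below verifies.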
*/ 1256 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1257 } 1258 1259 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1260 { 1261 u32 i; 1262 1263 wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); 1264 wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1265 wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1266 wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1267 for (i = 0; i < addr_range; i++) { 1268 wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1269 wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1270 } 1271 } 1272 1273 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1274 { 1275 u32 i; 1276 1277 rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); 1278 rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1279 rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1280 rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1281 for (i = 0; i < addr_range; i++) { 1282 rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1283 rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1284 } 1285 } 1286 1287 static void pt_guest_enter(struct vcpu_vmx *vmx) 1288 { 1289 if (vmx_pt_mode_is_system()) 1290 return; 1291 1292 /* 1293 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1294 * Save host state before VM entry. 1295 */ 1296 rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1297 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1298 wrmsrq(MSR_IA32_RTIT_CTL, 0); 1299 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1300 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1301 } 1302 } 1303 1304 static void pt_guest_exit(struct vcpu_vmx *vmx) 1305 { 1306 if (vmx_pt_mode_is_system()) 1307 return; 1308 1309 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1310 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1311 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1312 } 1313 1314 /* 1315 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1316 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 1317 */ 1318 if (vmx->pt_desc.host.ctl) 1319 wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1320 } 1321 1322 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1323 unsigned long fs_base, unsigned long gs_base) 1324 { 1325 if (unlikely(fs_sel != host->fs_sel)) { 1326 if (!(fs_sel & 7)) 1327 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1328 else 1329 vmcs_write16(HOST_FS_SELECTOR, 0); 1330 host->fs_sel = fs_sel; 1331 } 1332 if (unlikely(gs_sel != host->gs_sel)) { 1333 if (!(gs_sel & 7)) 1334 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1335 else 1336 vmcs_write16(HOST_GS_SELECTOR, 0); 1337 host->gs_sel = gs_sel; 1338 } 1339 if (unlikely(fs_base != host->fs_base)) { 1340 vmcs_writel(HOST_FS_BASE, fs_base); 1341 host->fs_base = fs_base; 1342 } 1343 if (unlikely(gs_base != host->gs_base)) { 1344 vmcs_writel(HOST_GS_BASE, gs_base); 1345 host->gs_base = gs_base; 1346 } 1347 } 1348 1349 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1350 { 1351 struct vcpu_vmx *vmx = to_vmx(vcpu); 1352 struct vcpu_vt *vt = to_vt(vcpu); 1353 struct vmcs_host_state *host_state; 1354 #ifdef CONFIG_X86_64 1355 int cpu = raw_smp_processor_id(); 1356 #endif 1357 unsigned long fs_base, gs_base; 1358 u16 fs_sel, gs_sel; 1359 int i; 1360 1361 /* 1362 * Note that guest MSRs to be saved/restored can also be changed 1363 * when guest state is loaded. This happens when guest transitions 1364 * to/from long-mode by setting MSR_EFER.LMA. 
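	 * vmx_setup_uret_msrs() recomputes the load_into_hardware flags when
	 * that happens; the loop below (re)writes the flagged MSRs into
	 * hardware.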
1365 */ 1366 if (!vmx->guest_uret_msrs_loaded) { 1367 vmx->guest_uret_msrs_loaded = true; 1368 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1369 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1370 continue; 1371 1372 kvm_set_user_return_msr(i, 1373 vmx->guest_uret_msrs[i].data, 1374 vmx->guest_uret_msrs[i].mask); 1375 } 1376 } 1377 1378 if (vmx->nested.need_vmcs12_to_shadow_sync) 1379 nested_sync_vmcs12_to_shadow(vcpu); 1380 1381 if (vt->guest_state_loaded) 1382 return; 1383 1384 host_state = &vmx->loaded_vmcs->host_state; 1385 1386 /* 1387 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1388 * allow segment selectors with cpl > 0 or ti == 1. 1389 */ 1390 host_state->ldt_sel = kvm_read_ldt(); 1391 1392 #ifdef CONFIG_X86_64 1393 savesegment(ds, host_state->ds_sel); 1394 savesegment(es, host_state->es_sel); 1395 1396 gs_base = cpu_kernelmode_gs_base(cpu); 1397 if (likely(is_64bit_mm(current->mm))) { 1398 current_save_fsgs(); 1399 fs_sel = current->thread.fsindex; 1400 gs_sel = current->thread.gsindex; 1401 fs_base = current->thread.fsbase; 1402 vt->msr_host_kernel_gs_base = current->thread.gsbase; 1403 } else { 1404 savesegment(fs, fs_sel); 1405 savesegment(gs, gs_sel); 1406 fs_base = read_msr(MSR_FS_BASE); 1407 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1408 } 1409 1410 wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1411 #else 1412 savesegment(fs, fs_sel); 1413 savesegment(gs, gs_sel); 1414 fs_base = segment_base(fs_sel); 1415 gs_base = segment_base(gs_sel); 1416 #endif 1417 1418 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1419 vt->guest_state_loaded = true; 1420 } 1421 1422 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1423 { 1424 struct vmcs_host_state *host_state; 1425 1426 if (!vmx->vt.guest_state_loaded) 1427 return; 1428 1429 host_state = &vmx->loaded_vmcs->host_state; 1430 1431 ++vmx->vcpu.stat.host_state_reload; 1432 1433 #ifdef CONFIG_X86_64 1434 rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1435 #endif 1436 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1437 kvm_load_ldt(host_state->ldt_sel); 1438 #ifdef CONFIG_X86_64 1439 load_gs_index(host_state->gs_sel); 1440 #else 1441 loadsegment(gs, host_state->gs_sel); 1442 #endif 1443 } 1444 if (host_state->fs_sel & 7) 1445 loadsegment(fs, host_state->fs_sel); 1446 #ifdef CONFIG_X86_64 1447 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1448 loadsegment(ds, host_state->ds_sel); 1449 loadsegment(es, host_state->es_sel); 1450 } 1451 #endif 1452 invalidate_tss_limit(); 1453 #ifdef CONFIG_X86_64 1454 wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); 1455 #endif 1456 load_fixmap_gdt(raw_smp_processor_id()); 1457 vmx->vt.guest_state_loaded = false; 1458 vmx->guest_uret_msrs_loaded = false; 1459 } 1460 1461 #ifdef CONFIG_X86_64 1462 static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache) 1463 { 1464 preempt_disable(); 1465 if (vmx->vt.guest_state_loaded) 1466 *cache = read_msr(msr); 1467 preempt_enable(); 1468 return *cache; 1469 } 1470 1471 static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data, 1472 u64 *cache) 1473 { 1474 preempt_disable(); 1475 if (vmx->vt.guest_state_loaded) 1476 wrmsrns(msr, data); 1477 preempt_enable(); 1478 *cache = data; 1479 } 1480 1481 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1482 { 1483 return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, 1484 &vmx->msr_guest_kernel_gs_base); 1485 } 1486 1487 static void 
vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
				 &vmx->msr_guest_kernel_gs_base);
}
#endif

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old, ple_window,
					    ple_window_grow,
					    ple_window_max);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old, ple_window,
					      ple_window_shrink,
					      ple_window);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switches to the specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
1585 */ 1586 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1587 { 1588 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1589 shrink_ple_window(vcpu); 1590 1591 vmx_vcpu_load_vmcs(vcpu, cpu); 1592 1593 vmx_vcpu_pi_load(vcpu, cpu); 1594 } 1595 1596 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1597 { 1598 vmx_vcpu_pi_put(vcpu); 1599 1600 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1601 } 1602 1603 static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu, 1604 struct loaded_vmcs *vmcs) 1605 { 1606 struct vcpu_vmx *vmx = to_vmx(vcpu); 1607 int cpu; 1608 1609 cpu = get_cpu(); 1610 vmx->loaded_vmcs = vmcs; 1611 vmx_vcpu_load_vmcs(vcpu, cpu); 1612 put_cpu(); 1613 } 1614 1615 static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) 1616 { 1617 struct vcpu_vmx *vmx = to_vmx(vcpu); 1618 1619 if (!is_guest_mode(vcpu)) { 1620 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 1621 return; 1622 } 1623 1624 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02); 1625 vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01); 1626 } 1627 1628 static void vmx_put_vmcs01(struct kvm_vcpu *vcpu) 1629 { 1630 if (!is_guest_mode(vcpu)) 1631 return; 1632 1633 vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02); 1634 } 1635 DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *, 1636 vmx_load_vmcs01(_T), vmx_put_vmcs01(_T)) 1637 1638 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1639 { 1640 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1641 } 1642 1643 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1644 { 1645 struct vcpu_vmx *vmx = to_vmx(vcpu); 1646 unsigned long rflags, save_rflags; 1647 1648 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1649 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1650 rflags = vmcs_readl(GUEST_RFLAGS); 1651 if (vmx->rmode.vm86_active) { 1652 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1653 save_rflags = vmx->rmode.save_rflags; 1654 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1655 } 1656 vmx->rflags = rflags; 1657 } 1658 return vmx->rflags; 1659 } 1660 1661 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1662 { 1663 struct vcpu_vmx *vmx = to_vmx(vcpu); 1664 unsigned long old_rflags; 1665 1666 /* 1667 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1668 * is an unrestricted guest in order to mark L2 as needing emulation 1669 * if L1 runs L2 as a restricted guest. 
1670 */ 1671 if (is_unrestricted_guest(vcpu)) { 1672 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1673 vmx->rflags = rflags; 1674 vmcs_writel(GUEST_RFLAGS, rflags); 1675 return; 1676 } 1677 1678 old_rflags = vmx_get_rflags(vcpu); 1679 vmx->rflags = rflags; 1680 if (vmx->rmode.vm86_active) { 1681 vmx->rmode.save_rflags = rflags; 1682 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1683 } 1684 vmcs_writel(GUEST_RFLAGS, rflags); 1685 1686 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1687 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 1688 } 1689 1690 bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1691 { 1692 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1693 } 1694 1695 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1696 { 1697 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1698 int ret = 0; 1699 1700 if (interruptibility & GUEST_INTR_STATE_STI) 1701 ret |= KVM_X86_SHADOW_INT_STI; 1702 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1703 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1704 1705 return ret; 1706 } 1707 1708 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1709 { 1710 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1711 u32 interruptibility = interruptibility_old; 1712 1713 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1714 1715 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1716 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1717 else if (mask & KVM_X86_SHADOW_INT_STI) 1718 interruptibility |= GUEST_INTR_STATE_STI; 1719 1720 if ((interruptibility != interruptibility_old)) 1721 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1722 } 1723 1724 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1725 { 1726 struct vcpu_vmx *vmx = to_vmx(vcpu); 1727 unsigned long value; 1728 1729 /* 1730 * Any MSR write that attempts to change bits marked reserved will 1731 * case a #GP fault. 1732 */ 1733 if (data & vmx->pt_desc.ctl_bitmask) 1734 return 1; 1735 1736 /* 1737 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1738 * result in a #GP unless the same write also clears TraceEn. 1739 */ 1740 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1741 (data & RTIT_CTL_TRACEEN) && 1742 data != vmx->pt_desc.guest.ctl) 1743 return 1; 1744 1745 /* 1746 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1747 * and FabricEn would cause #GP, if 1748 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1749 */ 1750 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1751 !(data & RTIT_CTL_FABRIC_EN) && 1752 !intel_pt_validate_cap(vmx->pt_desc.caps, 1753 PT_CAP_single_range_output)) 1754 return 1; 1755 1756 /* 1757 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1758 * utilize encodings marked reserved will cause a #GP fault. 
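	 * The legal encodings are derived from the PT capability leaves cached
	 * in vmx->pt_desc.caps and queried via intel_pt_validate_cap() below.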
1759 */ 1760 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1761 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1762 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1763 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1764 return 1; 1765 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1766 PT_CAP_cycle_thresholds); 1767 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1768 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1769 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1770 return 1; 1771 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1772 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1773 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1774 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1775 return 1; 1776 1777 /* 1778 * If ADDRx_CFG is reserved or the encodings is >2 will 1779 * cause a #GP fault. 1780 */ 1781 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1782 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1783 return 1; 1784 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1785 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1786 return 1; 1787 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1788 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1789 return 1; 1790 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1791 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1792 return 1; 1793 1794 return 0; 1795 } 1796 1797 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1798 void *insn, int insn_len) 1799 { 1800 /* 1801 * Emulation of instructions in SGX enclaves is impossible as RIP does 1802 * not point at the failing instruction, and even if it did, the code 1803 * stream is inaccessible. Inject #UD instead of exiting to userspace 1804 * so that guest userspace can't DoS the guest simply by triggering 1805 * emulation (enclaves are CPL3 only). 1806 */ 1807 if (vmx_get_exit_reason(vcpu).enclave_mode) { 1808 kvm_queue_exception(vcpu, UD_VECTOR); 1809 return X86EMUL_PROPAGATE_FAULT; 1810 } 1811 1812 /* Check that emulation is possible during event vectoring */ 1813 if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 1814 !kvm_can_emulate_event_vectoring(emul_type)) 1815 return X86EMUL_UNHANDLEABLE_VECTORING; 1816 1817 return X86EMUL_CONTINUE; 1818 } 1819 1820 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1821 { 1822 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 1823 unsigned long rip, orig_rip; 1824 u32 instr_len; 1825 1826 /* 1827 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1828 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1829 * set when EPT misconfig occurs. In practice, real hardware updates 1830 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1831 * (namely Hyper-V) don't set it due to it being undefined behavior, 1832 * i.e. we end up advancing IP with some random value. 1833 */ 1834 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1835 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1836 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1837 1838 /* 1839 * Emulating an enclave's instructions isn't supported as KVM 1840 * cannot access the enclave's memory or its true RIP, e.g. the 1841 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1842 * the RIP that actually triggered the VM-Exit. 
But, because 1843 * most instructions that cause VM-Exit will #UD in an enclave, 1844 * most instruction-based VM-Exits simply do not occur. 1845 * 1846 * There are a few exceptions, notably the debug instructions 1847 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1848 * and generate #DB/#BP as expected, which KVM might intercept. 1849 * But again, the CPU does the dirty work and saves an instr 1850 * length of zero so VMMs don't shoot themselves in the foot. 1851 * WARN if KVM tries to skip a non-zero length instruction on 1852 * a VM-Exit from an enclave. 1853 */ 1854 if (!instr_len) 1855 goto rip_updated; 1856 1857 WARN_ONCE(exit_reason.enclave_mode, 1858 "skipping instruction after SGX enclave VM-Exit"); 1859 1860 orig_rip = kvm_rip_read(vcpu); 1861 rip = orig_rip + instr_len; 1862 #ifdef CONFIG_X86_64 1863 /* 1864 * We need to mask out the high 32 bits of RIP if not in 64-bit 1865 * mode, but just finding out that we are in 64-bit mode is 1866 * quite expensive. Only do it if there was a carry. 1867 */ 1868 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1869 rip = (u32)rip; 1870 #endif 1871 kvm_rip_write(vcpu, rip); 1872 } else { 1873 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1874 return 0; 1875 } 1876 1877 rip_updated: 1878 /* skipping an emulated instruction also counts */ 1879 vmx_set_interrupt_shadow(vcpu, 0); 1880 1881 return 1; 1882 } 1883 1884 /* 1885 * Recognizes a pending MTF VM-exit and records the nested state for later 1886 * delivery. 1887 */ 1888 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1889 { 1890 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1891 struct vcpu_vmx *vmx = to_vmx(vcpu); 1892 1893 if (!is_guest_mode(vcpu)) 1894 return; 1895 1896 /* 1897 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1898 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1899 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1900 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1901 * as ICEBP is higher priority than both. As instruction emulation is 1902 * completed at this point (i.e. KVM is at the instruction boundary), 1903 * any #DB exception pending delivery must be a debug-trap of lower 1904 * priority than MTF. Record the pending MTF state to be delivered in 1905 * vmx_check_nested_events(). 1906 */ 1907 if (nested_cpu_has_mtf(vmcs12) && 1908 (!vcpu->arch.exception.pending || 1909 vcpu->arch.exception.vector == DB_VECTOR) && 1910 (!vcpu->arch.exception_vmexit.pending || 1911 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1912 vmx->nested.mtf_pending = true; 1913 kvm_make_request(KVM_REQ_EVENT, vcpu); 1914 } else { 1915 vmx->nested.mtf_pending = false; 1916 } 1917 } 1918 1919 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1920 { 1921 vmx_update_emulated_instruction(vcpu); 1922 return skip_emulated_instruction(vcpu); 1923 } 1924 1925 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1926 { 1927 /* 1928 * Ensure that we clear the HLT state in the VMCS. We don't need to 1929 * explicitly skip the instruction because if the HLT state is set, 1930 * then the instruction is already executing and RIP has already been 1931 * advanced. 
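 * (Clearing HLT also keeps event injection legal: VM-Entry only allows
 * a limited set of event types to be injected while the guest activity
 * state is HLT, so leaving it set could cause a spurious VM-Fail.)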
1932 */ 1933 if (kvm_hlt_in_guest(vcpu->kvm) && 1934 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1935 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1936 } 1937 1938 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1939 { 1940 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1941 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1942 struct vcpu_vmx *vmx = to_vmx(vcpu); 1943 1944 kvm_deliver_exception_payload(vcpu, ex); 1945 1946 if (ex->has_error_code) { 1947 /* 1948 * Despite the error code being architecturally defined as 32 1949 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1950 * VMX don't actually support setting bits 31:16. Hardware 1951 * will (should) never provide a bogus error code, but AMD CPUs 1952 * do generate error codes with bits 31:16 set, and so KVM's 1953 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1954 * the upper bits to avoid VM-Fail; losing information that 1955 * doesn't really exist is preferable to killing the VM. 1956 */ 1957 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1958 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1959 } 1960 1961 if (vmx->rmode.vm86_active) { 1962 int inc_eip = 0; 1963 if (kvm_exception_is_soft(ex->vector)) 1964 inc_eip = vcpu->arch.event_exit_inst_len; 1965 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1966 return; 1967 } 1968 1969 WARN_ON_ONCE(vmx->vt.emulation_required); 1970 1971 if (kvm_exception_is_soft(ex->vector)) { 1972 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1973 vmx->vcpu.arch.event_exit_inst_len); 1974 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1975 } else 1976 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1977 1978 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1979 1980 vmx_clear_hlt(vcpu); 1981 } 1982 1983 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1984 bool load_into_hardware) 1985 { 1986 struct vmx_uret_msr *uret_msr; 1987 1988 uret_msr = vmx_find_uret_msr(vmx, msr); 1989 if (!uret_msr) 1990 return; 1991 1992 uret_msr->load_into_hardware = load_into_hardware; 1993 } 1994 1995 /* 1996 * Configure the user return MSRs to automatically save, load, and restore MSRs 1997 * that need to be shoved into hardware when running the guest. Note, omitting 1998 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1999 * loaded into hardware when running the guest. 2000 */ 2001 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 2002 { 2003 #ifdef CONFIG_X86_64 2004 bool load_syscall_msrs; 2005 2006 /* 2007 * The SYSCALL MSRs are only needed for long mode guests, and only 2008 * when EFER.SCE is set. 2009 */ 2010 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 2011 (vmx->vcpu.arch.efer & EFER_SCE); 2012 2013 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 2014 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 2015 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 2016 #endif 2017 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 2018 2019 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 2020 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 2021 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); 2022 2023 /* 2024 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 2025 * kernel and old userspace. If those guests run on a tsx=off host, do 2026 * allow guests to use TSX_CTRL, but don't change the value in hardware 2027 * so that TSX always remains disabled.
2028 */ 2029 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 2030 2031 /* 2032 * The set of MSRs to load may have changed, reload MSRs before the 2033 * next VM-Enter. 2034 */ 2035 vmx->guest_uret_msrs_loaded = false; 2036 } 2037 2038 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 2039 { 2040 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2041 2042 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 2043 return vmcs12->tsc_offset; 2044 2045 return 0; 2046 } 2047 2048 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 2049 { 2050 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2051 2052 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 2053 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 2054 return vmcs12->tsc_multiplier; 2055 2056 return kvm_caps.default_tsc_scaling_ratio; 2057 } 2058 2059 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 2060 { 2061 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2062 } 2063 2064 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 2065 { 2066 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2067 } 2068 2069 /* 2070 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 2071 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 2072 * backwards compatibility even though KVM doesn't support emulating SMX. And 2073 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 2074 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 2075 */ 2076 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 2077 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 2078 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 2079 FEAT_CTL_SGX_LC_ENABLED | \ 2080 FEAT_CTL_SGX_ENABLED | \ 2081 FEAT_CTL_LMCE_ENABLED) 2082 2083 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 2084 struct msr_data *msr) 2085 { 2086 uint64_t valid_bits; 2087 2088 /* 2089 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 2090 * exposed to the guest. 2091 */ 2092 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 2093 ~KVM_SUPPORTED_FEATURE_CONTROL); 2094 2095 if (!msr->host_initiated && 2096 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 2097 return false; 2098 2099 if (msr->host_initiated) 2100 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 2101 else 2102 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 2103 2104 return !(msr->data & ~valid_bits); 2105 } 2106 2107 int vmx_get_feature_msr(u32 msr, u64 *data) 2108 { 2109 switch (msr) { 2110 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2111 if (!nested) 2112 return 1; 2113 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2114 default: 2115 return KVM_MSR_RET_UNSUPPORTED; 2116 } 2117 } 2118 2119 /* 2120 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 2121 * Returns 0 on success, non-0 otherwise. 2122 * Assumes vcpu_load() was already called. 
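 * MSRs that have no explicit case below are either served from the user
 * return MSR list (via the find_uret_msr label) or passed on to
 * kvm_get_msr_common().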
2123 */ 2124 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2125 { 2126 struct vcpu_vmx *vmx = to_vmx(vcpu); 2127 struct vmx_uret_msr *msr; 2128 u32 index; 2129 2130 switch (msr_info->index) { 2131 #ifdef CONFIG_X86_64 2132 case MSR_FS_BASE: 2133 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2134 break; 2135 case MSR_GS_BASE: 2136 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2137 break; 2138 case MSR_KERNEL_GS_BASE: 2139 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2140 break; 2141 #endif 2142 case MSR_EFER: 2143 return kvm_get_msr_common(vcpu, msr_info); 2144 case MSR_IA32_TSX_CTRL: 2145 if (!msr_info->host_initiated && 2146 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2147 return 1; 2148 goto find_uret_msr; 2149 case MSR_IA32_UMWAIT_CONTROL: 2150 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2151 return 1; 2152 2153 msr_info->data = vmx->msr_ia32_umwait_control; 2154 break; 2155 case MSR_IA32_SPEC_CTRL: 2156 if (!msr_info->host_initiated && 2157 !guest_has_spec_ctrl_msr(vcpu)) 2158 return 1; 2159 2160 msr_info->data = to_vmx(vcpu)->spec_ctrl; 2161 break; 2162 case MSR_IA32_SYSENTER_CS: 2163 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2164 break; 2165 case MSR_IA32_SYSENTER_EIP: 2166 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2167 break; 2168 case MSR_IA32_SYSENTER_ESP: 2169 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2170 break; 2171 case MSR_IA32_BNDCFGS: 2172 if (!kvm_mpx_supported() || 2173 (!msr_info->host_initiated && 2174 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2175 return 1; 2176 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2177 break; 2178 case MSR_IA32_MCG_EXT_CTL: 2179 if (!msr_info->host_initiated && 2180 !(vmx->msr_ia32_feature_control & 2181 FEAT_CTL_LMCE_ENABLED)) 2182 return 1; 2183 msr_info->data = vcpu->arch.mcg_ext_ctl; 2184 break; 2185 case MSR_IA32_FEAT_CTL: 2186 msr_info->data = vmx->msr_ia32_feature_control; 2187 break; 2188 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2189 if (!msr_info->host_initiated && 2190 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 2191 return 1; 2192 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2193 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2194 break; 2195 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2196 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2197 return 1; 2198 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2199 &msr_info->data)) 2200 return 1; 2201 #ifdef CONFIG_KVM_HYPERV 2202 /* 2203 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2204 * instead of just ignoring the features, different Hyper-V 2205 * versions are either trying to use them and fail or do some 2206 * sanity checking and refuse to boot. Filter all unsupported 2207 * features out. 
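 * (nested_evmcs_filter_control_msr() masks off the control bits that have
 * no corresponding enlightened VMCS v1 field.)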
2208 */ 2209 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) 2210 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2211 &msr_info->data); 2212 #endif 2213 break; 2214 case MSR_IA32_RTIT_CTL: 2215 if (!vmx_pt_mode_is_host_guest()) 2216 return 1; 2217 msr_info->data = vmx->pt_desc.guest.ctl; 2218 break; 2219 case MSR_IA32_RTIT_STATUS: 2220 if (!vmx_pt_mode_is_host_guest()) 2221 return 1; 2222 msr_info->data = vmx->pt_desc.guest.status; 2223 break; 2224 case MSR_IA32_RTIT_CR3_MATCH: 2225 if (!vmx_pt_mode_is_host_guest() || 2226 !intel_pt_validate_cap(vmx->pt_desc.caps, 2227 PT_CAP_cr3_filtering)) 2228 return 1; 2229 msr_info->data = vmx->pt_desc.guest.cr3_match; 2230 break; 2231 case MSR_IA32_RTIT_OUTPUT_BASE: 2232 if (!vmx_pt_mode_is_host_guest() || 2233 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2234 PT_CAP_topa_output) && 2235 !intel_pt_validate_cap(vmx->pt_desc.caps, 2236 PT_CAP_single_range_output))) 2237 return 1; 2238 msr_info->data = vmx->pt_desc.guest.output_base; 2239 break; 2240 case MSR_IA32_RTIT_OUTPUT_MASK: 2241 if (!vmx_pt_mode_is_host_guest() || 2242 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2243 PT_CAP_topa_output) && 2244 !intel_pt_validate_cap(vmx->pt_desc.caps, 2245 PT_CAP_single_range_output))) 2246 return 1; 2247 msr_info->data = vmx->pt_desc.guest.output_mask; 2248 break; 2249 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2250 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2251 if (!vmx_pt_mode_is_host_guest() || 2252 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2253 return 1; 2254 if (index % 2) 2255 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2256 else 2257 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2258 break; 2259 case MSR_IA32_S_CET: 2260 msr_info->data = vmcs_readl(GUEST_S_CET); 2261 break; 2262 case MSR_KVM_INTERNAL_GUEST_SSP: 2263 msr_info->data = vmcs_readl(GUEST_SSP); 2264 break; 2265 case MSR_IA32_INT_SSP_TAB: 2266 msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); 2267 break; 2268 case MSR_IA32_DEBUGCTLMSR: 2269 msr_info->data = vmx_guest_debugctl_read(); 2270 break; 2271 default: 2272 find_uret_msr: 2273 msr = vmx_find_uret_msr(vmx, msr_info->index); 2274 if (msr) { 2275 msr_info->data = msr->data; 2276 break; 2277 } 2278 return kvm_get_msr_common(vcpu, msr_info); 2279 } 2280 2281 return 0; 2282 } 2283 2284 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2285 u64 data) 2286 { 2287 #ifdef CONFIG_X86_64 2288 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2289 return (u32)data; 2290 #endif 2291 return (unsigned long)data; 2292 } 2293 2294 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2295 { 2296 u64 debugctl = 0; 2297 2298 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2299 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2300 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2301 2302 if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && 2303 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2304 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2305 2306 if (boot_cpu_has(X86_FEATURE_RTM) && 2307 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) 2308 debugctl |= DEBUGCTLMSR_RTM_DEBUG; 2309 2310 return debugctl; 2311 } 2312 2313 bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) 2314 { 2315 u64 invalid; 2316 2317 invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); 2318 if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { 2319 kvm_pr_unimpl_wrmsr(vcpu, 
MSR_IA32_DEBUGCTLMSR, data); 2320 invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); 2321 } 2322 return !invalid; 2323 } 2324 2325 /* 2326 * Writes msr value into the appropriate "register". 2327 * Returns 0 on success, non-0 otherwise. 2328 * Assumes vcpu_load() was already called. 2329 */ 2330 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2331 { 2332 struct vcpu_vmx *vmx = to_vmx(vcpu); 2333 struct vmx_uret_msr *msr; 2334 int ret = 0; 2335 u32 msr_index = msr_info->index; 2336 u64 data = msr_info->data; 2337 u32 index; 2338 2339 switch (msr_index) { 2340 case MSR_EFER: 2341 ret = kvm_set_msr_common(vcpu, msr_info); 2342 break; 2343 #ifdef CONFIG_X86_64 2344 case MSR_FS_BASE: 2345 vmx_segment_cache_clear(vmx); 2346 vmcs_writel(GUEST_FS_BASE, data); 2347 break; 2348 case MSR_GS_BASE: 2349 vmx_segment_cache_clear(vmx); 2350 vmcs_writel(GUEST_GS_BASE, data); 2351 break; 2352 case MSR_KERNEL_GS_BASE: 2353 vmx_write_guest_kernel_gs_base(vmx, data); 2354 break; 2355 case MSR_IA32_XFD: 2356 ret = kvm_set_msr_common(vcpu, msr_info); 2357 /* 2358 * Always intercepting WRMSR could incur non-negligible 2359 * overhead given xfd might be changed frequently in 2360 * guest context switch. Disable write interception 2361 * upon the first write with a non-zero value (indicating 2362 * potential usage on dynamic xfeatures). Also update 2363 * exception bitmap to trap #NM for proper virtualization 2364 * of guest xfd_err. 2365 */ 2366 if (!ret && data) { 2367 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2368 MSR_TYPE_RW); 2369 vcpu->arch.xfd_no_write_intercept = true; 2370 vmx_update_exception_bitmap(vcpu); 2371 } 2372 break; 2373 #endif 2374 case MSR_IA32_SYSENTER_CS: 2375 if (is_guest_mode(vcpu)) 2376 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2377 vmcs_write32(GUEST_SYSENTER_CS, data); 2378 break; 2379 case MSR_IA32_SYSENTER_EIP: 2380 if (is_guest_mode(vcpu)) { 2381 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2382 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2383 } 2384 vmcs_writel(GUEST_SYSENTER_EIP, data); 2385 break; 2386 case MSR_IA32_SYSENTER_ESP: 2387 if (is_guest_mode(vcpu)) { 2388 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2389 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2390 } 2391 vmcs_writel(GUEST_SYSENTER_ESP, data); 2392 break; 2393 case MSR_IA32_DEBUGCTLMSR: 2394 if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) 2395 return 1; 2396 2397 data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2398 2399 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2400 VM_EXIT_SAVE_DEBUG_CONTROLS) 2401 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2402 2403 vmx_guest_debugctl_write(vcpu, data); 2404 2405 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2406 (data & DEBUGCTLMSR_LBR)) 2407 intel_pmu_create_guest_lbr_event(vcpu); 2408 return 0; 2409 case MSR_IA32_BNDCFGS: 2410 if (!kvm_mpx_supported() || 2411 (!msr_info->host_initiated && 2412 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2413 return 1; 2414 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2415 (data & MSR_IA32_BNDCFGS_RSVD)) 2416 return 1; 2417 2418 if (is_guest_mode(vcpu) && 2419 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2420 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2421 get_vmcs12(vcpu)->guest_bndcfgs = data; 2422 2423 vmcs_write64(GUEST_BNDCFGS, data); 2424 break; 2425 case MSR_IA32_UMWAIT_CONTROL: 2426 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2427 return 1; 2428 
2429 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2430 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2431 return 1; 2432 2433 vmx->msr_ia32_umwait_control = data; 2434 break; 2435 case MSR_IA32_SPEC_CTRL: 2436 if (!msr_info->host_initiated && 2437 !guest_has_spec_ctrl_msr(vcpu)) 2438 return 1; 2439 2440 if (kvm_spec_ctrl_test_value(data)) 2441 return 1; 2442 2443 vmx->spec_ctrl = data; 2444 if (!data) 2445 break; 2446 2447 /* 2448 * For non-nested: 2449 * When it's written (to non-zero) for the first time, pass 2450 * it through. 2451 * 2452 * For nested: 2453 * The handling of the MSR bitmap for L2 guests is done in 2454 * nested_vmx_prepare_msr_bitmap. We should not touch the 2455 * vmcs02.msr_bitmap here since it gets completely overwritten 2456 * in the merging. We update the vmcs01 here for L1 as well 2457 * since it will end up touching the MSR anyway now. 2458 */ 2459 vmx_disable_intercept_for_msr(vcpu, 2460 MSR_IA32_SPEC_CTRL, 2461 MSR_TYPE_RW); 2462 break; 2463 case MSR_IA32_TSX_CTRL: 2464 if (!msr_info->host_initiated && 2465 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2466 return 1; 2467 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2468 return 1; 2469 goto find_uret_msr; 2470 case MSR_IA32_CR_PAT: 2471 ret = kvm_set_msr_common(vcpu, msr_info); 2472 if (ret) 2473 break; 2474 2475 if (is_guest_mode(vcpu) && 2476 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2477 get_vmcs12(vcpu)->guest_ia32_pat = data; 2478 2479 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2480 vmcs_write64(GUEST_IA32_PAT, data); 2481 break; 2482 case MSR_IA32_MCG_EXT_CTL: 2483 if ((!msr_info->host_initiated && 2484 !(to_vmx(vcpu)->msr_ia32_feature_control & 2485 FEAT_CTL_LMCE_ENABLED)) || 2486 (data & ~MCG_EXT_CTL_LMCE_EN)) 2487 return 1; 2488 vcpu->arch.mcg_ext_ctl = data; 2489 break; 2490 case MSR_IA32_FEAT_CTL: 2491 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2492 return 1; 2493 2494 vmx->msr_ia32_feature_control = data; 2495 if (msr_info->host_initiated && data == 0) 2496 vmx_leave_nested(vcpu); 2497 2498 /* SGX may be enabled/disabled by guest's firmware */ 2499 vmx_write_encls_bitmap(vcpu, NULL); 2500 break; 2501 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2502 /* 2503 * On real hardware, the LE hash MSRs are writable before 2504 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2505 * at which point SGX related bits in IA32_FEATURE_CONTROL 2506 * become writable. 2507 * 2508 * KVM does not emulate SGX activation for simplicity, so 2509 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2510 * is unlocked. This is technically not architectural 2511 * behavior, but it's close enough. 2512 */ 2513 if (!msr_info->host_initiated && 2514 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2515 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2516 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2517 return 1; 2518 vmx->msr_ia32_sgxlepubkeyhash 2519 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2520 break; 2521 case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: 2522 if (!msr_info->host_initiated) 2523 return 1; /* they are read-only */ 2524 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2525 return 1; 2526 return vmx_set_vmx_msr(vcpu, msr_index, data); 2527 case MSR_IA32_RTIT_CTL: 2528 if (!vmx_pt_mode_is_host_guest() || 2529 vmx_rtit_ctl_check(vcpu, data) || 2530 vmx->nested.vmxon) 2531 return 1; 2532 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2533 vmx->pt_desc.guest.ctl = data; 2534 pt_update_intercept_for_msr(vcpu); 2535 break; 2536 case MSR_IA32_RTIT_STATUS: 2537 if (!pt_can_write_msr(vmx)) 2538 return 1; 2539 if (data & MSR_IA32_RTIT_STATUS_MASK) 2540 return 1; 2541 vmx->pt_desc.guest.status = data; 2542 break; 2543 case MSR_IA32_RTIT_CR3_MATCH: 2544 if (!pt_can_write_msr(vmx)) 2545 return 1; 2546 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2547 PT_CAP_cr3_filtering)) 2548 return 1; 2549 vmx->pt_desc.guest.cr3_match = data; 2550 break; 2551 case MSR_IA32_RTIT_OUTPUT_BASE: 2552 if (!pt_can_write_msr(vmx)) 2553 return 1; 2554 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2555 PT_CAP_topa_output) && 2556 !intel_pt_validate_cap(vmx->pt_desc.caps, 2557 PT_CAP_single_range_output)) 2558 return 1; 2559 if (!pt_output_base_valid(vcpu, data)) 2560 return 1; 2561 vmx->pt_desc.guest.output_base = data; 2562 break; 2563 case MSR_IA32_RTIT_OUTPUT_MASK: 2564 if (!pt_can_write_msr(vmx)) 2565 return 1; 2566 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2567 PT_CAP_topa_output) && 2568 !intel_pt_validate_cap(vmx->pt_desc.caps, 2569 PT_CAP_single_range_output)) 2570 return 1; 2571 vmx->pt_desc.guest.output_mask = data; 2572 break; 2573 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2574 if (!pt_can_write_msr(vmx)) 2575 return 1; 2576 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2577 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2578 return 1; 2579 if (is_noncanonical_msr_address(data, vcpu)) 2580 return 1; 2581 if (index % 2) 2582 vmx->pt_desc.guest.addr_b[index / 2] = data; 2583 else 2584 vmx->pt_desc.guest.addr_a[index / 2] = data; 2585 break; 2586 case MSR_IA32_S_CET: 2587 vmcs_writel(GUEST_S_CET, data); 2588 break; 2589 case MSR_KVM_INTERNAL_GUEST_SSP: 2590 vmcs_writel(GUEST_SSP, data); 2591 break; 2592 case MSR_IA32_INT_SSP_TAB: 2593 vmcs_writel(GUEST_INTR_SSP_TABLE, data); 2594 break; 2595 case MSR_IA32_PERF_CAPABILITIES: 2596 if (data & PERF_CAP_LBR_FMT) { 2597 if ((data & PERF_CAP_LBR_FMT) != 2598 (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) 2599 return 1; 2600 if (!cpuid_model_is_consistent(vcpu)) 2601 return 1; 2602 } 2603 if (data & PERF_CAP_PEBS_FORMAT) { 2604 if ((data & PERF_CAP_PEBS_MASK) != 2605 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2606 return 1; 2607 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2608 return 1; 2609 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2610 return 1; 2611 if (!cpuid_model_is_consistent(vcpu)) 2612 return 1; 2613 } 2614 ret = kvm_set_msr_common(vcpu, msr_info); 2615 break; 2616 2617 default: 2618 find_uret_msr: 2619 msr = vmx_find_uret_msr(vmx, msr_index); 2620 if (msr) 2621 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2622 else 2623 ret = kvm_set_msr_common(vcpu, msr_info); 2624 } 2625 2626 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2627 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2628 vmx_update_fb_clear_dis(vcpu, vmx); 2629 2630 return ret; 2631 } 2632 2633 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2634 { 2635 unsigned long guest_owned_bits; 2636 2637 kvm_register_mark_available(vcpu, reg); 2638 2639 
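/* Pull the register's current value out of the VMCS into KVM's cache. */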
switch (reg) { 2640 case VCPU_REGS_RSP: 2641 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2642 break; 2643 case VCPU_REGS_RIP: 2644 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2645 break; 2646 case VCPU_EXREG_PDPTR: 2647 if (enable_ept) 2648 ept_save_pdptrs(vcpu); 2649 break; 2650 case VCPU_EXREG_CR0: 2651 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2652 2653 vcpu->arch.cr0 &= ~guest_owned_bits; 2654 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2655 break; 2656 case VCPU_EXREG_CR3: 2657 /* 2658 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2659 * CR3 is loaded into hardware, not the guest's CR3. 2660 */ 2661 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2662 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2663 break; 2664 case VCPU_EXREG_CR4: 2665 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2666 2667 vcpu->arch.cr4 &= ~guest_owned_bits; 2668 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2669 break; 2670 default: 2671 KVM_BUG_ON(1, vcpu->kvm); 2672 break; 2673 } 2674 } 2675 2676 /* 2677 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2678 * directly instead of going through cpu_has(), to ensure KVM is trapping 2679 * ENCLS whenever it's supported in hardware. It does not matter whether 2680 * the host OS supports or has enabled SGX. 2681 */ 2682 static bool cpu_has_sgx(void) 2683 { 2684 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2685 } 2686 2687 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2688 { 2689 u32 vmx_msr_low, vmx_msr_high; 2690 u32 ctl = ctl_min | ctl_opt; 2691 2692 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2693 2694 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2695 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2696 2697 /* Ensure minimum (required) set of control bits are supported. 
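 * A required bit that was cleared by the adjustment above cannot be set
 * to 1 on this CPU, so the requested configuration is unusable.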
*/ 2698 if (ctl_min & ~ctl) 2699 return -EIO; 2700 2701 *result = ctl; 2702 return 0; 2703 } 2704 2705 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2706 { 2707 u64 allowed; 2708 2709 rdmsrq(msr, allowed); 2710 2711 return ctl_opt & allowed; 2712 } 2713 2714 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ 2715 ({ \ 2716 int i, r = 0; \ 2717 \ 2718 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ 2719 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ 2720 \ 2721 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ 2722 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ 2723 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ 2724 \ 2725 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ 2726 continue; \ 2727 \ 2728 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ 2729 "entry = %llx (%llx), exit = %llx (%llx)\n", \ 2730 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ 2731 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ 2732 \ 2733 if (error_on_inconsistent_vmcs_config) \ 2734 r = -EIO; \ 2735 \ 2736 entry_controls &= ~n_ctrl; \ 2737 exit_controls &= ~x_ctrl; \ 2738 } \ 2739 r; \ 2740 }) 2741 2742 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2743 struct vmx_capability *vmx_cap) 2744 { 2745 u32 _pin_based_exec_control = 0; 2746 u32 _cpu_based_exec_control = 0; 2747 u32 _cpu_based_2nd_exec_control = 0; 2748 u64 _cpu_based_3rd_exec_control = 0; 2749 u32 _vmexit_control = 0; 2750 u32 _vmentry_control = 0; 2751 u64 basic_msr; 2752 u64 misc_msr; 2753 2754 /* 2755 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2756 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2757 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
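 * Each pair listed below must be either both supported or both
 * unsupported; vmx_check_entry_exit_pairs() clears any mismatched pair
 * and, if error_on_inconsistent_vmcs_config is set, fails the setup.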
2758 */ 2759 struct { 2760 u32 entry_control; 2761 u32 exit_control; 2762 } const vmcs_entry_exit_pairs[] = { 2763 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2764 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2765 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2766 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2767 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2768 { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, 2769 }; 2770 2771 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2772 2773 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2774 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2775 MSR_IA32_VMX_PROCBASED_CTLS, 2776 &_cpu_based_exec_control)) 2777 return -EIO; 2778 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2779 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2780 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2781 MSR_IA32_VMX_PROCBASED_CTLS2, 2782 &_cpu_based_2nd_exec_control)) 2783 return -EIO; 2784 } 2785 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2786 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2787 2788 #ifndef CONFIG_X86_64 2789 if (!(_cpu_based_2nd_exec_control & 2790 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2791 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2792 #endif 2793 2794 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2795 _cpu_based_2nd_exec_control &= ~( 2796 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2797 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2798 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2799 2800 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2801 &vmx_cap->ept, &vmx_cap->vpid); 2802 2803 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2804 vmx_cap->ept) { 2805 pr_warn_once("EPT CAP should not exist if not support " 2806 "1-setting enable EPT VM-execution control\n"); 2807 2808 if (error_on_inconsistent_vmcs_config) 2809 return -EIO; 2810 2811 vmx_cap->ept = 0; 2812 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2813 } 2814 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2815 vmx_cap->vpid) { 2816 pr_warn_once("VPID CAP should not exist if not support " 2817 "1-setting enable VPID VM-execution control\n"); 2818 2819 if (error_on_inconsistent_vmcs_config) 2820 return -EIO; 2821 2822 vmx_cap->vpid = 0; 2823 } 2824 2825 if (!cpu_has_sgx()) 2826 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2827 2828 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2829 _cpu_based_3rd_exec_control = 2830 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2831 MSR_IA32_VMX_PROCBASED_CTLS3); 2832 2833 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2834 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2835 MSR_IA32_VMX_EXIT_CTLS, 2836 &_vmexit_control)) 2837 return -EIO; 2838 2839 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2840 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2841 MSR_IA32_VMX_PINBASED_CTLS, 2842 &_pin_based_exec_control)) 2843 return -EIO; 2844 2845 if (cpu_has_broken_vmx_preemption_timer()) 2846 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2847 if (!(_cpu_based_2nd_exec_control & 2848 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2849 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2850 2851 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2852 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2853 MSR_IA32_VMX_ENTRY_CTLS, 2854 &_vmentry_control)) 2855 return -EIO; 2856 2857 if 
(vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, 2858 _vmentry_control, _vmexit_control)) 2859 return -EIO; 2860 2861 /* 2862 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2863 * can't be used due to an errata where VM Exit may incorrectly clear 2864 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2865 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2866 */ 2867 switch (boot_cpu_data.x86_vfm) { 2868 case INTEL_NEHALEM_EP: /* AAK155 */ 2869 case INTEL_NEHALEM: /* AAP115 */ 2870 case INTEL_WESTMERE: /* AAT100 */ 2871 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2872 case INTEL_NEHALEM_EX: /* BA97 */ 2873 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2874 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2875 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2876 "does not work properly. Using workaround\n"); 2877 break; 2878 default: 2879 break; 2880 } 2881 2882 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 2883 2884 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2885 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2886 return -EIO; 2887 2888 #ifdef CONFIG_X86_64 2889 /* 2890 * KVM expects to be able to shove all legal physical addresses into 2891 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2892 * 0 for processors that support Intel 64 architecture". 2893 */ 2894 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2895 return -EIO; 2896 #endif 2897 2898 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2899 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2900 return -EIO; 2901 2902 rdmsrq(MSR_IA32_VMX_MISC, misc_msr); 2903 2904 vmcs_conf->basic = basic_msr; 2905 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2906 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2907 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2908 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2909 vmcs_conf->vmexit_ctrl = _vmexit_control; 2910 vmcs_conf->vmentry_ctrl = _vmentry_control; 2911 vmcs_conf->misc = misc_msr; 2912 2913 #if IS_ENABLED(CONFIG_HYPERV) 2914 if (enlightened_vmcs) 2915 evmcs_sanitize_exec_ctrls(vmcs_conf); 2916 #endif 2917 2918 return 0; 2919 } 2920 2921 static bool __kvm_is_vmx_supported(void) 2922 { 2923 int cpu = smp_processor_id(); 2924 2925 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2926 pr_err("VMX not supported by CPU %d\n", cpu); 2927 return false; 2928 } 2929 2930 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2931 !this_cpu_has(X86_FEATURE_VMX)) { 2932 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2933 return false; 2934 } 2935 2936 return true; 2937 } 2938 2939 static bool kvm_is_vmx_supported(void) 2940 { 2941 bool supported; 2942 2943 migrate_disable(); 2944 supported = __kvm_is_vmx_supported(); 2945 migrate_enable(); 2946 2947 return supported; 2948 } 2949 2950 int vmx_check_processor_compat(void) 2951 { 2952 int cpu = raw_smp_processor_id(); 2953 struct vmcs_config vmcs_conf; 2954 struct vmx_capability vmx_cap; 2955 2956 if (!__kvm_is_vmx_supported()) 2957 return -EIO; 2958 2959 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2960 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2961 return -EIO; 2962 } 2963 if (nested) 2964 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2965 2966 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2967 u32 *gold = (void *)&vmcs_config; 2968 u32 *mine = (void *)&vmcs_conf; 2969 int i; 2970 2971 
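/* Compare the configs in 32-bit chunks and dump every chunk that differs. */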
BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32)); 2972 2973 pr_err("VMCS config on CPU %d doesn't match reference config:", cpu); 2974 for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) { 2975 if (gold[i] == mine[i]) 2976 continue; 2977 2978 pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x", 2979 i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]); 2980 } 2981 pr_cont("\n"); 2982 return -EIO; 2983 } 2984 return 0; 2985 } 2986 2987 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2988 { 2989 u64 msr; 2990 2991 cr4_set_bits(X86_CR4_VMXE); 2992 2993 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2994 _ASM_EXTABLE(1b, %l[fault]) 2995 : : [vmxon_pointer] "m"(vmxon_pointer) 2996 : : fault); 2997 return 0; 2998 2999 fault: 3000 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 3001 rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 3002 cr4_clear_bits(X86_CR4_VMXE); 3003 3004 return -EFAULT; 3005 } 3006 3007 int vmx_enable_virtualization_cpu(void) 3008 { 3009 int cpu = raw_smp_processor_id(); 3010 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 3011 int r; 3012 3013 if (cr4_read_shadow() & X86_CR4_VMXE) 3014 return -EBUSY; 3015 3016 /* 3017 * This can happen if we hot-added a CPU but failed to allocate 3018 * VP assist page for it. 3019 */ 3020 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 3021 return -EFAULT; 3022 3023 intel_pt_handle_vmx(1); 3024 3025 r = kvm_cpu_vmxon(phys_addr); 3026 if (r) { 3027 intel_pt_handle_vmx(0); 3028 return r; 3029 } 3030 3031 return 0; 3032 } 3033 3034 static void vmclear_local_loaded_vmcss(void) 3035 { 3036 int cpu = raw_smp_processor_id(); 3037 struct loaded_vmcs *v, *n; 3038 3039 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 3040 loaded_vmcss_on_cpu_link) 3041 __loaded_vmcs_clear(v); 3042 } 3043 3044 void vmx_disable_virtualization_cpu(void) 3045 { 3046 vmclear_local_loaded_vmcss(); 3047 3048 if (kvm_cpu_vmxoff()) 3049 kvm_spurious_fault(); 3050 3051 hv_reset_evmcs(); 3052 3053 intel_pt_handle_vmx(0); 3054 } 3055 3056 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 3057 { 3058 int node = cpu_to_node(cpu); 3059 struct page *pages; 3060 struct vmcs *vmcs; 3061 3062 pages = __alloc_pages_node(node, flags, 0); 3063 if (!pages) 3064 return NULL; 3065 vmcs = page_address(pages); 3066 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 3067 3068 /* KVM supports Enlightened VMCS v1 only */ 3069 if (kvm_is_using_evmcs()) 3070 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 3071 else 3072 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3073 3074 if (shadow) 3075 vmcs->hdr.shadow_vmcs = 1; 3076 return vmcs; 3077 } 3078 3079 void free_vmcs(struct vmcs *vmcs) 3080 { 3081 free_page((unsigned long)vmcs); 3082 } 3083 3084 /* 3085 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3086 */ 3087 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3088 { 3089 if (!loaded_vmcs->vmcs) 3090 return; 3091 loaded_vmcs_clear(loaded_vmcs); 3092 free_vmcs(loaded_vmcs->vmcs); 3093 loaded_vmcs->vmcs = NULL; 3094 if (loaded_vmcs->msr_bitmap) 3095 free_page((unsigned long)loaded_vmcs->msr_bitmap); 3096 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 3097 } 3098 3099 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3100 { 3101 loaded_vmcs->vmcs = alloc_vmcs(false); 3102 if (!loaded_vmcs->vmcs) 3103 return -ENOMEM; 3104 3105 vmcs_clear(loaded_vmcs->vmcs); 3106 3107 loaded_vmcs->shadow_vmcs = NULL; 3108 loaded_vmcs->hv_timer_soft_disabled 
= false; 3109 loaded_vmcs->cpu = -1; 3110 loaded_vmcs->launched = 0; 3111 3112 if (cpu_has_vmx_msr_bitmap()) { 3113 loaded_vmcs->msr_bitmap = (unsigned long *) 3114 __get_free_page(GFP_KERNEL_ACCOUNT); 3115 if (!loaded_vmcs->msr_bitmap) 3116 goto out_vmcs; 3117 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 3118 } 3119 3120 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 3121 memset(&loaded_vmcs->controls_shadow, 0, 3122 sizeof(struct vmcs_controls_shadow)); 3123 3124 return 0; 3125 3126 out_vmcs: 3127 free_loaded_vmcs(loaded_vmcs); 3128 return -ENOMEM; 3129 } 3130 3131 static void free_kvm_area(void) 3132 { 3133 int cpu; 3134 3135 for_each_possible_cpu(cpu) { 3136 free_vmcs(per_cpu(vmxarea, cpu)); 3137 per_cpu(vmxarea, cpu) = NULL; 3138 } 3139 } 3140 3141 static __init int alloc_kvm_area(void) 3142 { 3143 int cpu; 3144 3145 for_each_possible_cpu(cpu) { 3146 struct vmcs *vmcs; 3147 3148 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 3149 if (!vmcs) { 3150 free_kvm_area(); 3151 return -ENOMEM; 3152 } 3153 3154 /* 3155 * When eVMCS is enabled, alloc_vmcs_cpu() sets 3156 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 3157 * revision_id reported by MSR_IA32_VMX_BASIC. 3158 * 3159 * However, even though not explicitly documented by 3160 * TLFS, VMXArea passed as VMXON argument should 3161 * still be marked with revision_id reported by 3162 * physical CPU. 3163 */ 3164 if (kvm_is_using_evmcs()) 3165 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3166 3167 per_cpu(vmxarea, cpu) = vmcs; 3168 } 3169 return 0; 3170 } 3171 3172 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3173 struct kvm_segment *save) 3174 { 3175 if (!emulate_invalid_guest_state) { 3176 /* 3177 * CS and SS RPL should be equal during guest entry according 3178 * to VMX spec, but in reality it is not always so. Since vcpu 3179 * is in the middle of the transition from real mode to 3180 * protected mode it is safe to assume that RPL 0 is a good 3181 * default value. 3182 */ 3183 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3184 save->selector &= ~SEGMENT_RPL_MASK; 3185 save->dpl = save->selector & SEGMENT_RPL_MASK; 3186 save->s = 1; 3187 } 3188 __vmx_set_segment(vcpu, save, seg); 3189 } 3190 3191 static void enter_pmode(struct kvm_vcpu *vcpu) 3192 { 3193 unsigned long flags; 3194 struct vcpu_vmx *vmx = to_vmx(vcpu); 3195 3196 /* 3197 * Update real mode segment cache. It may be not up-to-date if segment 3198 * register was written while vcpu was in a guest mode. 
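 * (The values cached in vmx->rmode.segs here are what fix_pmode_seg()
 * below writes back into the VMCS once VM86 emulation is turned off.)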
3199 */ 3200 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3201 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3202 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3203 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3204 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3205 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3206 3207 vmx->rmode.vm86_active = 0; 3208 3209 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3210 3211 flags = vmcs_readl(GUEST_RFLAGS); 3212 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3213 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3214 vmcs_writel(GUEST_RFLAGS, flags); 3215 3216 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3217 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3218 3219 vmx_update_exception_bitmap(vcpu); 3220 3221 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3222 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3223 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3224 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3225 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3226 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3227 } 3228 3229 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3230 { 3231 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3232 struct kvm_segment var = *save; 3233 3234 var.dpl = 0x3; 3235 if (seg == VCPU_SREG_CS) 3236 var.type = 0x3; 3237 3238 if (!emulate_invalid_guest_state) { 3239 var.selector = var.base >> 4; 3240 var.base = var.base & 0xffff0; 3241 var.limit = 0xffff; 3242 var.g = 0; 3243 var.db = 0; 3244 var.present = 1; 3245 var.s = 1; 3246 var.l = 0; 3247 var.unusable = 0; 3248 var.type = 0x3; 3249 var.avl = 0; 3250 if (save->base & 0xf) 3251 pr_warn_once("segment base is not paragraph aligned " 3252 "when entering protected mode (seg=%d)", seg); 3253 } 3254 3255 vmcs_write16(sf->selector, var.selector); 3256 vmcs_writel(sf->base, var.base); 3257 vmcs_write32(sf->limit, var.limit); 3258 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3259 } 3260 3261 static void enter_rmode(struct kvm_vcpu *vcpu) 3262 { 3263 unsigned long flags; 3264 struct vcpu_vmx *vmx = to_vmx(vcpu); 3265 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3266 3267 /* 3268 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3269 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3270 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3271 * should VM-Fail and KVM should reject userspace attempts to stuff 3272 * CR0.PG=0 when L2 is active. 
3273 */ 3274 WARN_ON_ONCE(is_guest_mode(vcpu)); 3275 3276 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3277 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3278 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3279 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3280 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3281 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3282 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3283 3284 vmx->rmode.vm86_active = 1; 3285 3286 vmx_segment_cache_clear(vmx); 3287 3288 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3289 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3290 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3291 3292 flags = vmcs_readl(GUEST_RFLAGS); 3293 vmx->rmode.save_rflags = flags; 3294 3295 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3296 3297 vmcs_writel(GUEST_RFLAGS, flags); 3298 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3299 vmx_update_exception_bitmap(vcpu); 3300 3301 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3302 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3303 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3304 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3305 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3306 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3307 } 3308 3309 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3310 { 3311 struct vcpu_vmx *vmx = to_vmx(vcpu); 3312 3313 /* Nothing to do if hardware doesn't support EFER. */ 3314 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3315 return 0; 3316 3317 vcpu->arch.efer = efer; 3318 #ifdef CONFIG_X86_64 3319 if (efer & EFER_LMA) 3320 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3321 else 3322 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3323 #else 3324 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3325 return 1; 3326 #endif 3327 3328 vmx_setup_uret_msrs(vmx); 3329 return 0; 3330 } 3331 3332 #ifdef CONFIG_X86_64 3333 3334 static void enter_lmode(struct kvm_vcpu *vcpu) 3335 { 3336 u32 guest_tr_ar; 3337 3338 vmx_segment_cache_clear(to_vmx(vcpu)); 3339 3340 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3341 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3342 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3343 __func__); 3344 vmcs_write32(GUEST_TR_AR_BYTES, 3345 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3346 | VMX_AR_TYPE_BUSY_64_TSS); 3347 } 3348 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3349 } 3350 3351 static void exit_lmode(struct kvm_vcpu *vcpu) 3352 { 3353 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3354 } 3355 3356 #endif 3357 3358 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3359 { 3360 struct vcpu_vmx *vmx = to_vmx(vcpu); 3361 3362 /* 3363 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3364 * the CPU is not required to invalidate guest-physical mappings on 3365 * VM-Entry, even if VPID is disabled. Guest-physical mappings are 3366 * associated with the root EPT structure and not any particular VPID 3367 * (INVVPID also isn't required to invalidate guest-physical mappings). 
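 * (When EPT is disabled, VPID-based invalidation is used instead; if the
 * CPU lacks global INVVPID, vmx->vpid and the nested vpid02 are flushed
 * individually.)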
3368 */ 3369 if (enable_ept) { 3370 ept_sync_global(); 3371 } else if (enable_vpid) { 3372 if (cpu_has_vmx_invvpid_global()) { 3373 vpid_sync_vcpu_global(); 3374 } else { 3375 vpid_sync_vcpu_single(vmx->vpid); 3376 vpid_sync_vcpu_single(vmx->nested.vpid02); 3377 } 3378 } 3379 } 3380 3381 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3382 { 3383 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3384 return nested_get_vpid02(vcpu); 3385 return to_vmx(vcpu)->vpid; 3386 } 3387 3388 static u64 construct_eptp(hpa_t root_hpa) 3389 { 3390 u64 eptp = root_hpa | VMX_EPTP_MT_WB; 3391 struct kvm_mmu_page *root; 3392 3393 if (kvm_mmu_is_dummy_root(root_hpa)) 3394 return eptp | VMX_EPTP_PWL_4; 3395 3396 /* 3397 * EPT roots should always have an associated MMU page. Return a "bad" 3398 * EPTP to induce VM-Fail instead of continuing on in a unknown state. 3399 */ 3400 root = root_to_sp(root_hpa); 3401 if (WARN_ON_ONCE(!root)) 3402 return INVALID_PAGE; 3403 3404 eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3405 3406 if (enable_ept_ad_bits && !root->role.ad_disabled) 3407 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3408 3409 return eptp; 3410 } 3411 3412 static void vmx_flush_tlb_ept_root(hpa_t root_hpa) 3413 { 3414 u64 eptp = construct_eptp(root_hpa); 3415 3416 if (VALID_PAGE(eptp)) 3417 ept_sync_context(eptp); 3418 else 3419 ept_sync_global(); 3420 } 3421 3422 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3423 { 3424 struct kvm_mmu *mmu = vcpu->arch.mmu; 3425 u64 root_hpa = mmu->root.hpa; 3426 3427 /* No flush required if the current context is invalid. */ 3428 if (!VALID_PAGE(root_hpa)) 3429 return; 3430 3431 if (enable_ept) 3432 vmx_flush_tlb_ept_root(root_hpa); 3433 else 3434 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3435 } 3436 3437 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3438 { 3439 /* 3440 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3441 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3442 */ 3443 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3444 } 3445 3446 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3447 { 3448 /* 3449 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3450 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3451 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3452 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3453 * i.e. no explicit INVVPID is necessary. 
3454 */ 3455 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3456 } 3457 3458 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3459 { 3460 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3461 3462 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3463 return; 3464 3465 if (is_pae_paging(vcpu)) { 3466 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3467 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3468 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3469 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3470 } 3471 } 3472 3473 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3474 { 3475 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3476 3477 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3478 return; 3479 3480 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3481 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3482 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3483 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3484 3485 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3486 } 3487 3488 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3489 CPU_BASED_CR3_STORE_EXITING) 3490 3491 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3492 { 3493 if (is_guest_mode(vcpu)) 3494 return nested_guest_cr0_valid(vcpu, cr0); 3495 3496 if (to_vmx(vcpu)->nested.vmxon) 3497 return nested_host_cr0_valid(vcpu, cr0); 3498 3499 return true; 3500 } 3501 3502 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3503 { 3504 struct vcpu_vmx *vmx = to_vmx(vcpu); 3505 unsigned long hw_cr0, old_cr0_pg; 3506 u32 tmp; 3507 3508 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3509 3510 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3511 if (enable_unrestricted_guest) 3512 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3513 else { 3514 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3515 if (!enable_ept) 3516 hw_cr0 |= X86_CR0_WP; 3517 3518 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3519 enter_pmode(vcpu); 3520 3521 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3522 enter_rmode(vcpu); 3523 } 3524 3525 vmcs_writel(CR0_READ_SHADOW, cr0); 3526 vmcs_writel(GUEST_CR0, hw_cr0); 3527 vcpu->arch.cr0 = cr0; 3528 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3529 3530 #ifdef CONFIG_X86_64 3531 if (vcpu->arch.efer & EFER_LME) { 3532 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3533 enter_lmode(vcpu); 3534 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3535 exit_lmode(vcpu); 3536 } 3537 #endif 3538 3539 if (enable_ept && !enable_unrestricted_guest) { 3540 /* 3541 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3542 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3543 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3544 * KVM's CR3 is installed. 3545 */ 3546 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3547 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3548 3549 /* 3550 * When running with EPT but not unrestricted guest, KVM must 3551 * intercept CR3 accesses when paging is _disabled_. This is 3552 * necessary because restricted guests can't actually run with 3553 * paging disabled, and so KVM stuffs its own CR3 in order to 3554 * run the guest when identity mapped page tables. 3555 * 3556 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3557 * update, it may be stale with respect to CR3 interception, 3558 * e.g. after nested VM-Enter. 3559 * 3560 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3561 * stores to forward them to L1, even if KVM does not need to 3562 * intercept them to preserve its identity mapped page tables. 
3563 */ 3564 if (!(cr0 & X86_CR0_PG)) { 3565 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3566 } else if (!is_guest_mode(vcpu)) { 3567 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3568 } else { 3569 tmp = exec_controls_get(vmx); 3570 tmp &= ~CR3_EXITING_BITS; 3571 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3572 exec_controls_set(vmx, tmp); 3573 } 3574 3575 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ 3576 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3577 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3578 3579 /* 3580 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3581 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3582 */ 3583 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3584 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3585 } 3586 3587 /* depends on vcpu->arch.cr0 to be set to a new value */ 3588 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 3589 } 3590 3591 static int vmx_get_max_ept_level(void) 3592 { 3593 if (cpu_has_vmx_ept_5levels()) 3594 return 5; 3595 return 4; 3596 } 3597 3598 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3599 { 3600 struct kvm *kvm = vcpu->kvm; 3601 bool update_guest_cr3 = true; 3602 unsigned long guest_cr3; 3603 3604 if (enable_ept) { 3605 KVM_MMU_WARN_ON(root_to_sp(root_hpa) && 3606 root_level != root_to_sp(root_hpa)->role.level); 3607 vmcs_write64(EPT_POINTER, construct_eptp(root_hpa)); 3608 3609 hv_track_root_tdp(vcpu, root_hpa); 3610 3611 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3612 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3613 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3614 guest_cr3 = vcpu->arch.cr3; 3615 else /* vmcs.GUEST_CR3 is already up-to-date. */ 3616 update_guest_cr3 = false; 3617 vmx_ept_load_pdptrs(vcpu); 3618 } else { 3619 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3620 kvm_get_active_cr3_lam_bits(vcpu); 3621 } 3622 3623 if (update_guest_cr3) 3624 vmcs_writel(GUEST_CR3, guest_cr3); 3625 } 3626 3627 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3628 { 3629 /* 3630 * We operate under the default treatment of SMM, so VMX cannot be 3631 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3632 * i.e. is a reserved bit, is handled by common x86 code. 3633 */ 3634 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3635 return false; 3636 3637 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3638 return false; 3639 3640 return true; 3641 } 3642 3643 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3644 { 3645 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3646 struct vcpu_vmx *vmx = to_vmx(vcpu); 3647 unsigned long hw_cr4; 3648 3649 /* 3650 * Pass through host's Machine Check Enable value to hw_cr4, which 3651 * is in force while we are in guest mode. Do not let guests control 3652 * this bit, even if host CR4.MCE == 0. 
3653 */ 3654 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3655 if (enable_unrestricted_guest) 3656 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3657 else if (vmx->rmode.vm86_active) 3658 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3659 else 3660 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3661 3662 if (vmx_umip_emulated()) { 3663 if (cr4 & X86_CR4_UMIP) { 3664 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3665 hw_cr4 &= ~X86_CR4_UMIP; 3666 } else if (!is_guest_mode(vcpu) || 3667 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3668 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3669 } 3670 } 3671 3672 vcpu->arch.cr4 = cr4; 3673 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3674 3675 if (!enable_unrestricted_guest) { 3676 if (enable_ept) { 3677 if (!is_paging(vcpu)) { 3678 hw_cr4 &= ~X86_CR4_PAE; 3679 hw_cr4 |= X86_CR4_PSE; 3680 } else if (!(cr4 & X86_CR4_PAE)) { 3681 hw_cr4 &= ~X86_CR4_PAE; 3682 } 3683 } 3684 3685 /* 3686 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3687 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3688 * to be manually disabled when guest switches to non-paging 3689 * mode. 3690 * 3691 * If !enable_unrestricted_guest, the CPU is always running 3692 * with CR0.PG=1 and CR4 needs to be modified. 3693 * If enable_unrestricted_guest, the CPU automatically 3694 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 3695 */ 3696 if (!is_paging(vcpu)) 3697 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3698 } 3699 3700 vmcs_writel(CR4_READ_SHADOW, cr4); 3701 vmcs_writel(GUEST_CR4, hw_cr4); 3702 3703 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3704 vcpu->arch.cpuid_dynamic_bits_dirty = true; 3705 } 3706 3707 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3708 { 3709 struct vcpu_vmx *vmx = to_vmx(vcpu); 3710 u32 ar; 3711 3712 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3713 *var = vmx->rmode.segs[seg]; 3714 if (seg == VCPU_SREG_TR 3715 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3716 return; 3717 var->base = vmx_read_guest_seg_base(vmx, seg); 3718 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3719 return; 3720 } 3721 var->base = vmx_read_guest_seg_base(vmx, seg); 3722 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3723 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3724 ar = vmx_read_guest_seg_ar(vmx, seg); 3725 var->unusable = (ar >> 16) & 1; 3726 var->type = ar & 15; 3727 var->s = (ar >> 4) & 1; 3728 var->dpl = (ar >> 5) & 3; 3729 /* 3730 * Some userspaces do not preserve unusable property. Since usable 3731 * segment has to be present according to VMX spec we can use present 3732 * property to amend userspace bug by making unusable segment always 3733 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3734 * segment as unusable. 
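 *
 * For reference, two example AR encodings as decoded by the bitfield
 * extraction above:
 *
 *	ar = 0x0093  -> type=3 (accessed r/w data), s=1, dpl=0, and the
 *	                unusable bit (bit 16) clear, so present is 1
 *	ar = 0x10000 -> unusable=1, so the segment is reported to
 *	                userspace as not present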
3735 */ 3736 var->present = !var->unusable; 3737 var->avl = (ar >> 12) & 1; 3738 var->l = (ar >> 13) & 1; 3739 var->db = (ar >> 14) & 1; 3740 var->g = (ar >> 15) & 1; 3741 } 3742 3743 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3744 { 3745 struct kvm_segment s; 3746 3747 if (to_vmx(vcpu)->rmode.vm86_active) { 3748 vmx_get_segment(vcpu, &s, seg); 3749 return s.base; 3750 } 3751 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3752 } 3753 3754 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3755 { 3756 struct vcpu_vmx *vmx = to_vmx(vcpu); 3757 int ar; 3758 3759 if (unlikely(vmx->rmode.vm86_active)) 3760 return 0; 3761 3762 if (no_cache) 3763 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3764 else 3765 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3766 return VMX_AR_DPL(ar); 3767 } 3768 3769 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3770 { 3771 return __vmx_get_cpl(vcpu, false); 3772 } 3773 3774 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3775 { 3776 return __vmx_get_cpl(vcpu, true); 3777 } 3778 3779 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3780 { 3781 u32 ar; 3782 3783 ar = var->type & 15; 3784 ar |= (var->s & 1) << 4; 3785 ar |= (var->dpl & 3) << 5; 3786 ar |= (var->present & 1) << 7; 3787 ar |= (var->avl & 1) << 12; 3788 ar |= (var->l & 1) << 13; 3789 ar |= (var->db & 1) << 14; 3790 ar |= (var->g & 1) << 15; 3791 ar |= (var->unusable || !var->present) << 16; 3792 3793 return ar; 3794 } 3795 3796 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3797 { 3798 struct vcpu_vmx *vmx = to_vmx(vcpu); 3799 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3800 3801 vmx_segment_cache_clear(vmx); 3802 3803 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3804 vmx->rmode.segs[seg] = *var; 3805 if (seg == VCPU_SREG_TR) 3806 vmcs_write16(sf->selector, var->selector); 3807 else if (var->s) 3808 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3809 return; 3810 } 3811 3812 vmcs_writel(sf->base, var->base); 3813 vmcs_write32(sf->limit, var->limit); 3814 vmcs_write16(sf->selector, var->selector); 3815 3816 /* 3817 * Fix the "Accessed" bit in AR field of segment registers for older 3818 * qemu binaries. 3819 * IA32 arch specifies that at the time of processor reset the 3820 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3821 * is setting it to 0 in the userland code. This causes invalid guest 3822 * state vmexit when "unrestricted guest" mode is turned on. 3823 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3824 * tree. Newer qemu binaries with that qemu fix would not need this 3825 * kvm hack. 
3826 */ 3827 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3828 var->type |= 0x1; /* Accessed */ 3829 3830 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3831 } 3832 3833 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3834 { 3835 __vmx_set_segment(vcpu, var, seg); 3836 3837 to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); 3838 } 3839 3840 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3841 { 3842 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3843 3844 *db = (ar >> 14) & 1; 3845 *l = (ar >> 13) & 1; 3846 } 3847 3848 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3849 { 3850 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3851 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3852 } 3853 3854 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3855 { 3856 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3857 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3858 } 3859 3860 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3861 { 3862 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3863 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3864 } 3865 3866 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3867 { 3868 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3869 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3870 } 3871 3872 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3873 { 3874 struct kvm_segment var; 3875 u32 ar; 3876 3877 vmx_get_segment(vcpu, &var, seg); 3878 var.dpl = 0x3; 3879 if (seg == VCPU_SREG_CS) 3880 var.type = 0x3; 3881 ar = vmx_segment_access_rights(&var); 3882 3883 if (var.base != (var.selector << 4)) 3884 return false; 3885 if (var.limit != 0xffff) 3886 return false; 3887 if (ar != 0xf3) 3888 return false; 3889 3890 return true; 3891 } 3892 3893 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3894 { 3895 struct kvm_segment cs; 3896 unsigned int cs_rpl; 3897 3898 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3899 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3900 3901 if (cs.unusable) 3902 return false; 3903 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3904 return false; 3905 if (!cs.s) 3906 return false; 3907 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3908 if (cs.dpl > cs_rpl) 3909 return false; 3910 } else { 3911 if (cs.dpl != cs_rpl) 3912 return false; 3913 } 3914 if (!cs.present) 3915 return false; 3916 3917 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3918 return true; 3919 } 3920 3921 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3922 { 3923 struct kvm_segment ss; 3924 unsigned int ss_rpl; 3925 3926 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3927 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3928 3929 if (ss.unusable) 3930 return true; 3931 if (ss.type != 3 && ss.type != 7) 3932 return false; 3933 if (!ss.s) 3934 return false; 3935 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3936 return false; 3937 if (!ss.present) 3938 return false; 3939 3940 return true; 3941 } 3942 3943 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3944 { 3945 struct kvm_segment var; 3946 unsigned int rpl; 3947 3948 vmx_get_segment(vcpu, &var, seg); 3949 rpl = var.selector & SEGMENT_RPL_MASK; 3950 3951 if (var.unusable) 3952 return true; 3953 if (!var.s) 3954 return false; 3955 if (!var.present) 3956 return false; 3957 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3958 if (var.dpl < rpl) /* DPL < RPL */ 3959 return false; 3960 } 3961 3962 /* TODO: Add other 
members to kvm_segment_field to allow checking for other access 3963 * rights flags 3964 */ 3965 return true; 3966 } 3967 3968 static bool tr_valid(struct kvm_vcpu *vcpu) 3969 { 3970 struct kvm_segment tr; 3971 3972 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3973 3974 if (tr.unusable) 3975 return false; 3976 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3977 return false; 3978 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3979 return false; 3980 if (!tr.present) 3981 return false; 3982 3983 return true; 3984 } 3985 3986 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3987 { 3988 struct kvm_segment ldtr; 3989 3990 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3991 3992 if (ldtr.unusable) 3993 return true; 3994 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3995 return false; 3996 if (ldtr.type != 2) 3997 return false; 3998 if (!ldtr.present) 3999 return false; 4000 4001 return true; 4002 } 4003 4004 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 4005 { 4006 struct kvm_segment cs, ss; 4007 4008 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 4009 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 4010 4011 return ((cs.selector & SEGMENT_RPL_MASK) == 4012 (ss.selector & SEGMENT_RPL_MASK)); 4013 } 4014 4015 /* 4016 * Check if guest state is valid. Returns true if valid, false if 4017 * not. 4018 * We assume that registers are always usable 4019 */ 4020 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 4021 { 4022 /* real mode guest state checks */ 4023 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 4024 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 4025 return false; 4026 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 4027 return false; 4028 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 4029 return false; 4030 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 4031 return false; 4032 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 4033 return false; 4034 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 4035 return false; 4036 } else { 4037 /* protected mode guest state checks */ 4038 if (!cs_ss_rpl_check(vcpu)) 4039 return false; 4040 if (!code_segment_valid(vcpu)) 4041 return false; 4042 if (!stack_segment_valid(vcpu)) 4043 return false; 4044 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 4045 return false; 4046 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 4047 return false; 4048 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 4049 return false; 4050 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 4051 return false; 4052 if (!tr_valid(vcpu)) 4053 return false; 4054 if (!ldtr_valid(vcpu)) 4055 return false; 4056 } 4057 /* TODO: 4058 * - Add checks on RIP 4059 * - Add checks on RFLAGS 4060 */ 4061 4062 return true; 4063 } 4064 4065 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 4066 { 4067 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 4068 u16 data; 4069 int i; 4070 4071 for (i = 0; i < 3; i++) { 4072 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 4073 return -EFAULT; 4074 } 4075 4076 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 4077 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 4078 return -EFAULT; 4079 4080 data = ~0; 4081 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 4082 return -EFAULT; 4083 4084 return 0; 4085 } 4086 4087 static int init_rmode_identity_map(struct kvm *kvm) 4088 { 4089 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4090 int i, r = 0; 4091 void __user *uaddr; 4092 u32 tmp; 4093 4094 /* Protect kvm_vmx->ept_identity_pagetable_done. 
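 *
 * The loop below fills a single page with 1024 32-bit PSE page directory
 * entries, where entry i identity-maps the 4 MiB chunk starting at
 * (i << 22), e.g.
 *
 *	i = 3: tmp = 0x00c00000 | PRESENT | RW | USER | ACCESSED |
 *		     DIRTY | PSE
 *
 * so one page of entries covers the low 4 GiB of guest physical memory.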
*/ 4095 mutex_lock(&kvm->slots_lock); 4096 4097 if (likely(kvm_vmx->ept_identity_pagetable_done)) 4098 goto out; 4099 4100 if (!kvm_vmx->ept_identity_map_addr) 4101 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4102 4103 uaddr = __x86_set_memory_region(kvm, 4104 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 4105 kvm_vmx->ept_identity_map_addr, 4106 PAGE_SIZE); 4107 if (IS_ERR(uaddr)) { 4108 r = PTR_ERR(uaddr); 4109 goto out; 4110 } 4111 4112 /* Set up identity-mapping pagetable for EPT in real mode */ 4113 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 4114 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 4115 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 4116 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 4117 r = -EFAULT; 4118 goto out; 4119 } 4120 } 4121 kvm_vmx->ept_identity_pagetable_done = true; 4122 4123 out: 4124 mutex_unlock(&kvm->slots_lock); 4125 return r; 4126 } 4127 4128 static void seg_setup(int seg) 4129 { 4130 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 4131 unsigned int ar; 4132 4133 vmcs_write16(sf->selector, 0); 4134 vmcs_writel(sf->base, 0); 4135 vmcs_write32(sf->limit, 0xffff); 4136 ar = 0x93; 4137 if (seg == VCPU_SREG_CS) 4138 ar |= 0x08; /* code segment */ 4139 4140 vmcs_write32(sf->ar_bytes, ar); 4141 } 4142 4143 int allocate_vpid(void) 4144 { 4145 int vpid; 4146 4147 if (!enable_vpid) 4148 return 0; 4149 spin_lock(&vmx_vpid_lock); 4150 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4151 if (vpid < VMX_NR_VPIDS) 4152 __set_bit(vpid, vmx_vpid_bitmap); 4153 else 4154 vpid = 0; 4155 spin_unlock(&vmx_vpid_lock); 4156 return vpid; 4157 } 4158 4159 void free_vpid(int vpid) 4160 { 4161 if (!enable_vpid || vpid == 0) 4162 return; 4163 spin_lock(&vmx_vpid_lock); 4164 __clear_bit(vpid, vmx_vpid_bitmap); 4165 spin_unlock(&vmx_vpid_lock); 4166 } 4167 4168 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 4169 { 4170 /* 4171 * When KVM is a nested hypervisor on top of Hyper-V and uses 4172 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 4173 * bitmap has changed. 4174 */ 4175 if (kvm_is_using_evmcs()) { 4176 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 4177 4178 if (evmcs->hv_enlightenments_control.msr_bitmap) 4179 evmcs->hv_clean_fields &= 4180 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 4181 } 4182 4183 vmx->nested.force_msr_bitmap_recalc = true; 4184 } 4185 4186 void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) 4187 { 4188 struct vcpu_vmx *vmx = to_vmx(vcpu); 4189 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4190 4191 if (!cpu_has_vmx_msr_bitmap()) 4192 return; 4193 4194 vmx_msr_bitmap_l01_changed(vmx); 4195 4196 if (type & MSR_TYPE_R) { 4197 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 4198 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4199 else 4200 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4201 } 4202 4203 if (type & MSR_TYPE_W) { 4204 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 4205 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4206 else 4207 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4208 } 4209 } 4210 4211 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4212 { 4213 /* 4214 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4215 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4216 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 
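 *
 * Worked out for the constants below: APIC_BASE_MSR is 0x800 and the
 * write half of the MSR bitmap starts 0x800 bytes into the page, so
 *
 *	read_idx  = 0x800 / 64               = 32
 *	write_idx = 32 + 0x800 / sizeof(u64) = 32 + 256 = 288
 *
 * and each of those u64 words covers exactly MSRs 0x800 - 0x83f.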
4217 */ 4218 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4219 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4220 struct vcpu_vmx *vmx = to_vmx(vcpu); 4221 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4222 u8 mode; 4223 4224 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4225 return; 4226 4227 if (cpu_has_secondary_exec_ctrls() && 4228 (secondary_exec_controls_get(vmx) & 4229 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4230 mode = MSR_BITMAP_MODE_X2APIC; 4231 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4232 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4233 } else { 4234 mode = 0; 4235 } 4236 4237 if (mode == vmx->x2apic_msr_bitmap_mode) 4238 return; 4239 4240 vmx->x2apic_msr_bitmap_mode = mode; 4241 4242 /* 4243 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4244 * registers (0x840 and above) intercepted, KVM doesn't support them. 4245 * Intercept all writes by default and poke holes as needed. Pass 4246 * through reads for all valid registers by default in x2APIC+APICv 4247 * mode, only the current timer count needs on-demand emulation by KVM. 4248 */ 4249 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4250 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4251 else 4252 msr_bitmap[read_idx] = ~0ull; 4253 msr_bitmap[write_idx] = ~0ull; 4254 4255 /* 4256 * TPR reads and writes can be virtualized even if virtual interrupt 4257 * delivery is not in use. 4258 */ 4259 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4260 !(mode & MSR_BITMAP_MODE_X2APIC)); 4261 4262 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4263 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4264 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4265 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4266 if (enable_ipiv) 4267 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4268 } 4269 } 4270 4271 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4272 { 4273 struct vcpu_vmx *vmx = to_vmx(vcpu); 4274 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4275 u32 i; 4276 4277 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4278 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4279 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4280 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4281 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4282 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4283 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4284 } 4285 } 4286 4287 static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) 4288 { 4289 u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | 4290 VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; 4291 bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); 4292 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 4293 struct vcpu_vmx *vmx = to_vmx(vcpu); 4294 bool intercept = !has_mediated_pmu; 4295 int i; 4296 4297 if (!enable_mediated_pmu) 4298 return; 4299 4300 if (!cpu_has_save_perf_global_ctrl()) { 4301 vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; 4302 4303 if (has_mediated_pmu) 4304 vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); 4305 else 4306 vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); 4307 } 4308 4309 vm_entry_controls_changebit(vmx, 
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 4310 has_mediated_pmu); 4311 4312 vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); 4313 4314 for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 4315 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4316 MSR_TYPE_RW, intercept); 4317 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, 4318 intercept || !fw_writes_is_enabled(vcpu)); 4319 } 4320 for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { 4321 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4322 MSR_TYPE_RW, true); 4323 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, 4324 MSR_TYPE_RW, true); 4325 } 4326 4327 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 4328 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4329 MSR_TYPE_RW, intercept); 4330 for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) 4331 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4332 MSR_TYPE_RW, true); 4333 4334 intercept = kvm_need_perf_global_ctrl_intercept(vcpu); 4335 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, 4336 MSR_TYPE_RW, intercept); 4337 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4338 MSR_TYPE_RW, intercept); 4339 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 4340 MSR_TYPE_RW, intercept); 4341 } 4342 4343 static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4344 { 4345 bool intercept; 4346 4347 if (!cpu_has_vmx_msr_bitmap()) 4348 return; 4349 4350 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 4351 #ifdef CONFIG_X86_64 4352 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 4353 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 4354 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 4355 #endif 4356 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 4357 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 4358 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 4359 if (kvm_cstate_in_guest(vcpu->kvm)) { 4360 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 4361 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 4362 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 4363 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 4364 } 4365 if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 4366 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 4367 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 4368 } 4369 4370 /* PT MSRs can be passed through iff PT is exposed to the guest. 
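 *
 * The general pattern used throughout this function is to intercept an
 * MSR unless the corresponding feature is exposed to the guest, e.g.
 * (with a purely hypothetical MSR and feature bit):
 *
 *	vmx_set_intercept_for_msr(vcpu, MSR_HYPOTHETICAL, MSR_TYPE_RW,
 *				  !guest_cpu_cap_has(vcpu, X86_FEATURE_FOO));
 *
 * where set == true means "intercept" and set == false means "pass
 * through, subject to userspace MSR filtering".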
*/ 4371 if (vmx_pt_mode_is_host_guest()) 4372 pt_update_intercept_for_msr(vcpu); 4373 4374 if (vcpu->arch.xfd_no_write_intercept) 4375 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); 4376 4377 vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 4378 !to_vmx(vcpu)->spec_ctrl); 4379 4380 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 4381 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 4382 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 4383 4384 if (cpu_feature_enabled(X86_FEATURE_IBPB)) 4385 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 4386 !guest_has_pred_cmd_msr(vcpu)); 4387 4388 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 4389 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 4390 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4391 4392 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 4393 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4394 4395 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); 4396 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); 4397 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); 4398 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); 4399 } 4400 4401 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { 4402 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && 4403 !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4404 4405 vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); 4406 vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); 4407 } 4408 4409 vmx_recalc_pmu_msr_intercepts(vcpu); 4410 4411 /* 4412 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be 4413 * filtered by userspace. 4414 */ 4415 } 4416 4417 static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 4418 { 4419 exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING, 4420 kvm_need_rdpmc_intercept(vcpu)); 4421 } 4422 4423 void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) 4424 { 4425 vmx_recalc_instruction_intercepts(vcpu); 4426 vmx_recalc_msr_intercepts(vcpu); 4427 } 4428 4429 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4430 int vector) 4431 { 4432 struct vcpu_vmx *vmx = to_vmx(vcpu); 4433 4434 /* 4435 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4436 * and freed, and must not be accessed outside of vcpu->mutex. The 4437 * vCPU's cached PI NV is valid if and only if posted interrupts 4438 * enabled in its vmcs12, i.e. checking the vector also checks that 4439 * L1 has enabled posted interrupts for L2. 4440 */ 4441 if (is_guest_mode(vcpu) && 4442 vector == vmx->nested.posted_intr_nv) { 4443 /* 4444 * If a posted intr is not recognized by hardware, 4445 * we will accomplish it in the next vmentry. 4446 */ 4447 vmx->nested.pi_pending = true; 4448 kvm_make_request(KVM_REQ_EVENT, vcpu); 4449 4450 /* 4451 * This pairs with the smp_mb_*() after setting vcpu->mode in 4452 * vcpu_enter_guest() to guarantee the vCPU sees the event 4453 * request if triggering a posted interrupt "fails" because 4454 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4455 * the smb_wmb() in kvm_make_request() only ensures everything 4456 * done before making the request is visible when the request 4457 * is visible, it doesn't ensure ordering between the store to 4458 * vcpu->requests and the load from vcpu->mode. 4459 */ 4460 smp_mb__after_atomic(); 4461 4462 /* the PIR and ON have been set by L1. 
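 *
 * Sketch of the race the smp_mb__after_atomic() above guards against
 * (the target-side barrier is the one referenced in the comment above):
 *
 *	this CPU                           target vCPU's CPU
 *	--------                           -----------------
 *	set KVM_REQ_EVENT                  vcpu->mode = IN_GUEST_MODE
 *	smp_mb__after_atomic()             smp_mb()
 *	read vcpu->mode                    read vcpu->requests
 *
 * Without the barriers, this CPU could see a stale vcpu->mode while the
 * target misses both the posted interrupt and the event request.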
*/ 4463 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4464 return 0; 4465 } 4466 return -1; 4467 } 4468 /* 4469 * Send interrupt to vcpu via posted interrupt way. 4470 * 1. If target vcpu is running(non-root mode), send posted interrupt 4471 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4472 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4473 * interrupt from PIR in next vmentry. 4474 */ 4475 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4476 { 4477 struct vcpu_vt *vt = to_vt(vcpu); 4478 int r; 4479 4480 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4481 if (!r) 4482 return 0; 4483 4484 /* Note, this is called iff the local APIC is in-kernel. */ 4485 if (!vcpu->arch.apic->apicv_active) 4486 return -1; 4487 4488 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); 4489 return 0; 4490 } 4491 4492 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4493 int trig_mode, int vector) 4494 { 4495 struct kvm_vcpu *vcpu = apic->vcpu; 4496 4497 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4498 kvm_lapic_set_irr(vector, apic); 4499 kvm_make_request(KVM_REQ_EVENT, vcpu); 4500 kvm_vcpu_kick(vcpu); 4501 } else { 4502 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4503 trig_mode, vector); 4504 } 4505 } 4506 4507 /* 4508 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4509 * will not change in the lifetime of the guest. 4510 * Note that host-state that does change is set elsewhere. E.g., host-state 4511 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4512 */ 4513 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4514 { 4515 u32 low32, high32; 4516 unsigned long tmpl; 4517 unsigned long cr0, cr3, cr4; 4518 4519 cr0 = read_cr0(); 4520 WARN_ON(cr0 & X86_CR0_TS); 4521 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4522 4523 /* 4524 * Save the most likely value for this task's CR3 in the VMCS. 4525 * We can't use __get_current_cr3_fast() because we're not atomic. 4526 */ 4527 cr3 = __read_cr3(); 4528 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4529 vmx->loaded_vmcs->host_state.cr3 = cr3; 4530 4531 /* Save the most likely value for this task's CR4 in the VMCS. */ 4532 cr4 = cr4_read_shadow(); 4533 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4534 vmx->loaded_vmcs->host_state.cr4 = cr4; 4535 4536 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4537 #ifdef CONFIG_X86_64 4538 /* 4539 * Load null selectors, so we can avoid reloading them in 4540 * vmx_prepare_switch_to_host(), in case userspace uses 4541 * the null selectors too (the expected case). 4542 */ 4543 vmcs_write16(HOST_DS_SELECTOR, 0); 4544 vmcs_write16(HOST_ES_SELECTOR, 0); 4545 #else 4546 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4547 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4548 #endif 4549 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4550 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4551 4552 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4553 4554 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4555 4556 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4557 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4558 4559 /* 4560 * SYSENTER is used for 32-bit system calls on either 32-bit or 4561 * 64-bit kernels. 
It is always zero If neither is allowed, otherwise 4562 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4563 * have already done so!). 4564 */ 4565 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4566 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4567 4568 rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); 4569 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4570 4571 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4572 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4573 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4574 } 4575 4576 if (cpu_has_load_ia32_efer()) 4577 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4578 4579 /* 4580 * Supervisor shadow stack is not enabled on host side, i.e., 4581 * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM 4582 * description(RDSSP instruction), SSP is not readable in CPL0, 4583 * so resetting the two registers to 0s at VM-Exit does no harm 4584 * to kernel execution. When execution flow exits to userspace, 4585 * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter 4586 * 3 and 4 for details. 4587 */ 4588 if (cpu_has_load_cet_ctrl()) { 4589 vmcs_writel(HOST_S_CET, kvm_host.s_cet); 4590 vmcs_writel(HOST_SSP, 0); 4591 vmcs_writel(HOST_INTR_SSP_TABLE, 0); 4592 } 4593 4594 /* 4595 * When running a guest with a mediated PMU, guest state is resident in 4596 * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host 4597 * activity doesn't bleed into the guest counters. When running with 4598 * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every 4599 * entry/exit to merge guest and host PMU usage. 4600 */ 4601 if (enable_mediated_pmu) 4602 vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0); 4603 } 4604 4605 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4606 { 4607 struct kvm_vcpu *vcpu = &vmx->vcpu; 4608 4609 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4610 ~vcpu->arch.cr4_guest_rsvd_bits; 4611 if (!enable_ept) { 4612 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4613 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4614 } 4615 if (is_guest_mode(&vmx->vcpu)) 4616 vcpu->arch.cr4_guest_owned_bits &= 4617 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4618 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4619 } 4620 4621 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4622 { 4623 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4624 4625 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4626 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4627 4628 if (!enable_vnmi) 4629 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4630 4631 if (!enable_preemption_timer) 4632 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4633 4634 return pin_based_exec_ctrl; 4635 } 4636 4637 static u32 vmx_get_initial_vmentry_ctrl(void) 4638 { 4639 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4640 4641 if (vmx_pt_mode_is_system()) 4642 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4643 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4644 /* 4645 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 4646 */ 4647 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4648 VM_ENTRY_LOAD_IA32_EFER | 4649 VM_ENTRY_IA32E_MODE); 4650 4651 return vmentry_ctrl; 4652 } 4653 4654 static u32 vmx_get_initial_vmexit_ctrl(void) 4655 { 4656 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4657 4658 /* 4659 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4660 * nested virtualization and thus allowed to be set in vmcs12. 
4661 */ 4662 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4663 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4664 4665 if (vmx_pt_mode_is_system()) 4666 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4667 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4668 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4669 return vmexit_ctrl & 4670 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER | 4671 VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL); 4672 } 4673 4674 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4675 { 4676 struct vcpu_vmx *vmx = to_vmx(vcpu); 4677 4678 guard(vmx_vmcs01)(vcpu); 4679 4680 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4681 4682 secondary_exec_controls_changebit(vmx, 4683 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4684 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, 4685 kvm_vcpu_apicv_active(vcpu)); 4686 if (enable_ipiv) 4687 tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, 4688 kvm_vcpu_apicv_active(vcpu)); 4689 4690 vmx_update_msr_bitmap_x2apic(vcpu); 4691 } 4692 4693 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4694 { 4695 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4696 4697 /* 4698 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4699 * vmcs12 and propagated to vmcs02 when set in vmcs12. 4700 */ 4701 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4702 CPU_BASED_USE_IO_BITMAPS | 4703 CPU_BASED_MONITOR_TRAP_FLAG | 4704 CPU_BASED_PAUSE_EXITING); 4705 4706 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4707 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4708 CPU_BASED_NMI_WINDOW_EXITING); 4709 4710 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4711 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4712 4713 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4714 exec_control &= ~CPU_BASED_TPR_SHADOW; 4715 4716 #ifdef CONFIG_X86_64 4717 if (exec_control & CPU_BASED_TPR_SHADOW) 4718 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4719 CPU_BASED_CR8_STORE_EXITING); 4720 else 4721 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4722 CPU_BASED_CR8_LOAD_EXITING; 4723 #endif 4724 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4725 if (enable_ept) 4726 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4727 CPU_BASED_CR3_STORE_EXITING | 4728 CPU_BASED_INVLPG_EXITING); 4729 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4730 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4731 CPU_BASED_MONITOR_EXITING); 4732 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4733 exec_control &= ~CPU_BASED_HLT_EXITING; 4734 return exec_control; 4735 } 4736 4737 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4738 { 4739 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4740 4741 /* 4742 * IPI virtualization relies on APICv. Disable IPI virtualization if 4743 * APICv is inhibited. 4744 */ 4745 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4746 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4747 4748 return exec_control; 4749 } 4750 4751 /* 4752 * Adjust a single secondary execution control bit to intercept/allow an 4753 * instruction in the guest. This is usually done based on whether or not a 4754 * feature has been exposed to the guest in order to correctly emulate faults. 4755 */ 4756 static inline void 4757 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4758 u32 control, bool enabled, bool exiting) 4759 { 4760 /* 4761 * If the control is for an opt-in feature, clear the control if the 4762 * feature is not exposed to the guest, i.e. not enabled. If the 4763 * control is opt-out, i.e. 
an exiting control, clear the control if 4764 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4765 * disabled for the associated instruction. Note, the caller is 4766 * responsible presetting exec_control to set all supported bits. 4767 */ 4768 if (enabled == exiting) 4769 *exec_control &= ~control; 4770 4771 /* 4772 * Update the nested MSR settings so that a nested VMM can/can't set 4773 * controls for features that are/aren't exposed to the guest. 4774 */ 4775 if (nested && 4776 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4777 /* 4778 * All features that can be added or removed to VMX MSRs must 4779 * be supported in the first place for nested virtualization. 4780 */ 4781 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4782 enabled = false; 4783 4784 if (enabled) 4785 vmx->nested.msrs.secondary_ctls_high |= control; 4786 else 4787 vmx->nested.msrs.secondary_ctls_high &= ~control; 4788 } 4789 } 4790 4791 /* 4792 * Wrapper macro for the common case of adjusting a secondary execution control 4793 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4794 * verifies that the control is actually supported by KVM and hardware. 4795 */ 4796 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4797 ({ \ 4798 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4799 bool __enabled; \ 4800 \ 4801 if (cpu_has_vmx_##name()) { \ 4802 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ 4803 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4804 __enabled, exiting); \ 4805 } \ 4806 }) 4807 4808 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4809 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4810 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4811 4812 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4813 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4814 4815 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4816 { 4817 struct kvm_vcpu *vcpu = &vmx->vcpu; 4818 4819 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4820 4821 if (vmx_pt_mode_is_system()) 4822 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4823 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4824 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4825 if (vmx->vpid == 0) 4826 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4827 if (!enable_ept) { 4828 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4829 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4830 enable_unrestricted_guest = 0; 4831 } 4832 if (!enable_unrestricted_guest) 4833 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4834 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4835 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4836 if (!kvm_vcpu_apicv_active(vcpu)) 4837 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4838 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4839 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4840 4841 /* 4842 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's 4843 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4844 */ 4845 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4846 4847 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4848 * in vmx_set_cr4. 
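 *
 * For reference, the adjust helpers used later in this function come in
 * two flavours (examples only, matching the macros defined above):
 *
 *	vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES)
 *	  clears SECONDARY_EXEC_ENABLE_XSAVES when XSAVES is *not*
 *	  exposed to the guest (opt-in control).
 *
 *	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND)
 *	  clears SECONDARY_EXEC_RDRAND_EXITING when RDRAND *is* exposed,
 *	  i.e. interception is dropped for a supported feature (opt-out).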
*/ 4849 exec_control &= ~SECONDARY_EXEC_DESC; 4850 4851 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4852 (handle_vmptrld). 4853 We can NOT enable shadow_vmcs here because we don't have yet 4854 a current VMCS12 4855 */ 4856 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4857 4858 /* 4859 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4860 * it needs to be set here when dirty logging is already active, e.g. 4861 * if this vCPU was created after dirty logging was enabled. 4862 */ 4863 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4864 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4865 4866 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4867 4868 /* 4869 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4870 * feature is exposed to the guest. This creates a virtualization hole 4871 * if both are supported in hardware but only one is exposed to the 4872 * guest, but letting the guest execute RDTSCP or RDPID when either one 4873 * is advertised is preferable to emulating the advertised instruction 4874 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 4875 */ 4876 if (cpu_has_vmx_rdtscp()) { 4877 bool rdpid_or_rdtscp_enabled = 4878 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4879 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4880 4881 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4882 SECONDARY_EXEC_ENABLE_RDTSCP, 4883 rdpid_or_rdtscp_enabled, false); 4884 } 4885 4886 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4887 4888 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4889 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4890 4891 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4892 ENABLE_USR_WAIT_PAUSE, false); 4893 4894 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4895 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4896 4897 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4898 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4899 4900 return exec_control; 4901 } 4902 4903 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4904 { 4905 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4906 } 4907 4908 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4909 { 4910 struct page *pages; 4911 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4912 4913 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4914 return 0; 4915 4916 if (kvm_vmx->pid_table) 4917 return 0; 4918 4919 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4920 vmx_get_pid_table_order(kvm)); 4921 if (!pages) 4922 return -ENOMEM; 4923 4924 kvm_vmx->pid_table = (void *)page_address(pages); 4925 return 0; 4926 } 4927 4928 int vmx_vcpu_precreate(struct kvm *kvm) 4929 { 4930 return vmx_alloc_ipiv_pid_table(kvm); 4931 } 4932 4933 #define VMX_XSS_EXIT_BITMAP 0 4934 4935 static void init_vmcs(struct vcpu_vmx *vmx) 4936 { 4937 struct kvm *kvm = vmx->vcpu.kvm; 4938 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4939 4940 if (nested) 4941 nested_vmx_set_vmcs_shadowing_bitmap(); 4942 4943 if (cpu_has_vmx_msr_bitmap()) 4944 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4945 4946 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4947 4948 /* Control */ 4949 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4950 4951 exec_controls_set(vmx, vmx_exec_control(vmx)); 4952 4953 if (cpu_has_secondary_exec_ctrls()) { 4954 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4955 
if (vmx->ve_info) 4956 vmcs_write64(VE_INFORMATION_ADDRESS, 4957 __pa(vmx->ve_info)); 4958 } 4959 4960 if (cpu_has_tertiary_exec_ctrls()) 4961 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4962 4963 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4964 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4965 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4966 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4967 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4968 4969 vmcs_write16(GUEST_INTR_STATUS, 0); 4970 4971 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4972 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); 4973 } 4974 4975 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4976 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4977 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4978 } 4979 4980 if (!kvm_pause_in_guest(kvm)) { 4981 vmcs_write32(PLE_GAP, ple_gap); 4982 vmx->ple_window = ple_window; 4983 vmx->ple_window_dirty = true; 4984 } 4985 4986 if (kvm_notify_vmexit_enabled(kvm)) 4987 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4988 4989 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4990 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4991 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4992 4993 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4994 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4995 vmx_set_constant_host_state(vmx); 4996 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4997 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4998 4999 if (cpu_has_vmx_vmfunc()) 5000 vmcs_write64(VM_FUNCTION_CONTROL, 0); 5001 5002 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 5003 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 5004 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 5005 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 5006 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 5007 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 5008 5009 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 5010 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 5011 5012 vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); 5013 5014 /* 22.2.1, 20.8.1 */ 5015 vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); 5016 5017 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 5018 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 5019 5020 set_cr4_guest_host_mask(vmx); 5021 5022 if (vmx->vpid != 0) 5023 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 5024 5025 if (cpu_has_vmx_xsaves()) 5026 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 5027 5028 if (enable_pml) { 5029 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 5030 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 5031 } 5032 5033 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 5034 5035 if (vmx_pt_mode_is_host_guest()) { 5036 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 5037 /* Bit[6~0] are forced to 1, writes are ignored. 
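 *
 * (Hence the shadow value below starts life as 0x7F: it mirrors those
 * always-set low bits, which for single-range output corresponds to a
 * minimum granularity of 2^7 = 128 bytes.)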
*/ 5038 vmx->pt_desc.guest.output_mask = 0x7F; 5039 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 5040 } 5041 5042 vmcs_write32(GUEST_SYSENTER_CS, 0); 5043 vmcs_writel(GUEST_SYSENTER_ESP, 0); 5044 vmcs_writel(GUEST_SYSENTER_EIP, 0); 5045 5046 vmx_guest_debugctl_write(&vmx->vcpu, 0); 5047 5048 if (cpu_has_vmx_tpr_shadow()) { 5049 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 5050 if (cpu_need_tpr_shadow(&vmx->vcpu)) 5051 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 5052 __pa(vmx->vcpu.arch.apic->regs)); 5053 vmcs_write32(TPR_THRESHOLD, 0); 5054 } 5055 5056 vmx_setup_uret_msrs(vmx); 5057 } 5058 5059 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 5060 { 5061 struct vcpu_vmx *vmx = to_vmx(vcpu); 5062 5063 init_vmcs(vmx); 5064 5065 if (nested && 5066 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 5067 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 5068 5069 vcpu_setup_sgx_lepubkeyhash(vcpu); 5070 5071 vmx->nested.posted_intr_nv = -1; 5072 vmx->nested.vmxon_ptr = INVALID_GPA; 5073 vmx->nested.current_vmptr = INVALID_GPA; 5074 5075 #ifdef CONFIG_KVM_HYPERV 5076 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 5077 #endif 5078 5079 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 5080 vcpu->arch.microcode_version = 0x100000000ULL; 5081 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 5082 5083 /* 5084 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 5085 * or POSTED_INTR_WAKEUP_VECTOR. 5086 */ 5087 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 5088 __pi_set_sn(&vmx->vt.pi_desc); 5089 } 5090 5091 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 5092 { 5093 struct vcpu_vmx *vmx = to_vmx(vcpu); 5094 5095 if (!init_event) 5096 __vmx_vcpu_reset(vcpu); 5097 5098 vmx->rmode.vm86_active = 0; 5099 vmx->spec_ctrl = 0; 5100 5101 vmx->msr_ia32_umwait_control = 0; 5102 5103 vmx->hv_deadline_tsc = -1; 5104 kvm_set_cr8(vcpu, 0); 5105 5106 seg_setup(VCPU_SREG_CS); 5107 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 5108 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 5109 5110 seg_setup(VCPU_SREG_DS); 5111 seg_setup(VCPU_SREG_ES); 5112 seg_setup(VCPU_SREG_FS); 5113 seg_setup(VCPU_SREG_GS); 5114 seg_setup(VCPU_SREG_SS); 5115 5116 vmcs_write16(GUEST_TR_SELECTOR, 0); 5117 vmcs_writel(GUEST_TR_BASE, 0); 5118 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 5119 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 5120 5121 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 5122 vmcs_writel(GUEST_LDTR_BASE, 0); 5123 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 5124 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 5125 5126 vmcs_writel(GUEST_GDTR_BASE, 0); 5127 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 5128 5129 vmcs_writel(GUEST_IDTR_BASE, 0); 5130 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 5131 5132 vmx_segment_cache_clear(vmx); 5133 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 5134 5135 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 5136 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 5137 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 5138 if (kvm_mpx_supported()) 5139 vmcs_write64(GUEST_BNDCFGS, 0); 5140 5141 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 5142 5143 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 5144 vmcs_writel(GUEST_SSP, 0); 5145 vmcs_writel(GUEST_INTR_SSP_TABLE, 0); 5146 } 5147 if (kvm_cpu_cap_has(X86_FEATURE_IBT) || 5148 kvm_cpu_cap_has(X86_FEATURE_SHSTK)) 5149 vmcs_writel(GUEST_S_CET, 0); 5150 5151 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5152 5153 vpid_sync_context(vmx->vpid); 5154 5155 vmx_update_fb_clear_dis(vcpu, vmx); 5156 } 5157 5158 
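/*
 * Illustrative note on the reset state programmed above (not consumed by
 * any code here): combined with the architectural RIP reset value of
 * 0xfff0 established by common x86 code, the CS setup in vmx_vcpu_reset()
 * gives
 *
 *	CS.selector = 0xf000, CS.base = 0xffff0000
 *	first fetch at CS.base + RIP = 0xffff0000 + 0xfff0 = 0xfffffff0
 *
 * i.e. the standard x86 reset vector just below 4 GiB.
 */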
void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 5159 { 5160 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5161 } 5162 5163 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 5164 { 5165 if (!enable_vnmi || 5166 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 5167 vmx_enable_irq_window(vcpu); 5168 return; 5169 } 5170 5171 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5172 } 5173 5174 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 5175 { 5176 struct vcpu_vmx *vmx = to_vmx(vcpu); 5177 uint32_t intr; 5178 int irq = vcpu->arch.interrupt.nr; 5179 5180 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 5181 5182 ++vcpu->stat.irq_injections; 5183 if (vmx->rmode.vm86_active) { 5184 int inc_eip = 0; 5185 if (vcpu->arch.interrupt.soft) 5186 inc_eip = vcpu->arch.event_exit_inst_len; 5187 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 5188 return; 5189 } 5190 intr = irq | INTR_INFO_VALID_MASK; 5191 if (vcpu->arch.interrupt.soft) { 5192 intr |= INTR_TYPE_SOFT_INTR; 5193 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 5194 vmx->vcpu.arch.event_exit_inst_len); 5195 } else 5196 intr |= INTR_TYPE_EXT_INTR; 5197 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 5198 5199 vmx_clear_hlt(vcpu); 5200 } 5201 5202 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 5203 { 5204 struct vcpu_vmx *vmx = to_vmx(vcpu); 5205 5206 if (!enable_vnmi) { 5207 /* 5208 * Tracking the NMI-blocked state in software is built upon 5209 * finding the next open IRQ window. This, in turn, depends on 5210 * well-behaving guests: They have to keep IRQs disabled at 5211 * least as long as the NMI handler runs. Otherwise we may 5212 * cause NMI nesting, maybe breaking the guest. But as this is 5213 * highly unlikely, we can live with the residual risk. 
5214 */ 5215 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 5216 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5217 } 5218 5219 ++vcpu->stat.nmi_injections; 5220 vmx->loaded_vmcs->nmi_known_unmasked = false; 5221 5222 if (vmx->rmode.vm86_active) { 5223 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 5224 return; 5225 } 5226 5227 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 5228 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 5229 5230 vmx_clear_hlt(vcpu); 5231 } 5232 5233 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 5234 { 5235 struct vcpu_vmx *vmx = to_vmx(vcpu); 5236 bool masked; 5237 5238 if (!enable_vnmi) 5239 return vmx->loaded_vmcs->soft_vnmi_blocked; 5240 if (vmx->loaded_vmcs->nmi_known_unmasked) 5241 return false; 5242 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 5243 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5244 return masked; 5245 } 5246 5247 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5248 { 5249 struct vcpu_vmx *vmx = to_vmx(vcpu); 5250 5251 if (!enable_vnmi) { 5252 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 5253 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5254 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5255 } 5256 } else { 5257 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5258 if (masked) 5259 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5260 GUEST_INTR_STATE_NMI); 5261 else 5262 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5263 GUEST_INTR_STATE_NMI); 5264 } 5265 } 5266 5267 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5268 { 5269 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5270 return false; 5271 5272 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5273 return true; 5274 5275 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5276 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5277 GUEST_INTR_STATE_NMI)); 5278 } 5279 5280 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5281 { 5282 if (to_vmx(vcpu)->nested.nested_run_pending) 5283 return -EBUSY; 5284 5285 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5286 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5287 return -EBUSY; 5288 5289 return !vmx_nmi_blocked(vcpu); 5290 } 5291 5292 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5293 { 5294 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5295 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5296 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5297 } 5298 5299 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5300 { 5301 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5302 return false; 5303 5304 return __vmx_interrupt_blocked(vcpu); 5305 } 5306 5307 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5308 { 5309 if (to_vmx(vcpu)->nested.nested_run_pending) 5310 return -EBUSY; 5311 5312 /* 5313 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5314 * e.g. if the IRQ arrived asynchronously after checking nested events. 
5315 */ 5316 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5317 return -EBUSY; 5318 5319 return !vmx_interrupt_blocked(vcpu); 5320 } 5321 5322 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5323 { 5324 void __user *ret; 5325 5326 if (enable_unrestricted_guest) 5327 return 0; 5328 5329 mutex_lock(&kvm->slots_lock); 5330 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5331 PAGE_SIZE * 3); 5332 mutex_unlock(&kvm->slots_lock); 5333 5334 if (IS_ERR(ret)) 5335 return PTR_ERR(ret); 5336 5337 to_kvm_vmx(kvm)->tss_addr = addr; 5338 5339 return init_rmode_tss(kvm, ret); 5340 } 5341 5342 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5343 { 5344 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5345 return 0; 5346 } 5347 5348 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5349 { 5350 switch (vec) { 5351 case BP_VECTOR: 5352 /* 5353 * Update instruction length as we may reinject the exception 5354 * from user space while in guest debugging mode. 5355 */ 5356 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5357 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5358 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5359 return false; 5360 fallthrough; 5361 case DB_VECTOR: 5362 return !(vcpu->guest_debug & 5363 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5364 case DE_VECTOR: 5365 case OF_VECTOR: 5366 case BR_VECTOR: 5367 case UD_VECTOR: 5368 case DF_VECTOR: 5369 case SS_VECTOR: 5370 case GP_VECTOR: 5371 case MF_VECTOR: 5372 return true; 5373 } 5374 return false; 5375 } 5376 5377 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5378 int vec, u32 err_code) 5379 { 5380 /* 5381 * Instruction with address size override prefix opcode 0x67 5382 * Cause the #SS fault with 0 error code in VM86 mode. 5383 */ 5384 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5385 if (kvm_emulate_instruction(vcpu, 0)) { 5386 if (vcpu->arch.halt_request) { 5387 vcpu->arch.halt_request = 0; 5388 return kvm_emulate_halt_noskip(vcpu); 5389 } 5390 return 1; 5391 } 5392 return 0; 5393 } 5394 5395 /* 5396 * Forward all other exceptions that are valid in real mode. 5397 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5398 * the required debugging infrastructure rework. 5399 */ 5400 kvm_queue_exception(vcpu, vec); 5401 return 1; 5402 } 5403 5404 static int handle_machine_check(struct kvm_vcpu *vcpu) 5405 { 5406 /* handled by vmx_vcpu_run() */ 5407 return 1; 5408 } 5409 5410 /* 5411 * If the host has split lock detection disabled, then #AC is 5412 * unconditionally injected into the guest, which is the pre split lock 5413 * detection behaviour. 
5414 * 5415 * If the host has split lock detection enabled then #AC is 5416 * only injected into the guest when: 5417 * - Guest CPL == 3 (user mode) 5418 * - Guest has #AC detection enabled in CR0 5419 * - Guest EFLAGS has AC bit set 5420 */ 5421 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5422 { 5423 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5424 return true; 5425 5426 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5427 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5428 } 5429 5430 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) 5431 { 5432 return vcpu->arch.guest_fpu.fpstate->xfd && 5433 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5434 } 5435 5436 static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code) 5437 { 5438 unsigned long cr2 = vmx_get_exit_qual(vcpu); 5439 5440 if (vcpu->arch.apf.host_apf_flags) 5441 goto handle_pf; 5442 5443 /* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */ 5444 WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr); 5445 5446 /* 5447 * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX 5448 * flag set in the error code (SGX1 hardware generates #GP(0)). EPCM 5449 * violations have nothing to do with shadow paging and can never be 5450 * resolved by KVM; always reflect them into the guest. 5451 */ 5452 if (error_code & PFERR_SGX_MASK) { 5453 WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) || 5454 !cpu_feature_enabled(X86_FEATURE_SGX2)); 5455 5456 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) 5457 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5458 else 5459 kvm_inject_gp(vcpu, 0); 5460 return 1; 5461 } 5462 5463 /* 5464 * If EPT is enabled, fixup and inject the #PF. KVM intercepts #PFs 5465 * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due 5466 * to the GPA being legal with respect to host.MAXPHYADDR). 5467 */ 5468 if (enable_ept) { 5469 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5470 return 1; 5471 } 5472 5473 handle_pf: 5474 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5475 } 5476 5477 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5478 { 5479 struct vcpu_vmx *vmx = to_vmx(vcpu); 5480 struct kvm_run *kvm_run = vcpu->run; 5481 u32 intr_info, ex_no, error_code; 5482 unsigned long dr6; 5483 u32 vect_info; 5484 5485 vect_info = vmx->idt_vectoring_info; 5486 intr_info = vmx_get_intr_info(vcpu); 5487 5488 /* 5489 * Machine checks are handled by handle_exception_irqoff(), or by 5490 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5491 * vmx_vcpu_enter_exit(). 5492 */ 5493 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5494 return 1; 5495 5496 /* 5497 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5498 * This ensures the nested_vmx check is not skipped so vmexit can 5499 * be reflected to L1 (when it intercepts #NM) before reaching this 5500 * point. 5501 */ 5502 if (is_nm_fault(intr_info)) { 5503 kvm_queue_exception_p(vcpu, NM_VECTOR, 5504 is_xfd_nm_fault(vcpu) ? 
vcpu->arch.guest_fpu.xfd_err : 0); 5505 return 1; 5506 } 5507 5508 if (is_invalid_opcode(intr_info)) 5509 return handle_ud(vcpu); 5510 5511 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5512 struct vmx_ve_information *ve_info = vmx->ve_info; 5513 5514 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5515 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5516 dump_vmcs(vcpu); 5517 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5518 return 1; 5519 } 5520 5521 error_code = 0; 5522 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5523 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5524 5525 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5526 WARN_ON_ONCE(!enable_vmware_backdoor); 5527 5528 /* 5529 * VMware backdoor emulation on #GP interception only handles 5530 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5531 * error code on #GP. 5532 */ 5533 if (error_code) { 5534 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5535 return 1; 5536 } 5537 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5538 } 5539 5540 /* 5541 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5542 * MMIO, it is better to report an internal error. 5543 * See the comments in vmx_handle_exit. 5544 */ 5545 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5546 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5547 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5548 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5549 vcpu->run->internal.ndata = 4; 5550 vcpu->run->internal.data[0] = vect_info; 5551 vcpu->run->internal.data[1] = intr_info; 5552 vcpu->run->internal.data[2] = error_code; 5553 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5554 return 0; 5555 } 5556 5557 if (is_page_fault(intr_info)) 5558 return vmx_handle_page_fault(vcpu, error_code); 5559 5560 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5561 5562 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5563 return handle_rmode_exception(vcpu, ex_no, error_code); 5564 5565 switch (ex_no) { 5566 case DB_VECTOR: 5567 dr6 = vmx_get_exit_qual(vcpu); 5568 if (!(vcpu->guest_debug & 5569 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5570 /* 5571 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5572 * instruction. ICEBP generates a trap-like #DB, but 5573 * despite its interception control being tied to #DB, 5574 * is an instruction intercept, i.e. the VM-Exit occurs 5575 * on the ICEBP itself. Use the inner "skip" helper to 5576 * avoid single-step #DB and MTF updates, as ICEBP is 5577 * higher priority. Note, skipping ICEBP still clears 5578 * STI and MOVSS blocking. 5579 * 5580 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5581 * if single-step is enabled in RFLAGS and STI or MOVSS 5582 * blocking is active, as the CPU doesn't set the bit 5583 * on VM-Exit due to #DB interception. VM-Entry has a 5584 * consistency check that a single-step #DB is pending 5585 * in this scenario as the previous instruction cannot 5586 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5587 * don't modify RFLAGS), therefore the one instruction 5588 * delay when activating single-step breakpoints must 5589 * have already expired. Note, the CPU sets/clears BS 5590 * as appropriate for all other VM-Exits types. 
5591 */ 5592 if (is_icebp(intr_info)) 5593 WARN_ON(!skip_emulated_instruction(vcpu)); 5594 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5595 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5596 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5597 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5598 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5599 5600 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5601 return 1; 5602 } 5603 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5604 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5605 fallthrough; 5606 case BP_VECTOR: 5607 /* 5608 * Update instruction length as we may reinject #BP from 5609 * user space while in guest debugging mode. Reading it for 5610 * #DB as well causes no harm, it is not used in that case. 5611 */ 5612 vmx->vcpu.arch.event_exit_inst_len = 5613 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5614 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5615 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5616 kvm_run->debug.arch.exception = ex_no; 5617 break; 5618 case AC_VECTOR: 5619 if (vmx_guest_inject_ac(vcpu)) { 5620 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5621 return 1; 5622 } 5623 5624 /* 5625 * Handle split lock. Depending on detection mode this will 5626 * either warn and disable split lock detection for this 5627 * task or force SIGBUS on it. 5628 */ 5629 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5630 return 1; 5631 fallthrough; 5632 default: 5633 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5634 kvm_run->ex.exception = ex_no; 5635 kvm_run->ex.error_code = error_code; 5636 break; 5637 } 5638 return 0; 5639 } 5640 5641 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5642 { 5643 ++vcpu->stat.irq_exits; 5644 return 1; 5645 } 5646 5647 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5648 { 5649 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5650 vcpu->mmio_needed = 0; 5651 return 0; 5652 } 5653 5654 static int handle_io(struct kvm_vcpu *vcpu) 5655 { 5656 unsigned long exit_qualification; 5657 int size, in, string; 5658 unsigned port; 5659 5660 exit_qualification = vmx_get_exit_qual(vcpu); 5661 string = (exit_qualification & 16) != 0; 5662 5663 ++vcpu->stat.io_exits; 5664 5665 if (string) 5666 return kvm_emulate_instruction(vcpu, 0); 5667 5668 port = exit_qualification >> 16; 5669 size = (exit_qualification & 7) + 1; 5670 in = (exit_qualification & 8) != 0; 5671 5672 return kvm_fast_pio(vcpu, size, port, in); 5673 } 5674 5675 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5676 { 5677 /* 5678 * Patch in the VMCALL instruction: 5679 */ 5680 hypercall[0] = 0x0f; 5681 hypercall[1] = 0x01; 5682 hypercall[2] = 0xc1; 5683 } 5684 5685 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5686 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5687 { 5688 if (is_guest_mode(vcpu)) { 5689 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5690 unsigned long orig_val = val; 5691 5692 /* 5693 * We get here when L2 changed cr0 in a way that did not change 5694 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5695 * but did change L0 shadowed bits. So we first calculate the 5696 * effective cr0 value that L1 would like to write into the 5697 * hardware. It consists of the L2-owned bits from the new 5698 * value combined with the L1-owned bits from L1's guest_cr0. 
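 * A bit set in cr0_guest_host_mask is owned by L1 (the host from L2's point of view) and is taken from vmcs12->guest_cr0; cleared bits are L2-owned and taken from the new value.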
5699 */ 5700 val = (val & ~vmcs12->cr0_guest_host_mask) | 5701 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5702 5703 if (kvm_set_cr0(vcpu, val)) 5704 return 1; 5705 vmcs_writel(CR0_READ_SHADOW, orig_val); 5706 return 0; 5707 } else { 5708 return kvm_set_cr0(vcpu, val); 5709 } 5710 } 5711 5712 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5713 { 5714 if (is_guest_mode(vcpu)) { 5715 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5716 unsigned long orig_val = val; 5717 5718 /* analogously to handle_set_cr0 */ 5719 val = (val & ~vmcs12->cr4_guest_host_mask) | 5720 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5721 if (kvm_set_cr4(vcpu, val)) 5722 return 1; 5723 vmcs_writel(CR4_READ_SHADOW, orig_val); 5724 return 0; 5725 } else 5726 return kvm_set_cr4(vcpu, val); 5727 } 5728 5729 static int handle_desc(struct kvm_vcpu *vcpu) 5730 { 5731 /* 5732 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5733 * and other code needs to be updated if UMIP can be guest owned. 5734 */ 5735 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5736 5737 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5738 return kvm_emulate_instruction(vcpu, 0); 5739 } 5740 5741 static int handle_cr(struct kvm_vcpu *vcpu) 5742 { 5743 unsigned long exit_qualification, val; 5744 int cr; 5745 int reg; 5746 int err; 5747 int ret; 5748 5749 exit_qualification = vmx_get_exit_qual(vcpu); 5750 cr = exit_qualification & 15; 5751 reg = (exit_qualification >> 8) & 15; 5752 switch ((exit_qualification >> 4) & 3) { 5753 case 0: /* mov to cr */ 5754 val = kvm_register_read(vcpu, reg); 5755 trace_kvm_cr_write(cr, val); 5756 switch (cr) { 5757 case 0: 5758 err = handle_set_cr0(vcpu, val); 5759 return kvm_complete_insn_gp(vcpu, err); 5760 case 3: 5761 WARN_ON_ONCE(enable_unrestricted_guest); 5762 5763 err = kvm_set_cr3(vcpu, val); 5764 return kvm_complete_insn_gp(vcpu, err); 5765 case 4: 5766 err = handle_set_cr4(vcpu, val); 5767 return kvm_complete_insn_gp(vcpu, err); 5768 case 8: { 5769 u8 cr8_prev = kvm_get_cr8(vcpu); 5770 u8 cr8 = (u8)val; 5771 err = kvm_set_cr8(vcpu, cr8); 5772 ret = kvm_complete_insn_gp(vcpu, err); 5773 if (lapic_in_kernel(vcpu)) 5774 return ret; 5775 if (cr8_prev <= cr8) 5776 return ret; 5777 /* 5778 * TODO: we might be squashing a 5779 * KVM_GUESTDBG_SINGLESTEP-triggered 5780 * KVM_EXIT_DEBUG here. 
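 * We only get here when the local APIC is emulated in userspace and the guest lowered its TPR (cr8 < cr8_prev); report KVM_EXIT_SET_TPR so userspace can re-evaluate whether a pending interrupt can now be delivered.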
5781 */ 5782 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5783 return 0; 5784 } 5785 } 5786 break; 5787 case 2: /* clts */ 5788 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5789 return -EIO; 5790 case 1: /*mov from cr*/ 5791 switch (cr) { 5792 case 3: 5793 WARN_ON_ONCE(enable_unrestricted_guest); 5794 5795 val = kvm_read_cr3(vcpu); 5796 kvm_register_write(vcpu, reg, val); 5797 trace_kvm_cr_read(cr, val); 5798 return kvm_skip_emulated_instruction(vcpu); 5799 case 8: 5800 val = kvm_get_cr8(vcpu); 5801 kvm_register_write(vcpu, reg, val); 5802 trace_kvm_cr_read(cr, val); 5803 return kvm_skip_emulated_instruction(vcpu); 5804 } 5805 break; 5806 case 3: /* lmsw */ 5807 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5808 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5809 kvm_lmsw(vcpu, val); 5810 5811 return kvm_skip_emulated_instruction(vcpu); 5812 default: 5813 break; 5814 } 5815 vcpu->run->exit_reason = 0; 5816 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5817 (int)(exit_qualification >> 4) & 3, cr); 5818 return 0; 5819 } 5820 5821 static int handle_dr(struct kvm_vcpu *vcpu) 5822 { 5823 unsigned long exit_qualification; 5824 int dr, dr7, reg; 5825 int err = 1; 5826 5827 exit_qualification = vmx_get_exit_qual(vcpu); 5828 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5829 5830 /* First, if DR does not exist, trigger UD */ 5831 if (!kvm_require_dr(vcpu, dr)) 5832 return 1; 5833 5834 if (vmx_get_cpl(vcpu) > 0) 5835 goto out; 5836 5837 dr7 = vmcs_readl(GUEST_DR7); 5838 if (dr7 & DR7_GD) { 5839 /* 5840 * As the vm-exit takes precedence over the debug trap, we 5841 * need to emulate the latter, either for the host or the 5842 * guest debugging itself. 5843 */ 5844 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5845 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5846 vcpu->run->debug.arch.dr7 = dr7; 5847 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5848 vcpu->run->debug.arch.exception = DB_VECTOR; 5849 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5850 return 0; 5851 } else { 5852 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5853 return 1; 5854 } 5855 } 5856 5857 if (vcpu->guest_debug == 0) { 5858 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5859 5860 /* 5861 * No more DR vmexits; force a reload of the debug registers 5862 * and reenter on this instruction. The next vmexit will 5863 * retrieve the full state of the debug registers. 5864 */ 5865 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5866 return 1; 5867 } 5868 5869 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5870 if (exit_qualification & TYPE_MOV_FROM_DR) { 5871 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5872 err = 0; 5873 } else { 5874 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5875 } 5876 5877 out: 5878 return kvm_complete_insn_gp(vcpu, err); 5879 } 5880 5881 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5882 { 5883 get_debugreg(vcpu->arch.db[0], 0); 5884 get_debugreg(vcpu->arch.db[1], 1); 5885 get_debugreg(vcpu->arch.db[2], 2); 5886 get_debugreg(vcpu->arch.db[3], 3); 5887 get_debugreg(vcpu->arch.dr6, 6); 5888 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5889 5890 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5891 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5892 5893 /* 5894 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5895 * a stale dr6 from the guest. 
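 * Writing DR6_RESERVED leaves only the reserved bits set, i.e. it clears B0-B3, BD, BS and BT so exc_debug starts from a clean slate.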
5896 */ 5897 set_debugreg(DR6_RESERVED, 6); 5898 } 5899 5900 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5901 { 5902 vmcs_writel(GUEST_DR7, val); 5903 } 5904 5905 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5906 { 5907 kvm_apic_update_ppr(vcpu); 5908 return 1; 5909 } 5910 5911 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5912 { 5913 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5914 5915 kvm_make_request(KVM_REQ_EVENT, vcpu); 5916 5917 ++vcpu->stat.irq_window_exits; 5918 return 1; 5919 } 5920 5921 static int handle_invlpg(struct kvm_vcpu *vcpu) 5922 { 5923 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5924 5925 kvm_mmu_invlpg(vcpu, exit_qualification); 5926 return kvm_skip_emulated_instruction(vcpu); 5927 } 5928 5929 static int handle_apic_access(struct kvm_vcpu *vcpu) 5930 { 5931 if (likely(fasteoi)) { 5932 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5933 int access_type, offset; 5934 5935 access_type = exit_qualification & APIC_ACCESS_TYPE; 5936 offset = exit_qualification & APIC_ACCESS_OFFSET; 5937 /* 5938 * Sane guest uses MOV to write EOI, with written value 5939 * not cared. So make a short-circuit here by avoiding 5940 * heavy instruction emulation. 5941 */ 5942 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5943 (offset == APIC_EOI)) { 5944 kvm_lapic_set_eoi(vcpu); 5945 return kvm_skip_emulated_instruction(vcpu); 5946 } 5947 } 5948 return kvm_emulate_instruction(vcpu, 0); 5949 } 5950 5951 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5952 { 5953 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5954 int vector = exit_qualification & 0xff; 5955 5956 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5957 kvm_apic_set_eoi_accelerated(vcpu, vector); 5958 return 1; 5959 } 5960 5961 static int handle_apic_write(struct kvm_vcpu *vcpu) 5962 { 5963 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5964 5965 /* 5966 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5967 * hardware has done any necessary aliasing, offset adjustments, etc... 5968 * for the access. I.e. the correct value has already been written to 5969 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5970 * retrieve the register value and emulate the access. 
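 * The low 12 bits of the exit qualification hold the page offset of the write; masking with 0xff0 keeps the 16-byte-aligned register offset that kvm_apic_write_nodecode() expects.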
5971 */ 5972 u32 offset = exit_qualification & 0xff0; 5973 5974 kvm_apic_write_nodecode(vcpu, offset); 5975 return 1; 5976 } 5977 5978 static int handle_task_switch(struct kvm_vcpu *vcpu) 5979 { 5980 struct vcpu_vmx *vmx = to_vmx(vcpu); 5981 unsigned long exit_qualification; 5982 bool has_error_code = false; 5983 u32 error_code = 0; 5984 u16 tss_selector; 5985 int reason, type, idt_v, idt_index; 5986 5987 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5988 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5989 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5990 5991 exit_qualification = vmx_get_exit_qual(vcpu); 5992 5993 reason = (u32)exit_qualification >> 30; 5994 if (reason == TASK_SWITCH_GATE && idt_v) { 5995 switch (type) { 5996 case INTR_TYPE_NMI_INTR: 5997 vcpu->arch.nmi_injected = false; 5998 vmx_set_nmi_mask(vcpu, true); 5999 break; 6000 case INTR_TYPE_EXT_INTR: 6001 case INTR_TYPE_SOFT_INTR: 6002 kvm_clear_interrupt_queue(vcpu); 6003 break; 6004 case INTR_TYPE_HARD_EXCEPTION: 6005 if (vmx->idt_vectoring_info & 6006 VECTORING_INFO_DELIVER_CODE_MASK) { 6007 has_error_code = true; 6008 error_code = 6009 vmcs_read32(IDT_VECTORING_ERROR_CODE); 6010 } 6011 fallthrough; 6012 case INTR_TYPE_SOFT_EXCEPTION: 6013 kvm_clear_exception_queue(vcpu); 6014 break; 6015 default: 6016 break; 6017 } 6018 } 6019 tss_selector = exit_qualification; 6020 6021 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 6022 type != INTR_TYPE_EXT_INTR && 6023 type != INTR_TYPE_NMI_INTR)) 6024 WARN_ON(!skip_emulated_instruction(vcpu)); 6025 6026 /* 6027 * TODO: What about debug traps on tss switch? 6028 * Are we supposed to inject them and update dr6? 6029 */ 6030 return kvm_task_switch(vcpu, tss_selector, 6031 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 6032 reason, has_error_code, error_code); 6033 } 6034 6035 static int handle_ept_violation(struct kvm_vcpu *vcpu) 6036 { 6037 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6038 gpa_t gpa; 6039 6040 /* 6041 * EPT violation happened while executing iret from NMI, 6042 * "blocked by NMI" bit has to be set before next VM entry. 6043 * There are errata that may cause this bit to not be set: 6044 * AAK134, BY25. 6045 */ 6046 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 6047 enable_vnmi && 6048 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 6049 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 6050 6051 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 6052 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 6053 6054 /* 6055 * Check that the GPA doesn't exceed physical memory limits, as that is 6056 * a guest page fault. We have to emulate the instruction here, because 6057 * if the illegal address is that of a paging structure, then 6058 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 6059 * would also use advanced VM-exit information for EPT violations to 6060 * reconstruct the page fault error code. 6061 */ 6062 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 6063 return kvm_emulate_instruction(vcpu, 0); 6064 6065 return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); 6066 } 6067 6068 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 6069 { 6070 gpa_t gpa; 6071 6072 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 6073 return 1; 6074 6075 /* 6076 * A nested guest cannot optimize MMIO vmexits, because we have an 6077 * nGPA here instead of the required GPA. 
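 * For L1, the fast MMIO path below is tried first: a zero-length write on KVM_FAST_MMIO_BUS completes ioeventfd-style notifications (e.g. virtio doorbell writes) without decoding the instruction.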
6078 */ 6079 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 6080 if (!is_guest_mode(vcpu) && 6081 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 6082 trace_kvm_fast_mmio(gpa); 6083 return kvm_skip_emulated_instruction(vcpu); 6084 } 6085 6086 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 6087 } 6088 6089 static int handle_nmi_window(struct kvm_vcpu *vcpu) 6090 { 6091 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 6092 return -EIO; 6093 6094 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 6095 ++vcpu->stat.nmi_window_exits; 6096 kvm_make_request(KVM_REQ_EVENT, vcpu); 6097 6098 return 1; 6099 } 6100 6101 /* 6102 * Returns true if emulation is required (due to the vCPU having invalid state 6103 * with unrestricted guest mode disabled) and KVM can't faithfully emulate the 6104 * current vCPU state. 6105 */ 6106 static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) 6107 { 6108 struct vcpu_vmx *vmx = to_vmx(vcpu); 6109 6110 if (!vmx->vt.emulation_required) 6111 return false; 6112 6113 /* 6114 * It is architecturally impossible for emulation to be required when a 6115 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if 6116 * guest state is invalid and unrestricted guest is disabled, i.e. KVM 6117 * should synthesize VM-Fail instead of emulating L2 code. This path is 6118 * only reachable if userspace modifies L2 guest state after KVM has 6119 * performed the nested VM-Enter consistency checks. 6120 */ 6121 if (vmx->nested.nested_run_pending) 6122 return true; 6123 6124 /* 6125 * KVM only supports emulating exceptions if the vCPU is in Real Mode. 6126 * If emulation is required, KVM can't perform a successful VM-Enter to 6127 * inject the exception. 6128 */ 6129 return !vmx->rmode.vm86_active && 6130 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 6131 } 6132 6133 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 6134 { 6135 struct vcpu_vmx *vmx = to_vmx(vcpu); 6136 bool intr_window_requested; 6137 unsigned count = 130; 6138 6139 intr_window_requested = exec_controls_get(vmx) & 6140 CPU_BASED_INTR_WINDOW_EXITING; 6141 6142 while (vmx->vt.emulation_required && count-- != 0) { 6143 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 6144 return handle_interrupt_window(&vmx->vcpu); 6145 6146 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 6147 return 1; 6148 6149 /* 6150 * Ensure that any updates to kvm->buses[] observed by the 6151 * previous instruction (emulated or otherwise) are also 6152 * visible to the instruction KVM is about to emulate. 6153 */ 6154 smp_rmb(); 6155 6156 if (!kvm_emulate_instruction(vcpu, 0)) 6157 return 0; 6158 6159 if (vmx_unhandleable_emulation_required(vcpu)) { 6160 kvm_prepare_emulation_failure_exit(vcpu); 6161 return 0; 6162 } 6163 6164 if (vcpu->arch.halt_request) { 6165 vcpu->arch.halt_request = 0; 6166 return kvm_emulate_halt_noskip(vcpu); 6167 } 6168 6169 /* 6170 * Note, return 1 and not 0: vcpu_run() will invoke 6171 * xfer_to_guest_mode(), which will create a proper return 6172 * code. 6173 */ 6174 if (__xfer_to_guest_mode_work_pending()) 6175 return 1; 6176 } 6177 6178 return 1; 6179 } 6180 6181 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 6182 { 6183 if (vmx_unhandleable_emulation_required(vcpu)) { 6184 kvm_prepare_emulation_failure_exit(vcpu); 6185 return 0; 6186 } 6187 6188 return 1; 6189 } 6190 6191 /* 6192 * Indicate that a vCPU is busy-waiting on a spinlock. KVM does not enable plain 6193 * PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
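 * Each PAUSE exit also grows the vCPU's PLE window via grow_ple_window(), unless PLE is disabled for the VM (kvm_pause_in_guest()), so a spinning vCPU triggers progressively fewer exits.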
6194 */ 6195 static int handle_pause(struct kvm_vcpu *vcpu) 6196 { 6197 if (!kvm_pause_in_guest(vcpu->kvm)) 6198 grow_ple_window(vcpu); 6199 6200 /* 6201 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 6202 * VM-execution control is ignored if CPL > 0. OTOH, KVM 6203 * never set PAUSE_EXITING and just set PLE if supported, 6204 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 6205 */ 6206 kvm_vcpu_on_spin(vcpu, true); 6207 return kvm_skip_emulated_instruction(vcpu); 6208 } 6209 6210 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 6211 { 6212 return 1; 6213 } 6214 6215 static int handle_invpcid(struct kvm_vcpu *vcpu) 6216 { 6217 u32 vmx_instruction_info; 6218 unsigned long type; 6219 gva_t gva; 6220 struct { 6221 u64 pcid; 6222 u64 gla; 6223 } operand; 6224 int gpr_index; 6225 6226 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 6227 kvm_queue_exception(vcpu, UD_VECTOR); 6228 return 1; 6229 } 6230 6231 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6232 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6233 type = kvm_register_read(vcpu, gpr_index); 6234 6235 /* According to the Intel instruction reference, the memory operand 6236 * is read even if it isn't needed (e.g., for type==all) 6237 */ 6238 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6239 vmx_instruction_info, false, 6240 sizeof(operand), &gva)) 6241 return 1; 6242 6243 return kvm_handle_invpcid(vcpu, type, gva); 6244 } 6245 6246 static int handle_pml_full(struct kvm_vcpu *vcpu) 6247 { 6248 unsigned long exit_qualification; 6249 6250 trace_kvm_pml_full(vcpu->vcpu_id); 6251 6252 exit_qualification = vmx_get_exit_qual(vcpu); 6253 6254 /* 6255 * PML buffer FULL happened while executing iret from NMI, 6256 * "blocked by NMI" bit has to be set before next VM entry. 6257 */ 6258 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 6259 enable_vnmi && 6260 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 6261 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6262 GUEST_INTR_STATE_NMI); 6263 6264 /* 6265 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 6266 * here.., and there's no userspace involvement needed for PML. 6267 */ 6268 return 1; 6269 } 6270 6271 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 6272 bool force_immediate_exit) 6273 { 6274 struct vcpu_vmx *vmx = to_vmx(vcpu); 6275 6276 /* 6277 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 6278 * due to the timer expiring while it was "soft" disabled, just eat the 6279 * exit and re-enter the guest. 6280 */ 6281 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 6282 return EXIT_FASTPATH_REENTER_GUEST; 6283 6284 /* 6285 * If the timer expired because KVM used it to force an immediate exit, 6286 * then mission accomplished. 6287 */ 6288 if (force_immediate_exit) 6289 return EXIT_FASTPATH_EXIT_HANDLED; 6290 6291 /* 6292 * If L2 is active, go down the slow path as emulating the guest timer 6293 * expiration likely requires synthesizing a nested VM-Exit. 6294 */ 6295 if (is_guest_mode(vcpu)) 6296 return EXIT_FASTPATH_NONE; 6297 6298 kvm_lapic_expired_hv_timer(vcpu); 6299 return EXIT_FASTPATH_REENTER_GUEST; 6300 } 6301 6302 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6303 { 6304 /* 6305 * This non-fastpath handler is reached if and only if the preemption 6306 * timer was being used to emulate a guest timer while L2 is active. 6307 * All other scenarios are supposed to be handled in the fastpath. 
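 * Hence the WARN below: handle_fastpath_preemption_timer() only returns EXIT_FASTPATH_NONE when L2 is active.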
6308 */ 6309 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6310 kvm_lapic_expired_hv_timer(vcpu); 6311 return 1; 6312 } 6313 6314 /* 6315 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6316 * are overwritten by nested_vmx_hardware_setup() when nested=1. 6317 */ 6318 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6319 { 6320 kvm_queue_exception(vcpu, UD_VECTOR); 6321 return 1; 6322 } 6323 6324 static int handle_tdx_instruction(struct kvm_vcpu *vcpu) 6325 { 6326 kvm_queue_exception(vcpu, UD_VECTOR); 6327 return 1; 6328 } 6329 6330 #ifndef CONFIG_X86_SGX_KVM 6331 static int handle_encls(struct kvm_vcpu *vcpu) 6332 { 6333 /* 6334 * SGX virtualization is disabled. There is no software enable bit for 6335 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6336 * the guest from executing ENCLS (when SGX is supported by hardware). 6337 */ 6338 kvm_queue_exception(vcpu, UD_VECTOR); 6339 return 1; 6340 } 6341 #endif /* CONFIG_X86_SGX_KVM */ 6342 6343 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6344 { 6345 /* 6346 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6347 * VM-Exits. Unconditionally set the flag here and leave the handling to 6348 * vmx_handle_exit(). 6349 */ 6350 to_vt(vcpu)->exit_reason.bus_lock_detected = true; 6351 return 1; 6352 } 6353 6354 static int handle_notify(struct kvm_vcpu *vcpu) 6355 { 6356 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6357 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6358 6359 ++vcpu->stat.notify_window_exits; 6360 6361 /* 6362 * Notify VM exit happened while executing iret from NMI, 6363 * "blocked by NMI" bit has to be set before next VM entry. 6364 */ 6365 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6366 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6367 GUEST_INTR_STATE_NMI); 6368 6369 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6370 context_invalid) { 6371 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6372 vcpu->run->notify.flags = context_invalid ? 6373 KVM_NOTIFY_CONTEXT_INVALID : 0; 6374 return 0; 6375 } 6376 6377 return 1; 6378 } 6379 6380 static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) 6381 { 6382 return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); 6383 } 6384 6385 static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) 6386 { 6387 return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 6388 vmx_get_msr_imm_reg(vcpu)); 6389 } 6390 6391 static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) 6392 { 6393 return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 6394 vmx_get_msr_imm_reg(vcpu)); 6395 } 6396 6397 /* 6398 * The exit handlers return 1 if the exit was handled fully and guest execution 6399 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6400 * to be done to userspace and return 0. 
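 * The table is indexed by the basic exit reason; __vmx_handle_exit() treats out-of-range reasons and NULL entries as unexpected VM-Exits.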
6401 */ 6402 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6403 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6404 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6405 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6406 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6407 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6408 [EXIT_REASON_CR_ACCESS] = handle_cr, 6409 [EXIT_REASON_DR_ACCESS] = handle_dr, 6410 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6411 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6412 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6413 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6414 [EXIT_REASON_HLT] = kvm_emulate_halt, 6415 [EXIT_REASON_INVD] = kvm_emulate_invd, 6416 [EXIT_REASON_INVLPG] = handle_invlpg, 6417 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6418 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6419 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6420 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6421 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6422 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6423 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6424 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6425 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6426 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6427 [EXIT_REASON_VMON] = handle_vmx_instruction, 6428 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6429 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6430 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6431 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6432 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6433 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6434 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6435 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6436 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6437 [EXIT_REASON_LDTR_TR] = handle_desc, 6438 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6439 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6440 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6441 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6442 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6443 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6444 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6445 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6446 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6447 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6448 [EXIT_REASON_PML_FULL] = handle_pml_full, 6449 [EXIT_REASON_INVPCID] = handle_invpcid, 6450 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6451 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6452 [EXIT_REASON_ENCLS] = handle_encls, 6453 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6454 [EXIT_REASON_NOTIFY] = handle_notify, 6455 [EXIT_REASON_SEAMCALL] = handle_tdx_instruction, 6456 [EXIT_REASON_TDCALL] = handle_tdx_instruction, 6457 [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, 6458 [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, 6459 }; 6460 6461 static const int kvm_vmx_max_exit_handlers = 6462 ARRAY_SIZE(kvm_vmx_exit_handlers); 6463 6464 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6465 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6466 { 6467 struct vcpu_vmx *vmx = to_vmx(vcpu); 6468 6469 *reason = vmx->vt.exit_reason.full; 6470 *info1 = vmx_get_exit_qual(vcpu); 6471 if (!(vmx->vt.exit_reason.failed_vmentry)) { 6472 *info2 = vmx->idt_vectoring_info; 6473 *intr_info = vmx_get_intr_info(vcpu); 6474 if (is_exception_with_error_code(*intr_info)) 6475 *error_code = 
vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6476 else 6477 *error_code = 0; 6478 } else { 6479 *info2 = 0; 6480 *intr_info = 0; 6481 *error_code = 0; 6482 } 6483 } 6484 6485 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) 6486 { 6487 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 6488 if (is_exception_with_error_code(*intr_info)) 6489 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); 6490 else 6491 *error_code = 0; 6492 } 6493 6494 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6495 { 6496 if (vmx->pml_pg) { 6497 __free_page(vmx->pml_pg); 6498 vmx->pml_pg = NULL; 6499 } 6500 } 6501 6502 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6503 { 6504 struct vcpu_vmx *vmx = to_vmx(vcpu); 6505 u16 pml_idx, pml_tail_index; 6506 u64 *pml_buf; 6507 int i; 6508 6509 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6510 6511 /* Do nothing if PML buffer is empty */ 6512 if (pml_idx == PML_HEAD_INDEX) 6513 return; 6514 /* 6515 * PML index always points to the next available PML buffer entity 6516 * unless PML log has just overflowed. 6517 */ 6518 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; 6519 6520 /* 6521 * PML log is written backwards: the CPU first writes the entry 511 6522 * then the entry 510, and so on. 6523 * 6524 * Read the entries in the same order they were written, to ensure that 6525 * the dirty ring is filled in the same order the CPU wrote them. 6526 */ 6527 pml_buf = page_address(vmx->pml_pg); 6528 6529 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { 6530 u64 gpa; 6531 6532 gpa = pml_buf[i]; 6533 WARN_ON(gpa & (PAGE_SIZE - 1)); 6534 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6535 } 6536 6537 /* reset PML index */ 6538 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 6539 } 6540 6541 static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 6542 { 6543 struct vcpu_vmx *vmx = to_vmx(vcpu); 6544 6545 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map); 6546 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); 6547 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); 6548 } 6549 6550 static void vmx_dump_sel(char *name, uint32_t sel) 6551 { 6552 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6553 name, vmcs_read16(sel), 6554 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6555 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6556 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6557 } 6558 6559 static void vmx_dump_dtsel(char *name, uint32_t limit) 6560 { 6561 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6562 name, vmcs_read32(limit), 6563 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6564 } 6565 6566 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6567 { 6568 unsigned int i; 6569 struct vmx_msr_entry *e; 6570 6571 pr_err("MSR %s:\n", name); 6572 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6573 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6574 } 6575 6576 void dump_vmcs(struct kvm_vcpu *vcpu) 6577 { 6578 struct vcpu_vmx *vmx = to_vmx(vcpu); 6579 u32 vmentry_ctl, vmexit_ctl; 6580 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6581 u64 tertiary_exec_control; 6582 unsigned long cr4; 6583 int efer_slot; 6584 6585 if (!dump_invalid_vmcs) { 6586 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6587 return; 6588 } 6589 6590 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6591 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6592 
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6593 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6594 cr4 = vmcs_readl(GUEST_CR4); 6595 6596 if (cpu_has_secondary_exec_ctrls()) 6597 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6598 else 6599 secondary_exec_control = 0; 6600 6601 if (cpu_has_tertiary_exec_ctrls()) 6602 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6603 else 6604 tertiary_exec_control = 0; 6605 6606 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6607 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6608 pr_err("*** Guest State ***\n"); 6609 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6610 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6611 vmcs_readl(CR0_GUEST_HOST_MASK)); 6612 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6613 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6614 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6615 if (cpu_has_vmx_ept()) { 6616 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6617 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6618 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 6619 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6620 } 6621 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6622 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6623 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6624 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6625 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6626 vmcs_readl(GUEST_SYSENTER_ESP), 6627 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6628 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6629 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6630 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6631 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6632 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6633 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6634 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6635 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6636 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6637 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6638 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6639 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6640 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6641 else if (efer_slot >= 0) 6642 pr_err("EFER= 0x%016llx (autoload)\n", 6643 vmx->msr_autoload.guest.val[efer_slot].value); 6644 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6645 pr_err("EFER= 0x%016llx (effective)\n", 6646 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6647 else 6648 pr_err("EFER= 0x%016llx (effective)\n", 6649 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6650 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6651 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6652 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6653 vmcs_read64(GUEST_IA32_DEBUGCTL), 6654 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6655 if (cpu_has_load_perf_global_ctrl() && 6656 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6657 pr_err("PerfGlobCtl = 0x%016llx\n", 6658 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6659 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6660 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6661 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6662 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6663 vmcs_read32(GUEST_ACTIVITY_STATE)); 6664 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6665 pr_err("InterruptStatus = %04x\n", 6666 vmcs_read16(GUEST_INTR_STATUS)); 6667 if 
(vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6668 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6669 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6670 vmx_dump_msrs("autostore", &vmx->msr_autostore); 6671 6672 if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) 6673 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", 6674 vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), 6675 vmcs_readl(GUEST_INTR_SSP_TABLE)); 6676 pr_err("*** Host State ***\n"); 6677 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6678 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6679 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6680 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6681 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6682 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6683 vmcs_read16(HOST_TR_SELECTOR)); 6684 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6685 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6686 vmcs_readl(HOST_TR_BASE)); 6687 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6688 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6689 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6690 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6691 vmcs_readl(HOST_CR4)); 6692 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6693 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6694 vmcs_read32(HOST_IA32_SYSENTER_CS), 6695 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6696 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6697 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6698 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6699 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6700 if (cpu_has_load_perf_global_ctrl() && 6701 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6702 pr_err("PerfGlobCtl = 0x%016llx\n", 6703 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6704 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6705 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6706 if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) 6707 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", 6708 vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), 6709 vmcs_readl(HOST_INTR_SSP_TABLE)); 6710 6711 pr_err("*** Control State ***\n"); 6712 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6713 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6714 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6715 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6716 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6717 vmcs_read32(EXCEPTION_BITMAP), 6718 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6719 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6720 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6721 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6722 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6723 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6724 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6725 vmcs_read32(VM_EXIT_INTR_INFO), 6726 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6727 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6728 pr_err(" reason=%08x qualification=%016lx\n", 6729 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6730 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6731 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6732 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6733 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6734 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6735 pr_err("TSC Multiplier = 0x%016llx\n", 6736 vmcs_read64(TSC_MULTIPLIER)); 6737 if 
(cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6738 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6739 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6740 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6741 } 6742 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6743 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6744 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6745 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6746 } 6747 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6748 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6749 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6750 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6751 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6752 pr_err("PLE Gap=%08x Window=%08x\n", 6753 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6754 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6755 pr_err("Virtual processor ID = 0x%04x\n", 6756 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6757 if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { 6758 struct vmx_ve_information *ve_info = vmx->ve_info; 6759 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); 6760 6761 /* 6762 * If KVM is dumping the VMCS, then something has gone wrong 6763 * already. Dereferencing an address from the VMCS, which could 6764 * very well be corrupted, is a terrible idea. The virtual 6765 * address is known so use it. 6766 */ 6767 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, 6768 ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); 6769 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", 6770 ve_info->exit_reason, ve_info->delivery, 6771 ve_info->exit_qualification, 6772 ve_info->guest_linear_address, 6773 ve_info->guest_physical_address, ve_info->eptp_index); 6774 } 6775 } 6776 6777 /* 6778 * The guest has exited. See if we can fix it or if we need userspace 6779 * assistance. 6780 */ 6781 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6782 { 6783 struct vcpu_vmx *vmx = to_vmx(vcpu); 6784 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 6785 u32 vectoring_info = vmx->idt_vectoring_info; 6786 u16 exit_handler_index; 6787 6788 /* 6789 * Flush the PML buffer of logged GPAs so that dirty_bitmap is kept 6790 * up to date. Another benefit is that kvm_vm_ioctl_get_dirty_log only 6791 * needs to kick all vCPUs out of guest mode before querying 6792 * dirty_bitmap, as a vCPU in root mode must have already flushed its 6793 * PML buffer. Note, PML is never enabled in hardware while 6794 * running L2. 6795 */ 6796 if (enable_pml && !is_guest_mode(vcpu)) 6797 vmx_flush_pml_buffer(vcpu); 6798 6799 /* 6800 * KVM should never reach this point with a pending nested VM-Enter. 6801 * More specifically, short-circuiting VM-Entry to emulate L2 due to 6802 * invalid guest state should never happen as that means KVM knowingly 6803 * allowed a nested VM-Enter with an invalid vmcs12. More below. 6804 */ 6805 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) 6806 return -EIO; 6807 6808 if (is_guest_mode(vcpu)) { 6809 /* 6810 * PML is never enabled when running L2, bail immediately if a 6811 * PML full exit occurs as something is horribly wrong.
6812 */ 6813 if (exit_reason.basic == EXIT_REASON_PML_FULL) 6814 goto unexpected_vmexit; 6815 6816 /* 6817 * The host physical addresses of some pages of guest memory 6818 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 6819 * Page). The CPU may write to these pages via their host 6820 * physical address while L2 is running, bypassing any 6821 * address-translation-based dirty tracking (e.g. EPT write 6822 * protection). 6823 * 6824 * Mark them dirty on every exit from L2 to prevent them from 6825 * getting out of sync with dirty tracking. 6826 */ 6827 nested_vmx_mark_all_vmcs12_pages_dirty(vcpu); 6828 6829 /* 6830 * Synthesize a triple fault if L2 state is invalid. In normal 6831 * operation, nested VM-Enter rejects any attempt to enter L2 6832 * with invalid state. However, those checks are skipped if 6833 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If 6834 * L2 state is invalid, it means either L1 modified SMRAM state 6835 * or userspace provided bad state. Synthesize TRIPLE_FAULT as 6836 * doing so is architecturally allowed in the RSM case, and is 6837 * the least awful solution for the userspace case without 6838 * risking false positives. 6839 */ 6840 if (vmx->vt.emulation_required) { 6841 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 6842 return 1; 6843 } 6844 6845 if (nested_vmx_reflect_vmexit(vcpu)) 6846 return 1; 6847 } 6848 6849 /* If guest state is invalid, start emulating. L2 is handled above. */ 6850 if (vmx->vt.emulation_required) 6851 return handle_invalid_guest_state(vcpu); 6852 6853 if (exit_reason.failed_vmentry) { 6854 dump_vmcs(vcpu); 6855 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6856 vcpu->run->fail_entry.hardware_entry_failure_reason 6857 = exit_reason.full; 6858 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6859 return 0; 6860 } 6861 6862 if (unlikely(vmx->fail)) { 6863 dump_vmcs(vcpu); 6864 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6865 vcpu->run->fail_entry.hardware_entry_failure_reason 6866 = vmcs_read32(VM_INSTRUCTION_ERROR); 6867 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6868 return 0; 6869 } 6870 6871 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6872 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && 6873 exit_reason.basic != EXIT_REASON_EPT_VIOLATION && 6874 exit_reason.basic != EXIT_REASON_PML_FULL && 6875 exit_reason.basic != EXIT_REASON_APIC_ACCESS && 6876 exit_reason.basic != EXIT_REASON_TASK_SWITCH && 6877 exit_reason.basic != EXIT_REASON_NOTIFY && 6878 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { 6879 kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); 6880 return 0; 6881 } 6882 6883 if (unlikely(!enable_vnmi && 6884 vmx->loaded_vmcs->soft_vnmi_blocked)) { 6885 if (!vmx_interrupt_blocked(vcpu)) { 6886 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6887 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && 6888 vcpu->arch.nmi_pending) { 6889 /* 6890 * The CPU doesn't help us find the end of an 6891 * NMI-blocked window if the guest runs with IRQs 6892 * disabled. So pull the trigger after 1 s of 6893 * futile waiting, but inform the user about this.
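 * (vnmi_blocked_time above is the accumulated blocked time in nanoseconds, so 1000000000LL corresponds to the 1 s timeout.)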
6894 */ 6895 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6896 "state on VCPU %d after 1 s timeout\n", 6897 __func__, vcpu->vcpu_id); 6898 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6899 } 6900 } 6901 6902 if (exit_fastpath != EXIT_FASTPATH_NONE) 6903 return 1; 6904 6905 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6906 goto unexpected_vmexit; 6907 #ifdef CONFIG_MITIGATION_RETPOLINE 6908 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6909 return kvm_emulate_wrmsr(vcpu); 6910 else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6911 return handle_wrmsr_imm(vcpu); 6912 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6913 return handle_preemption_timer(vcpu); 6914 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6915 return handle_interrupt_window(vcpu); 6916 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6917 return handle_external_interrupt(vcpu); 6918 else if (exit_reason.basic == EXIT_REASON_HLT) 6919 return kvm_emulate_halt(vcpu); 6920 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6921 return handle_ept_misconfig(vcpu); 6922 #endif 6923 6924 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6925 kvm_vmx_max_exit_handlers); 6926 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6927 goto unexpected_vmexit; 6928 6929 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6930 6931 unexpected_vmexit: 6932 dump_vmcs(vcpu); 6933 kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); 6934 return 0; 6935 } 6936 6937 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6938 { 6939 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6940 6941 /* 6942 * Exit to user space when bus lock detected to inform that there is 6943 * a bus lock in guest. 6944 */ 6945 if (vmx_get_exit_reason(vcpu).bus_lock_detected) { 6946 if (ret > 0) 6947 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6948 6949 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6950 return 0; 6951 } 6952 return ret; 6953 } 6954 6955 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6956 { 6957 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6958 int tpr_threshold; 6959 6960 if (is_guest_mode(vcpu) && 6961 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6962 return; 6963 6964 guard(vmx_vmcs01)(vcpu); 6965 6966 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6967 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6968 } 6969 6970 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6971 { 6972 struct vcpu_vmx *vmx = to_vmx(vcpu); 6973 u32 sec_exec_control; 6974 6975 if (!lapic_in_kernel(vcpu)) 6976 return; 6977 6978 if (!flexpriority_enabled && 6979 !cpu_has_vmx_virtualize_x2apic_mode()) 6980 return; 6981 6982 guard(vmx_vmcs01)(vcpu); 6983 6984 sec_exec_control = secondary_exec_controls_get(vmx); 6985 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6986 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6987 6988 switch (kvm_get_apic_mode(vcpu)) { 6989 case LAPIC_MODE_INVALID: 6990 WARN_ONCE(true, "Invalid local APIC state"); 6991 break; 6992 case LAPIC_MODE_DISABLED: 6993 break; 6994 case LAPIC_MODE_XAPIC: 6995 if (flexpriority_enabled) { 6996 sec_exec_control |= 6997 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6998 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6999 7000 /* 7001 * Flush the TLB, reloading the APIC access page will 7002 * only do so if its physical address has changed, but 7003 * the guest may have inserted a non-APIC mapping into 7004 * the TLB while the APIC access page was disabled. 
7005 * 7006 * If L2 is active, immediately flush L1's TLB instead 7007 * of requesting a flush of the current TLB, because 7008 * the current TLB context is L2's. 7009 */ 7010 if (!is_guest_mode(vcpu)) 7011 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 7012 else if (!enable_ept) 7013 vpid_sync_context(vmx->vpid); 7014 else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa)) 7015 vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa); 7016 } 7017 break; 7018 case LAPIC_MODE_X2APIC: 7019 if (cpu_has_vmx_virtualize_x2apic_mode()) 7020 sec_exec_control |= 7021 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 7022 break; 7023 } 7024 secondary_exec_controls_set(vmx, sec_exec_control); 7025 7026 vmx_update_msr_bitmap_x2apic(vcpu); 7027 } 7028 7029 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 7030 { 7031 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 7032 struct kvm *kvm = vcpu->kvm; 7033 struct kvm_memslots *slots = kvm_memslots(kvm); 7034 struct kvm_memory_slot *slot; 7035 struct page *refcounted_page; 7036 unsigned long mmu_seq; 7037 kvm_pfn_t pfn; 7038 bool writable; 7039 7040 /* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */ 7041 guard(vmx_vmcs01)(vcpu); 7042 7043 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 7044 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 7045 return; 7046 7047 /* 7048 * Explicitly grab the memslot using KVM's internal slot ID to ensure 7049 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 7050 * be impossible for userspace to create a memslot for the APIC when 7051 * APICv is enabled, but paranoia won't hurt in this case. 7052 */ 7053 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 7054 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 7055 return; 7056 7057 /* 7058 * Ensure that the mmu_notifier sequence count is read before KVM 7059 * retrieves the pfn from the primary MMU. Note, the memslot is 7060 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 7061 * in kvm_mmu_invalidate_end(). 7062 */ 7063 mmu_seq = kvm->mmu_invalidate_seq; 7064 smp_rmb(); 7065 7066 /* 7067 * No need to retry if the memslot does not exist or is invalid. KVM 7068 * controls the APIC-access page memslot, and only deletes the memslot 7069 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 7070 */ 7071 pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); 7072 if (is_error_noslot_pfn(pfn)) 7073 return; 7074 7075 read_lock(&vcpu->kvm->mmu_lock); 7076 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) 7077 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 7078 else 7079 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 7080 7081 /* 7082 * Do not pin the APIC access page in memory so that it can be freely 7083 * migrated, the MMU notifier will call us again if it is migrated or 7084 * swapped out. KVM backs the memslot with anonymous memory, the pfn 7085 * should always point at a refcounted page (if the pfn is valid). 7086 */ 7087 if (!WARN_ON_ONCE(!refcounted_page)) 7088 kvm_release_page_clean(refcounted_page); 7089 7090 /* 7091 * No need for a manual TLB flush at this point, KVM has already done a 7092 * flush if there were SPTEs pointing at the previous page. 
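 * (That flush happens when the SPTEs for the old page are zapped, e.g. by the mmu_notifier invalidation that caused KVM_REQ_APIC_PAGE_RELOAD to be set.)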
7093 */ 7094 read_unlock(&vcpu->kvm->mmu_lock); 7095 } 7096 7097 void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 7098 { 7099 u16 status; 7100 u8 old; 7101 7102 if (max_isr == -1) 7103 max_isr = 0; 7104 7105 /* 7106 * Always update SVI in vmcs01, as SVI is only relevant for L2 if and 7107 * only if Virtual Interrupt Delivery is enabled in vmcs12, and if VID 7108 * is enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC. 7109 */ 7110 guard(vmx_vmcs01)(vcpu); 7111 7112 status = vmcs_read16(GUEST_INTR_STATUS); 7113 old = status >> 8; 7114 if (max_isr != old) { 7115 status &= 0xff; 7116 status |= max_isr << 8; 7117 vmcs_write16(GUEST_INTR_STATUS, status); 7118 } 7119 } 7120 7121 static void vmx_set_rvi(int vector) 7122 { 7123 u16 status; 7124 u8 old; 7125 7126 if (vector == -1) 7127 vector = 0; 7128 7129 status = vmcs_read16(GUEST_INTR_STATUS); 7130 old = (u8)status & 0xff; 7131 if ((u8)vector != old) { 7132 status &= ~0xff; 7133 status |= (u8)vector; 7134 vmcs_write16(GUEST_INTR_STATUS, status); 7135 } 7136 } 7137 7138 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 7139 { 7140 struct vcpu_vt *vt = to_vt(vcpu); 7141 int max_irr; 7142 bool got_posted_interrupt; 7143 7144 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 7145 return -EIO; 7146 7147 if (pi_test_on(&vt->pi_desc)) { 7148 pi_clear_on(&vt->pi_desc); 7149 /* 7150 * IOMMU can write to PID.ON, so the barrier matters even on UP. 7151 * But on x86 this is just a compiler barrier anyway. 7152 */ 7153 smp_mb__after_atomic(); 7154 got_posted_interrupt = 7155 kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); 7156 } else { 7157 max_irr = kvm_lapic_find_highest_irr(vcpu); 7158 got_posted_interrupt = false; 7159 } 7160 7161 /* 7162 * Newly recognized interrupts are injected via either virtual interrupt 7163 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 7164 * disabled in two cases: 7165 * 7166 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 7167 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 7168 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected 7169 * into L2, but KVM doesn't use virtual interrupt delivery to inject 7170 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 7171 * 7172 * 2) If APICv is disabled for this vCPU, assigned devices may still 7173 * attempt to post interrupts. The posted interrupt vector will cause 7174 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 7175 */ 7176 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 7177 vmx_set_rvi(max_irr); 7178 else if (got_posted_interrupt) 7179 kvm_make_request(KVM_REQ_EVENT, vcpu); 7180 7181 return max_irr; 7182 } 7183 7184 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 7185 { 7186 if (!kvm_vcpu_apicv_active(vcpu)) 7187 return; 7188 7189 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 7190 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 7191 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 7192 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 7193 } 7194 7195 void vmx_do_interrupt_irqoff(unsigned long entry); 7196 void vmx_do_nmi_irqoff(void); 7197 7198 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 7199 { 7200 /* 7201 * Save xfd_err to guest_fpu before interrupt is enabled, so the 7202 * MSR value is not clobbered by the host activity before the guest 7203 * has chance to consume it. 
7204 * 7205 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM 7206 * interception may have been caused by L1 interception. Per the SDM, 7207 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. 7208 * 7209 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. 7210 * unlike CR2 and DR6, the value is not a payload that is attached to 7211 * the #NM exception. 7212 */ 7213 if (is_xfd_nm_fault(vcpu)) 7214 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 7215 } 7216 7217 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 7218 { 7219 /* if exit due to PF check for async PF */ 7220 if (is_page_fault(intr_info)) 7221 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 7222 /* if exit due to NM, handle before interrupts are enabled */ 7223 else if (is_nm_fault(intr_info)) 7224 handle_nm_fault_irqoff(vcpu); 7225 /* Handle machine checks before interrupts are enabled */ 7226 else if (is_machine_check(intr_info)) 7227 kvm_machine_check(); 7228 } 7229 7230 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 7231 u32 intr_info) 7232 { 7233 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 7234 7235 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 7236 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 7237 return; 7238 7239 /* 7240 * Invoke the kernel's IRQ handler for the vector. Use the FRED path 7241 * when it's available even if FRED isn't fully enabled, e.g. even if 7242 * FRED isn't supported in hardware, in order to avoid the indirect 7243 * CALL in the non-FRED path. 7244 */ 7245 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7246 if (IS_ENABLED(CONFIG_X86_FRED)) 7247 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7248 else 7249 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7250 kvm_after_interrupt(vcpu); 7251 7252 vcpu->arch.at_instruction_boundary = true; 7253 } 7254 7255 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7256 { 7257 if (to_vt(vcpu)->emulation_required) 7258 return; 7259 7260 switch (vmx_get_exit_reason(vcpu).basic) { 7261 case EXIT_REASON_EXTERNAL_INTERRUPT: 7262 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7263 break; 7264 case EXIT_REASON_EXCEPTION_NMI: 7265 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7266 break; 7267 case EXIT_REASON_MCE_DURING_VMENTRY: 7268 kvm_machine_check(); 7269 break; 7270 default: 7271 break; 7272 } 7273 } 7274 7275 /* 7276 * The kvm parameter can be NULL (module initialization, or invocation before 7277 * VM creation). Be sure to check the kvm parameter before using it. 7278 */ 7279 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7280 { 7281 switch (index) { 7282 case MSR_IA32_SMBASE: 7283 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7284 return false; 7285 /* 7286 * We cannot do SMM unless we can run the guest in big 7287 * real mode. 7288 */ 7289 return enable_unrestricted_guest || emulate_invalid_guest_state; 7290 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7291 return nested; 7292 case MSR_AMD64_VIRT_SPEC_CTRL: 7293 case MSR_AMD64_TSC_RATIO: 7294 /* This is AMD only. 
*/ 7295 return false; 7296 default: 7297 return true; 7298 } 7299 } 7300 7301 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7302 { 7303 u32 exit_intr_info; 7304 bool unblock_nmi; 7305 u8 vector; 7306 bool idtv_info_valid; 7307 7308 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7309 7310 if (enable_vnmi) { 7311 if (vmx->loaded_vmcs->nmi_known_unmasked) 7312 return; 7313 7314 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7315 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7316 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7317 /* 7318 * SDM 3: 27.7.1.2 (September 2008) 7319 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7320 * a guest IRET fault. 7321 * SDM 3: 23.2.2 (September 2008) 7322 * Bit 12 is undefined in any of the following cases: 7323 * If the VM exit sets the valid bit in the IDT-vectoring 7324 * information field. 7325 * If the VM exit is due to a double fault. 7326 */ 7327 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7328 vector != DF_VECTOR && !idtv_info_valid) 7329 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7330 GUEST_INTR_STATE_NMI); 7331 else 7332 vmx->loaded_vmcs->nmi_known_unmasked = 7333 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7334 & GUEST_INTR_STATE_NMI); 7335 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7336 vmx->loaded_vmcs->vnmi_blocked_time += 7337 ktime_to_ns(ktime_sub(ktime_get(), 7338 vmx->loaded_vmcs->entry_time)); 7339 } 7340 7341 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7342 u32 idt_vectoring_info, 7343 int instr_len_field, 7344 int error_code_field) 7345 { 7346 u8 vector; 7347 int type; 7348 bool idtv_info_valid; 7349 7350 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7351 7352 vcpu->arch.nmi_injected = false; 7353 kvm_clear_exception_queue(vcpu); 7354 kvm_clear_interrupt_queue(vcpu); 7355 7356 if (!idtv_info_valid) 7357 return; 7358 7359 kvm_make_request(KVM_REQ_EVENT, vcpu); 7360 7361 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7362 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7363 7364 switch (type) { 7365 case INTR_TYPE_NMI_INTR: 7366 vcpu->arch.nmi_injected = true; 7367 /* 7368 * SDM 3: 27.7.1.2 (September 2008) 7369 * Clear bit "block by NMI" before VM entry if a NMI 7370 * delivery faulted. 
7371 */ 7372 vmx_set_nmi_mask(vcpu, false); 7373 break; 7374 case INTR_TYPE_SOFT_EXCEPTION: 7375 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7376 fallthrough; 7377 case INTR_TYPE_HARD_EXCEPTION: { 7378 u32 error_code = 0; 7379 7380 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) 7381 error_code = vmcs_read32(error_code_field); 7382 7383 kvm_requeue_exception(vcpu, vector, 7384 idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, 7385 error_code); 7386 break; 7387 } 7388 case INTR_TYPE_SOFT_INTR: 7389 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7390 fallthrough; 7391 case INTR_TYPE_EXT_INTR: 7392 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7393 break; 7394 default: 7395 break; 7396 } 7397 } 7398 7399 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7400 { 7401 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7402 VM_EXIT_INSTRUCTION_LEN, 7403 IDT_VECTORING_ERROR_CODE); 7404 } 7405 7406 void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7407 { 7408 __vmx_complete_interrupts(vcpu, 7409 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7410 VM_ENTRY_INSTRUCTION_LEN, 7411 VM_ENTRY_EXCEPTION_ERROR_CODE); 7412 7413 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7414 } 7415 7416 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7417 { 7418 int i, nr_msrs; 7419 struct perf_guest_switch_msr *msrs; 7420 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7421 7422 if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu)) 7423 return; 7424 7425 pmu->host_cross_mapped_mask = 0; 7426 if (pmu->pebs_enable & pmu->global_ctrl) 7427 intel_pmu_cross_mapped_check(pmu); 7428 7429 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ 7430 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7431 if (!msrs) 7432 return; 7433 7434 for (i = 0; i < nr_msrs; i++) 7435 if (msrs[i].host == msrs[i].guest) 7436 clear_atomic_switch_msr(vmx, msrs[i].msr); 7437 else 7438 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7439 msrs[i].host); 7440 } 7441 7442 static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu) 7443 { 7444 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 7445 struct vcpu_vmx *vmx = to_vmx(vcpu); 7446 7447 if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) 7448 return; 7449 7450 if (!cpu_has_save_perf_global_ctrl()) { 7451 int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, 7452 MSR_CORE_PERF_GLOBAL_CTRL); 7453 7454 if (WARN_ON_ONCE(slot < 0)) 7455 return; 7456 7457 pmu->global_ctrl = vmx->msr_autostore.val[slot].value; 7458 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl); 7459 return; 7460 } 7461 7462 pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); 7463 } 7464 7465 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7466 { 7467 struct vcpu_vmx *vmx = to_vmx(vcpu); 7468 u64 tscl; 7469 u32 delta_tsc; 7470 7471 if (force_immediate_exit) { 7472 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7473 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7474 } else if (vmx->hv_deadline_tsc != -1) { 7475 tscl = rdtsc(); 7476 if (vmx->hv_deadline_tsc > tscl) 7477 /* set_hv_timer ensures the delta fits in 32-bits */ 7478 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7479 cpu_preemption_timer_multi); 7480 else 7481 delta_tsc = 0; 7482 7483 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7484 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7485 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7486 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 
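/*
 * Writing the all-ones value parks the timer at its maximum count, i.e.
 * soft-disables it; the flag set below merely records that state so
 * subsequent runs can skip the redundant VMWRITE.
 */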
7487 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7488 } 7489 } 7490 7491 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7492 { 7493 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7494 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7495 vmcs_writel(HOST_RSP, host_rsp); 7496 } 7497 } 7498 7499 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7500 unsigned int flags) 7501 { 7502 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7503 7504 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7505 return; 7506 7507 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7508 vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); 7509 7510 /* 7511 * If the guest/host SPEC_CTRL values differ, restore the host value. 7512 * 7513 * For legacy IBRS, the IBRS bit always needs to be written after 7514 * transitioning from a less privileged predictor mode, regardless of 7515 * whether the guest/host values differ. 7516 */ 7517 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7518 vmx->spec_ctrl != hostval) 7519 native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); 7520 7521 barrier_nospec(); 7522 } 7523 7524 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, 7525 bool force_immediate_exit) 7526 { 7527 /* 7528 * If L2 is active, some VMX preemption timer exits can be handled in 7529 * the fastpath even, all other exits must use the slow path. 7530 */ 7531 if (is_guest_mode(vcpu) && 7532 vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) 7533 return EXIT_FASTPATH_NONE; 7534 7535 switch (vmx_get_exit_reason(vcpu).basic) { 7536 case EXIT_REASON_MSR_WRITE: 7537 return handle_fastpath_wrmsr(vcpu); 7538 case EXIT_REASON_MSR_WRITE_IMM: 7539 return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 7540 vmx_get_msr_imm_reg(vcpu)); 7541 case EXIT_REASON_PREEMPTION_TIMER: 7542 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7543 case EXIT_REASON_HLT: 7544 return handle_fastpath_hlt(vcpu); 7545 case EXIT_REASON_INVD: 7546 return handle_fastpath_invd(vcpu); 7547 default: 7548 return EXIT_FASTPATH_NONE; 7549 } 7550 } 7551 7552 noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) 7553 { 7554 if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || 7555 !is_nmi(vmx_get_intr_info(vcpu))) 7556 return; 7557 7558 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 7559 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7560 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); 7561 else 7562 vmx_do_nmi_irqoff(); 7563 kvm_after_interrupt(vcpu); 7564 } 7565 7566 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7567 unsigned int flags) 7568 { 7569 struct vcpu_vmx *vmx = to_vmx(vcpu); 7570 7571 guest_state_enter_irqoff(); 7572 7573 vmx_l1d_flush(vcpu); 7574 7575 vmx_disable_fb_clear(vmx); 7576 7577 if (vcpu->arch.cr2 != native_read_cr2()) 7578 native_write_cr2(vcpu->arch.cr2); 7579 7580 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7581 flags); 7582 7583 vcpu->arch.cr2 = native_read_cr2(); 7584 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7585 7586 vmx->idt_vectoring_info = 0; 7587 7588 vmx_enable_fb_clear(vmx); 7589 7590 if (unlikely(vmx->fail)) { 7591 vmx->vt.exit_reason.full = 0xdead; 7592 goto out; 7593 } 7594 7595 vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7596 if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) 7597 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7598 7599 vmx_handle_nmi(vcpu); 7600 7601 out: 7602 guest_state_exit_irqoff(); 7603 } 7604 7605 
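/*
 * Outermost VM-Enter path: flush dirty guest state to the VMCS, enter the
 * guest via vmx_vcpu_enter_exit(), then perform the VM-Exit processing
 * that must run while IRQs are still disabled.
 */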
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 7606 { 7607 bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; 7608 struct vcpu_vmx *vmx = to_vmx(vcpu); 7609 unsigned long cr3, cr4; 7610 7611 /* Record the guest's net vcpu time for enforced NMI injections. */ 7612 if (unlikely(!enable_vnmi && 7613 vmx->loaded_vmcs->soft_vnmi_blocked)) 7614 vmx->loaded_vmcs->entry_time = ktime_get(); 7615 7616 /* 7617 * Don't enter VMX if guest state is invalid, let the exit handler 7618 * start emulation until we arrive back to a valid state. Synthesize a 7619 * consistency check VM-Exit due to invalid guest state and bail. 7620 */ 7621 if (unlikely(vmx->vt.emulation_required)) { 7622 vmx->fail = 0; 7623 7624 vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; 7625 vmx->vt.exit_reason.failed_vmentry = 1; 7626 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7627 vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; 7628 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7629 vmx->vt.exit_intr_info = 0; 7630 return EXIT_FASTPATH_NONE; 7631 } 7632 7633 trace_kvm_entry(vcpu, force_immediate_exit); 7634 7635 if (vmx->ple_window_dirty) { 7636 vmx->ple_window_dirty = false; 7637 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7638 } 7639 7640 /* 7641 * We did this in prepare_switch_to_guest, because it needs to 7642 * be within srcu_read_lock. 7643 */ 7644 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7645 7646 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7647 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7648 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7649 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7650 vcpu->arch.regs_dirty = 0; 7651 7652 if (run_flags & KVM_RUN_LOAD_GUEST_DR6) 7653 set_debugreg(vcpu->arch.dr6, 6); 7654 7655 if (run_flags & KVM_RUN_LOAD_DEBUGCTL) 7656 vmx_reload_guest_debugctl(vcpu); 7657 7658 /* 7659 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7660 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7661 * it switches back to the current->mm, which can occur in KVM context 7662 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7663 * toggles a static key while handling a VM-Exit. 7664 */ 7665 cr3 = __get_current_cr3_fast(); 7666 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7667 vmcs_writel(HOST_CR3, cr3); 7668 vmx->loaded_vmcs->host_state.cr3 = cr3; 7669 } 7670 7671 cr4 = cr4_read_shadow(); 7672 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7673 vmcs_writel(HOST_CR4, cr4); 7674 vmx->loaded_vmcs->host_state.cr4 = cr4; 7675 } 7676 7677 /* When single-stepping over STI and MOV SS, we must clear the 7678 * corresponding interruptibility bits in the guest state. Otherwise 7679 * vmentry fails as it then expects bit 14 (BS) in pending debug 7680 * exceptions being set, but that's not correct for the guest debugging 7681 * case. */ 7682 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7683 vmx_set_interrupt_shadow(vcpu, 0); 7684 7685 pt_guest_enter(vmx); 7686 7687 atomic_switch_perf_msrs(vmx); 7688 if (intel_pmu_lbr_is_enabled(vcpu)) 7689 vmx_passthrough_lbr_msrs(vcpu); 7690 7691 if (enable_preemption_timer) 7692 vmx_update_hv_timer(vcpu, force_immediate_exit); 7693 else if (force_immediate_exit) 7694 smp_send_reschedule(vcpu->cpu); 7695 7696 kvm_wait_lapic_expire(vcpu); 7697 7698 /* The actual VMENTER/EXIT is in the .noinstr.text section. 
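 * Code there must not be instrumented (e.g. by tracing or KASAN), as the
 * normal host context (RCU, lockdep, etc.) isn't re-established until
 * guest_state_exit_irqoff().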
*/
7699 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7700
7701 /* All fields are clean at this point */
7702 if (kvm_is_using_evmcs()) {
7703 current_evmcs->hv_clean_fields |=
7704 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7705
7706 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7707 }
7708
7709 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7710 if (vcpu->arch.host_debugctl)
7711 update_debugctlmsr(vcpu->arch.host_debugctl);
7712
7713 #ifndef CONFIG_X86_64
7714 /*
7715 * The sysexit path does not restore ds/es, so we must set them to
7716 * a reasonable value ourselves.
7717 *
7718 * We can't defer this to vmx_prepare_switch_to_host() since that
7719 * function may be executed in interrupt context, which saves and
7720 * restores segments around it, nullifying its effect.
7721 */
7722 loadsegment(ds, __USER_DS);
7723 loadsegment(es, __USER_DS);
7724 #endif
7725
7726 pt_guest_exit(vmx);
7727
7728 if (is_guest_mode(vcpu)) {
7729 /*
7730 * Track VMLAUNCH/VMRESUME that have made it past guest state
7731 * checking.
7732 */
7733 if (vmx->nested.nested_run_pending &&
7734 !vmx_get_exit_reason(vcpu).failed_vmentry)
7735 ++vcpu->stat.nested_run;
7736
7737 vmx->nested.nested_run_pending = 0;
7738 }
7739
7740 if (unlikely(vmx->fail))
7741 return EXIT_FASTPATH_NONE;
7742
7743 trace_kvm_exit(vcpu, KVM_ISA_VMX);
7744
7745 if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
7746 return EXIT_FASTPATH_NONE;
7747
7748 vmx->loaded_vmcs->launched = 1;
7749
7750 vmx_refresh_guest_perf_global_control(vcpu);
7751
7752 vmx_recover_nmi_blocking(vmx);
7753 vmx_complete_interrupts(vmx);
7754
7755 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
7756 }
7757
7758 void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7759 {
7760 struct vcpu_vmx *vmx = to_vmx(vcpu);
7761
7762 if (enable_pml)
7763 vmx_destroy_pml_buffer(vmx);
7764 free_vpid(vmx->vpid);
7765 nested_vmx_free_vcpu(vcpu);
7766 free_loaded_vmcs(vmx->loaded_vmcs);
7767 free_page((unsigned long)vmx->ve_info);
7768 }
7769
7770 int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7771 {
7772 struct vmx_uret_msr *tsx_ctrl;
7773 struct vcpu_vmx *vmx;
7774 int i, err;
7775
7776 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7777 vmx = to_vmx(vcpu);
7778
7779 INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
7780
7781 err = -ENOMEM;
7782
7783 vmx->vpid = allocate_vpid();
7784
7785 /*
7786 * If PML is turned on, a failure to enable PML simply results in a
7787 * failure to create the vCPU, which keeps the PML logic simple (no
7788 * need to handle cases such as PML being enabled for only some of
7789 * the guest's vCPUs).
7790 */
7791 if (enable_pml) {
7792 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7793 if (!vmx->pml_pg)
7794 goto free_vpid;
7795 }
7796
7797 for (i = 0; i < kvm_nr_uret_msrs; ++i)
7798 vmx->guest_uret_msrs[i].mask = -1ull;
7799 if (boot_cpu_has(X86_FEATURE_RTM)) {
7800 /*
7801 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7802 * Keep the host value unchanged to avoid changing CPUID bits
7803 * under the host kernel's feet.
7804 */
7805 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7806 if (tsx_ctrl)
7807 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7808 }
7809
7810 err = alloc_loaded_vmcs(&vmx->vmcs01);
7811 if (err < 0)
7812 goto free_pml;
7813
7814 /*
7815 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7816 * nested (L1) hypervisor and Hyper-V in L0 supports it.
Enable the 7817 * feature only for vmcs01, KVM currently isn't equipped to realize any 7818 * performance benefits from enabling it for vmcs02. 7819 */ 7820 if (kvm_is_using_evmcs() && 7821 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7822 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7823 7824 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7825 } 7826 7827 vmx->loaded_vmcs = &vmx->vmcs01; 7828 7829 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7830 err = kvm_alloc_apic_access_page(vcpu->kvm); 7831 if (err) 7832 goto free_vmcs; 7833 } 7834 7835 if (enable_ept && !enable_unrestricted_guest) { 7836 err = init_rmode_identity_map(vcpu->kvm); 7837 if (err) 7838 goto free_vmcs; 7839 } 7840 7841 err = -ENOMEM; 7842 if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { 7843 struct page *page; 7844 7845 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); 7846 7847 /* ve_info must be page aligned. */ 7848 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7849 if (!page) 7850 goto free_vmcs; 7851 7852 vmx->ve_info = page_to_virt(page); 7853 } 7854 7855 if (vmx_can_use_ipiv(vcpu)) 7856 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7857 __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); 7858 7859 return 0; 7860 7861 free_vmcs: 7862 free_loaded_vmcs(vmx->loaded_vmcs); 7863 free_pml: 7864 vmx_destroy_pml_buffer(vmx); 7865 free_vpid: 7866 free_vpid(vmx->vpid); 7867 return err; 7868 } 7869 7870 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7871 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7872 7873 int vmx_vm_init(struct kvm *kvm) 7874 { 7875 if (!ple_gap) 7876 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 7877 7878 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7879 switch (l1tf_mitigation) { 7880 case L1TF_MITIGATION_OFF: 7881 case L1TF_MITIGATION_FLUSH_NOWARN: 7882 /* 'I explicitly don't care' is set */ 7883 break; 7884 case L1TF_MITIGATION_AUTO: 7885 case L1TF_MITIGATION_FLUSH: 7886 case L1TF_MITIGATION_FLUSH_NOSMT: 7887 case L1TF_MITIGATION_FULL: 7888 /* 7889 * Warn upon starting the first VM in a potentially 7890 * insecure environment. 7891 */ 7892 if (sched_smt_active()) 7893 pr_warn_once(L1TF_MSG_SMT); 7894 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7895 pr_warn_once(L1TF_MSG_L1D); 7896 break; 7897 case L1TF_MITIGATION_FULL_FORCE: 7898 /* Flush is enforced */ 7899 break; 7900 } 7901 } 7902 7903 if (enable_pml) 7904 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; 7905 return 0; 7906 } 7907 7908 static inline bool vmx_ignore_guest_pat(struct kvm *kvm) 7909 { 7910 /* 7911 * Non-coherent DMA devices need the guest to flush CPU properly. 7912 * In that case it is not possible to map all guest RAM as WB, so 7913 * always trust guest PAT. 7914 */ 7915 return !kvm_arch_has_noncoherent_dma(kvm) && 7916 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); 7917 } 7918 7919 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7920 { 7921 /* 7922 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7923 * with cacheable accesses will result in Machine Checks. 
7924 */ 7925 if (is_mmio) 7926 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7927 7928 /* Force WB if ignoring guest PAT */ 7929 if (vmx_ignore_guest_pat(vcpu->kvm)) 7930 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7931 7932 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7933 } 7934 7935 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7936 { 7937 /* 7938 * These bits in the secondary execution controls field 7939 * are dynamic, the others are mostly based on the hypervisor 7940 * architecture and the guest's CPUID. Do not touch the 7941 * dynamic bits. 7942 */ 7943 u32 mask = 7944 SECONDARY_EXEC_SHADOW_VMCS | 7945 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7946 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7947 SECONDARY_EXEC_DESC; 7948 7949 u32 cur_ctl = secondary_exec_controls_get(vmx); 7950 7951 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7952 } 7953 7954 /* 7955 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 7956 * (indicating "allowed-1") if they are supported in the guest's CPUID. 7957 */ 7958 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7959 { 7960 struct vcpu_vmx *vmx = to_vmx(vcpu); 7961 struct kvm_cpuid_entry2 *entry; 7962 7963 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7964 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7965 7966 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7967 if (entry && (entry->_reg & (_cpuid_mask))) \ 7968 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7969 } while (0) 7970 7971 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7972 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7973 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7974 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7975 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7976 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7977 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7978 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7979 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7980 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7981 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7982 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7983 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7984 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7985 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7986 7987 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7988 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7989 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7990 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7991 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7992 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7993 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7994 cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK)); 7995 cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT)); 7996 7997 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7998 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); 7999 8000 #undef cr4_fixed1_update 8001 } 8002 8003 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 8004 { 8005 struct vcpu_vmx *vmx = to_vmx(vcpu); 8006 struct kvm_cpuid_entry2 *best = NULL; 8007 int i; 8008 8009 for (i = 0; i < PT_CPUID_LEAVES; i++) { 8010 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 8011 if (!best) 8012 return; 8013 vmx->pt_desc.caps[CPUID_EAX + 
i*PT_CPUID_REGS_NUM] = best->eax;
8014 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
8015 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
8016 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
8017 }
8018
8019 /* Get the number of configurable Address Ranges for filtering */
8020 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
8021 PT_CAP_num_address_ranges);
8022
8023 /* Initialize the RTIT_CTL bitmask, clearing bits with no CPUID dependency */
8024 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
8025 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
8026 RTIT_CTL_BRANCH_EN);
8027
8028 /*
8029 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
8030 * a #GP will be injected.
8031 */
8032 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
8033 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
8034
8035 /*
8036 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
8037 * PSBFreq can be set
8038 */
8039 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
8040 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
8041 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
8042
8043 /*
8044 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
8045 */
8046 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
8047 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
8048 RTIT_CTL_MTC_RANGE);
8049
8050 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
8051 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
8052 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
8053 RTIT_CTL_PTW_EN);
8054
8055 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
8056 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
8057 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
8058
8059 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
8060 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
8061 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
8062
8063 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
8064 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
8065 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
8066
8067 /* Unmask the address range configuration area */
8068 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
8069 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
8070 }
8071
8072 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
8073 {
8074 struct vcpu_vmx *vmx = to_vmx(vcpu);
8075
8076 /*
8077 * XSAVES is effectively enabled if and only if XSAVE is also exposed
8078 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
8079 * set if and only if XSAVE is supported.
8080 */ 8081 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) 8082 guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); 8083 8084 vmx_setup_uret_msrs(vmx); 8085 8086 if (cpu_has_secondary_exec_ctrls()) 8087 vmcs_set_secondary_exec_control(vmx, 8088 vmx_secondary_exec_control(vmx)); 8089 8090 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 8091 vmx->msr_ia32_feature_control_valid_bits |= 8092 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 8093 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 8094 else 8095 vmx->msr_ia32_feature_control_valid_bits &= 8096 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 8097 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 8098 8099 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 8100 nested_vmx_cr_fixed1_bits_update(vcpu); 8101 8102 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 8103 guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) 8104 update_intel_pt_cfg(vcpu); 8105 8106 if (boot_cpu_has(X86_FEATURE_RTM)) { 8107 struct vmx_uret_msr *msr; 8108 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 8109 if (msr) { 8110 bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); 8111 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); 8112 } 8113 } 8114 8115 set_cr4_guest_host_mask(vmx); 8116 8117 vmx_write_encls_bitmap(vcpu, NULL); 8118 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) 8119 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 8120 else 8121 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 8122 8123 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 8124 vmx->msr_ia32_feature_control_valid_bits |= 8125 FEAT_CTL_SGX_LC_ENABLED; 8126 else 8127 vmx->msr_ia32_feature_control_valid_bits &= 8128 ~FEAT_CTL_SGX_LC_ENABLED; 8129 8130 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 8131 vmx_update_exception_bitmap(vcpu); 8132 } 8133 8134 static __init u64 vmx_get_perf_capabilities(void) 8135 { 8136 u64 perf_cap = PERF_CAP_FW_WRITES; 8137 u64 host_perf_cap = 0; 8138 8139 if (!enable_pmu) 8140 return 0; 8141 8142 if (boot_cpu_has(X86_FEATURE_PDCM)) 8143 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 8144 8145 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) && 8146 !enable_mediated_pmu) { 8147 x86_perf_get_lbr(&vmx_lbr_caps); 8148 8149 /* 8150 * KVM requires LBR callstack support, as the overhead due to 8151 * context switching LBRs without said support is too high. 8152 * See intel_pmu_create_guest_lbr_event() for more info. 8153 */ 8154 if (!vmx_lbr_caps.has_callstack) 8155 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 8156 else if (vmx_lbr_caps.nr) 8157 perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; 8158 } 8159 8160 if (vmx_pebs_supported()) { 8161 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 8162 8163 /* 8164 * Disallow adaptive PEBS as it is functionally broken, can be 8165 * used by the guest to read *host* LBRs, and can be used to 8166 * bypass userspace event filters. To correctly and safely 8167 * support adaptive PEBS, KVM needs to: 8168 * 8169 * 1. Account for the ADAPTIVE flag when (re)programming fixed 8170 * counters. 8171 * 8172 * 2. Gain support from perf (or take direct control of counter 8173 * programming) to support events without adaptive PEBS 8174 * enabled for the hardware counter. 8175 * 8176 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 8177 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 8178 * 8179 * 4. Document which PMU events are effectively exposed to the 8180 * guest via adaptive PEBS, and make adaptive PEBS mutually 8181 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
8182 */ 8183 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 8184 } 8185 8186 return perf_cap; 8187 } 8188 8189 static __init void vmx_set_cpu_caps(void) 8190 { 8191 kvm_initialize_cpu_caps(); 8192 8193 /* CPUID 0x1 */ 8194 if (nested) 8195 kvm_cpu_cap_set(X86_FEATURE_VMX); 8196 8197 /* CPUID 0x7 */ 8198 if (kvm_mpx_supported()) 8199 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 8200 if (!cpu_has_vmx_invpcid()) 8201 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 8202 if (vmx_pt_mode_is_host_guest()) 8203 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 8204 if (vmx_pebs_supported()) { 8205 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 8206 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 8207 } 8208 8209 if (!enable_pmu) 8210 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 8211 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 8212 8213 if (!enable_sgx) { 8214 kvm_cpu_cap_clear(X86_FEATURE_SGX); 8215 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 8216 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 8217 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 8218 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 8219 } 8220 8221 if (vmx_umip_emulated()) 8222 kvm_cpu_cap_set(X86_FEATURE_UMIP); 8223 8224 /* CPUID 0xD.1 */ 8225 if (!cpu_has_vmx_xsaves()) 8226 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 8227 8228 /* CPUID 0x80000001 and 0x7 (RDPID) */ 8229 if (!cpu_has_vmx_rdtscp()) { 8230 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 8231 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 8232 } 8233 8234 if (cpu_has_vmx_waitpkg()) 8235 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 8236 8237 /* 8238 * Disable CET if unrestricted_guest is unsupported as KVM doesn't 8239 * enforce CET HW behaviors in emulator. On platforms with 8240 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code 8241 * fails, so disable CET in this case too. 8242 */ 8243 if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || 8244 !cpu_has_vmx_basic_no_hw_errcode_cc()) { 8245 kvm_cpu_cap_clear(X86_FEATURE_SHSTK); 8246 kvm_cpu_cap_clear(X86_FEATURE_IBT); 8247 } 8248 8249 kvm_setup_xss_caps(); 8250 kvm_finalize_cpu_caps(); 8251 } 8252 8253 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, 8254 struct x86_instruction_info *info, 8255 unsigned long *exit_qualification) 8256 { 8257 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8258 unsigned short port; 8259 int size; 8260 bool imm; 8261 8262 /* 8263 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 8264 * VM-exits depend on the 'unconditional IO exiting' VM-execution 8265 * control. 8266 * 8267 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 
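 *
 * The exit qualification synthesized below encodes the access size
 * minus one in the low bits and the port number in bits 31:16, and
 * sets bit 4 for string instructions, bit 5 for a REP prefix and
 * bit 6 for an immediate port operand.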
8268 */ 8269 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8270 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 8271 8272 if (info->intercept == x86_intercept_in || 8273 info->intercept == x86_intercept_ins) { 8274 port = info->src_val; 8275 size = info->dst_bytes; 8276 imm = info->src_type == OP_IMM; 8277 } else { 8278 port = info->dst_val; 8279 size = info->src_bytes; 8280 imm = info->dst_type == OP_IMM; 8281 } 8282 8283 8284 *exit_qualification = ((unsigned long)port << 16) | (size - 1); 8285 8286 if (info->intercept == x86_intercept_ins || 8287 info->intercept == x86_intercept_outs) 8288 *exit_qualification |= BIT(4); 8289 8290 if (info->rep_prefix) 8291 *exit_qualification |= BIT(5); 8292 8293 if (imm) 8294 *exit_qualification |= BIT(6); 8295 8296 return nested_vmx_check_io_bitmaps(vcpu, port, size); 8297 } 8298 8299 int vmx_check_intercept(struct kvm_vcpu *vcpu, 8300 struct x86_instruction_info *info, 8301 enum x86_intercept_stage stage, 8302 struct x86_exception *exception) 8303 { 8304 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8305 unsigned long exit_qualification = 0; 8306 u32 vm_exit_reason; 8307 u64 exit_insn_len; 8308 8309 switch (info->intercept) { 8310 case x86_intercept_rdpid: 8311 /* 8312 * RDPID causes #UD if not enabled through secondary execution 8313 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to 8314 * TSC_AUX is NOT subject to interception, i.e. checking only 8315 * the dedicated execution control is architecturally correct. 8316 */ 8317 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8318 exception->vector = UD_VECTOR; 8319 exception->error_code_valid = false; 8320 return X86EMUL_PROPAGATE_FAULT; 8321 } 8322 return X86EMUL_CONTINUE; 8323 8324 case x86_intercept_in: 8325 case x86_intercept_ins: 8326 case x86_intercept_out: 8327 case x86_intercept_outs: 8328 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 8329 return X86EMUL_CONTINUE; 8330 8331 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 8332 break; 8333 8334 case x86_intercept_lgdt: 8335 case x86_intercept_lidt: 8336 case x86_intercept_lldt: 8337 case x86_intercept_ltr: 8338 case x86_intercept_sgdt: 8339 case x86_intercept_sidt: 8340 case x86_intercept_sldt: 8341 case x86_intercept_str: 8342 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8343 return X86EMUL_CONTINUE; 8344 8345 if (info->intercept == x86_intercept_lldt || 8346 info->intercept == x86_intercept_ltr || 8347 info->intercept == x86_intercept_sldt || 8348 info->intercept == x86_intercept_str) 8349 vm_exit_reason = EXIT_REASON_LDTR_TR; 8350 else 8351 vm_exit_reason = EXIT_REASON_GDTR_IDTR; 8352 /* 8353 * FIXME: Decode the ModR/M to generate the correct exit 8354 * qualification for memory operands. 8355 */ 8356 break; 8357 8358 case x86_intercept_hlt: 8359 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 8360 return X86EMUL_CONTINUE; 8361 8362 vm_exit_reason = EXIT_REASON_HLT; 8363 break; 8364 8365 case x86_intercept_pause: 8366 /* 8367 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8368 * with vanilla NOPs in the emulator. Apply the interception 8369 * check only to actual PAUSE instructions. Don't check 8370 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8371 * exit, i.e. KVM is within its rights to allow L2 to execute 8372 * the PAUSE. 
8373 */
8374 if ((info->rep_prefix != REPE_PREFIX) ||
8375 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
8376 return X86EMUL_CONTINUE;
8377
8378 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
8379 break;
8380
8381 /* TODO: check more intercepts... */
8382 default:
8383 return X86EMUL_UNHANDLEABLE;
8384 }
8385
8386 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
8387 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
8388 return X86EMUL_UNHANDLEABLE;
8389
8390 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
8391 exit_insn_len);
8392 return X86EMUL_INTERCEPTED;
8393 }
8394
8395 #ifdef CONFIG_X86_64
8396 /* Compute (a << shift) / divisor via a 128-bit divide; return 1 on overflow, otherwise 0 */
8397 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8398 u64 divisor, u64 *result)
8399 {
8400 u64 low = a << shift, high = a >> (64 - shift);
8401
8402 /* Bail to avoid a divide error (#DE) from an overflowing divq */
8403 if (high >= divisor)
8404 return 1;
8405
8406 /* Low holds the result, high holds the remainder, which is discarded */
8407 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8408 "rm" (divisor), "0" (low), "1" (high));
8409 *result = low;
8410
8411 return 0;
8412 }
8413
8414 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8415 bool *expired)
8416 {
8417 struct vcpu_vmx *vmx;
8418 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8419 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8420
8421 vmx = to_vmx(vcpu);
8422 tscl = rdtsc();
8423 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8424 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8425 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8426 ktimer->timer_advance_ns);
8427
8428 if (delta_tsc > lapic_timer_advance_cycles)
8429 delta_tsc -= lapic_timer_advance_cycles;
8430 else
8431 delta_tsc = 0;
8432
8433 /* Convert to a host delta TSC if TSC scaling is enabled */
8434 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8435 delta_tsc && u64_shl_div_u64(delta_tsc,
8436 kvm_caps.tsc_scaling_ratio_frac_bits,
8437 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8438 return -ERANGE;
8439
8440 /*
8441 * If the delta TSC doesn't fit in 32 bits after shifting by
8442 * cpu_preemption_timer_multi, the preemption timer can't be used.
8443 * It might fit on later VM-Entries, but checking on every VM-Entry
8444 * is costly, so just fall back to an hrtimer.
8445 */
8446 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8447 return -ERANGE;
8448
8449 vmx->hv_deadline_tsc = tscl + delta_tsc;
8450 *expired = !delta_tsc;
8451 return 0;
8452 }
8453
8454 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8455 {
8456 to_vmx(vcpu)->hv_deadline_tsc = -1;
8457 }
8458 #endif
8459
8460 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8461 {
8462 struct vcpu_vmx *vmx = to_vmx(vcpu);
8463
8464 if (WARN_ON_ONCE(!enable_pml))
8465 return;
8466
8467 guard(vmx_vmcs01)(vcpu);
8468
8469 /*
8470 * Note, nr_memslots_dirty_logging can be changed concurrently with
8471 * this code, but in that case another update request will be made and
8472 * so the guest will never run with a stale PML value.
8473 */ 8474 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8475 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8476 else 8477 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8478 } 8479 8480 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8481 { 8482 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8483 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8484 FEAT_CTL_LMCE_ENABLED; 8485 else 8486 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8487 ~FEAT_CTL_LMCE_ENABLED; 8488 } 8489 8490 #ifdef CONFIG_KVM_SMM 8491 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8492 { 8493 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8494 if (to_vmx(vcpu)->nested.nested_run_pending) 8495 return -EBUSY; 8496 return !is_smm(vcpu); 8497 } 8498 8499 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8500 { 8501 struct vcpu_vmx *vmx = to_vmx(vcpu); 8502 8503 /* 8504 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8505 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8506 * SMI and RSM only modify state that is saved and restored via SMRAM. 8507 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8508 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 8509 */ 8510 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8511 if (vmx->nested.smm.guest_mode) 8512 nested_vmx_vmexit(vcpu, -1, 0, 0); 8513 8514 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8515 vmx->nested.vmxon = false; 8516 vmx_clear_hlt(vcpu); 8517 return 0; 8518 } 8519 8520 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8521 { 8522 struct vcpu_vmx *vmx = to_vmx(vcpu); 8523 int ret; 8524 8525 if (vmx->nested.smm.vmxon) { 8526 vmx->nested.vmxon = true; 8527 vmx->nested.smm.vmxon = false; 8528 } 8529 8530 if (vmx->nested.smm.guest_mode) { 8531 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8532 if (ret) 8533 return ret; 8534 8535 vmx->nested.nested_run_pending = 1; 8536 vmx->nested.smm.guest_mode = false; 8537 } 8538 return 0; 8539 } 8540 8541 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8542 { 8543 /* RSM will cause a vmexit anyway. */ 8544 } 8545 #endif 8546 8547 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8548 { 8549 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8550 } 8551 8552 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8553 { 8554 if (is_guest_mode(vcpu)) { 8555 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8556 8557 if (hrtimer_try_to_cancel(timer) == 1) 8558 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8559 } 8560 } 8561 8562 void vmx_hardware_unsetup(void) 8563 { 8564 kvm_set_posted_intr_wakeup_handler(NULL); 8565 8566 if (nested) 8567 nested_vmx_hardware_unsetup(); 8568 8569 free_kvm_area(); 8570 } 8571 8572 void vmx_vm_destroy(struct kvm *kvm) 8573 { 8574 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8575 8576 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8577 } 8578 8579 /* 8580 * Note, the SDM states that the linear address is masked *after* the modified 8581 * canonicality check, whereas KVM masks (untags) the address and then performs 8582 * a "normal" canonicality check. Functionally, the two methods are identical, 8583 * and when the masking occurs relative to the canonicality check isn't visible 8584 * to software, i.e. KVM's behavior doesn't violate the SDM. 
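 *
 * E.g. with LAM_U57 active, a user pointer carrying metadata in bits
 * 62:57, such as 0x7e00000000001234, is sign-extended from bit 56 and
 * becomes 0x0000000000001234 before the canonicality check.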
8585 */ 8586 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8587 { 8588 int lam_bit; 8589 unsigned long cr3_bits; 8590 8591 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8592 return gva; 8593 8594 if (!is_64_bit_mode(vcpu)) 8595 return gva; 8596 8597 /* 8598 * Bit 63 determines if the address should be treated as user address 8599 * or a supervisor address. 8600 */ 8601 if (!(gva & BIT_ULL(63))) { 8602 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8603 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8604 return gva; 8605 8606 /* LAM_U48 is ignored if LAM_U57 is set. */ 8607 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8608 } else { 8609 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8610 return gva; 8611 8612 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8613 } 8614 8615 /* 8616 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8617 * Bit 63 is retained from the raw virtual address so that untagging 8618 * doesn't change a user access to a supervisor access, and vice versa. 8619 */ 8620 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8621 } 8622 8623 static unsigned int vmx_handle_intel_pt_intr(void) 8624 { 8625 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8626 8627 /* '0' on failure so that the !PT case can use a RET0 static call. */ 8628 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8629 return 0; 8630 8631 kvm_make_request(KVM_REQ_PMI, vcpu); 8632 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8633 (unsigned long *)&vcpu->arch.pmu.global_status); 8634 return 1; 8635 } 8636 8637 static __init void vmx_setup_user_return_msrs(void) 8638 { 8639 8640 /* 8641 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8642 * will emulate SYSCALL in legacy mode if the vendor string in guest 8643 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8644 * support this emulation, MSR_STAR is included in the list for i386, 8645 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8646 * into hardware and is here purely for emulation purposes. 8647 */ 8648 const u32 vmx_uret_msrs_list[] = { 8649 #ifdef CONFIG_X86_64 8650 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8651 #endif 8652 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8653 MSR_IA32_TSX_CTRL, 8654 }; 8655 int i; 8656 8657 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8658 8659 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8660 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8661 } 8662 8663 static void __init vmx_setup_me_spte_mask(void) 8664 { 8665 u64 me_mask = 0; 8666 8667 /* 8668 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8669 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8670 * boot_cpu_data.x86_phys_bits holds the actual physical address 8671 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8672 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8673 */ 8674 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8675 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8676 kvm_host.maxphyaddr - 1); 8677 8678 /* 8679 * Unlike SME, host kernel doesn't support setting up any 8680 * MKTME KeyID on Intel platforms. No memory encryption 8681 * bits should be included into the SPTE. 
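 * Hence the '0' me_value passed below: no encryption bits are ever set
 * in SPTEs, and the KeyID bits are handed over only as the mask used
 * for the shadow_zero_check.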
8682 */ 8683 kvm_mmu_set_me_spte_mask(0, me_mask); 8684 } 8685 8686 __init int vmx_hardware_setup(void) 8687 { 8688 unsigned long host_bndcfgs; 8689 struct desc_ptr dt; 8690 int r; 8691 8692 store_idt(&dt); 8693 host_idt_base = dt.address; 8694 8695 vmx_setup_user_return_msrs(); 8696 8697 8698 if (boot_cpu_has(X86_FEATURE_NX)) 8699 kvm_enable_efer_bits(EFER_NX); 8700 8701 if (boot_cpu_has(X86_FEATURE_MPX)) { 8702 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); 8703 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8704 } 8705 8706 if (!cpu_has_vmx_mpx()) 8707 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8708 XFEATURE_MASK_BNDCSR); 8709 8710 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8711 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8712 enable_vpid = 0; 8713 8714 if (!cpu_has_vmx_ept() || 8715 !cpu_has_vmx_ept_4levels() || 8716 !cpu_has_vmx_ept_mt_wb() || 8717 !cpu_has_vmx_invept_global()) 8718 enable_ept = 0; 8719 8720 /* NX support is required for shadow paging. */ 8721 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8722 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8723 return -EOPNOTSUPP; 8724 } 8725 8726 /* 8727 * Shadow paging doesn't have a (further) performance penalty 8728 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8729 * by default 8730 */ 8731 if (!enable_ept) 8732 allow_smaller_maxphyaddr = true; 8733 8734 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8735 enable_ept_ad_bits = 0; 8736 8737 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8738 enable_unrestricted_guest = 0; 8739 8740 if (!cpu_has_vmx_flexpriority()) 8741 flexpriority_enabled = 0; 8742 8743 if (!cpu_has_virtual_nmis()) 8744 enable_vnmi = 0; 8745 8746 #ifdef CONFIG_X86_SGX_KVM 8747 if (!cpu_has_vmx_encls_vmexit()) 8748 enable_sgx = false; 8749 #endif 8750 8751 /* 8752 * set_apic_access_page_addr() is used to reload apic access 8753 * page upon invalidation. No need to do anything if not 8754 * using the APIC_ACCESS_ADDR VMCS field. 8755 */ 8756 if (!flexpriority_enabled) 8757 vt_x86_ops.set_apic_access_page_addr = NULL; 8758 8759 if (!cpu_has_vmx_tpr_shadow()) 8760 vt_x86_ops.update_cr8_intercept = NULL; 8761 8762 #if IS_ENABLED(CONFIG_HYPERV) 8763 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8764 && enable_ept) { 8765 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8766 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8767 } 8768 #endif 8769 8770 if (!cpu_has_vmx_ple()) { 8771 ple_gap = 0; 8772 ple_window = 0; 8773 ple_window_grow = 0; 8774 ple_window_max = 0; 8775 ple_window_shrink = 0; 8776 } 8777 8778 if (!cpu_has_vmx_apicv()) 8779 enable_apicv = 0; 8780 if (!enable_apicv) 8781 vt_x86_ops.sync_pir_to_irr = NULL; 8782 8783 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8784 enable_ipiv = false; 8785 8786 if (cpu_has_vmx_tsc_scaling()) 8787 kvm_caps.has_tsc_control = true; 8788 8789 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8790 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8791 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8792 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8793 8794 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8795 8796 if (enable_ept) 8797 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8798 cpu_has_vmx_ept_execute_only()); 8799 else 8800 vt_x86_ops.get_mt_mask = NULL; 8801 8802 /* 8803 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8804 * bits to shadow_zero_check. 
8805 */ 8806 vmx_setup_me_spte_mask(); 8807 8808 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8809 ept_caps_to_lpage_level(vmx_capability.ept)); 8810 8811 /* 8812 * Only enable PML when hardware supports PML feature, and both EPT 8813 * and EPT A/D bit features are enabled -- PML depends on them to work. 8814 */ 8815 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8816 enable_pml = 0; 8817 8818 if (!cpu_has_vmx_preemption_timer()) 8819 enable_preemption_timer = false; 8820 8821 if (enable_preemption_timer) { 8822 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8823 8824 cpu_preemption_timer_multi = 8825 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8826 8827 if (tsc_khz) 8828 use_timer_freq = (u64)tsc_khz * 1000; 8829 use_timer_freq >>= cpu_preemption_timer_multi; 8830 8831 /* 8832 * KVM "disables" the preemption timer by setting it to its max 8833 * value. Don't use the timer if it might cause spurious exits 8834 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8835 */ 8836 if (use_timer_freq > 0xffffffffu / 10) 8837 enable_preemption_timer = false; 8838 } 8839 8840 if (!enable_preemption_timer) { 8841 vt_x86_ops.set_hv_timer = NULL; 8842 vt_x86_ops.cancel_hv_timer = NULL; 8843 } 8844 8845 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8846 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8847 8848 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8849 return -EINVAL; 8850 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8851 pt_mode = PT_MODE_SYSTEM; 8852 if (pt_mode == PT_MODE_HOST_GUEST) 8853 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8854 else 8855 vt_init_ops.handle_intel_pt_intr = NULL; 8856 8857 setup_default_sgx_lepubkeyhash(); 8858 8859 vmx_set_cpu_caps(); 8860 8861 /* 8862 * Configure nested capabilities after core CPU capabilities so that 8863 * nested support can be conditional on base support, e.g. so that KVM 8864 * can hide/show features based on kvm_cpu_cap_has(). 8865 */ 8866 if (nested) { 8867 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8868 if (r) 8869 return r; 8870 } 8871 8872 r = alloc_kvm_area(); 8873 if (r) 8874 goto err_kvm_area; 8875 8876 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8877 8878 /* 8879 * On Intel CPUs that lack self-snoop feature, letting the guest control 8880 * memory types may result in unexpected behavior. So always ignore guest 8881 * PAT on those CPUs and map VM as writeback, not allowing userspace to 8882 * disable the quirk. 8883 * 8884 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is 8885 * supported, UC is slow enough to cause issues with some older guests (e.g. 8886 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to 8887 * map the video RAM, causing wayland desktop to fail to get started 8888 * correctly). To avoid breaking those older guests that rely on KVM to force 8889 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the 8890 * safer (for performance) default behavior. 8891 * 8892 * On top of this, non-coherent DMA devices need the guest to flush CPU 8893 * caches properly. This also requires honoring guest PAT, and is forced 8894 * independent of the quirk in vmx_ignore_guest_pat(). 
8895 */ 8896 if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) 8897 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8898 8899 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8900 8901 return 0; 8902 8903 err_kvm_area: 8904 if (nested) 8905 nested_vmx_hardware_unsetup(); 8906 return r; 8907 } 8908 8909 void vmx_exit(void) 8910 { 8911 allow_smaller_maxphyaddr = false; 8912 8913 vmx_cleanup_l1d_flush(); 8914 8915 kvm_x86_vendor_exit(); 8916 } 8917 8918 int __init vmx_init(void) 8919 { 8920 int r, cpu; 8921 8922 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); 8923 8924 if (!kvm_is_vmx_supported()) 8925 return -EOPNOTSUPP; 8926 8927 /* 8928 * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, 8929 * i.e. there's nothing to unwind if a later step fails. 8930 */ 8931 hv_init_evmcs(); 8932 8933 /* 8934 * Parse the VMCS config and VMX capabilities before anything else, so 8935 * that the information is available to all setup flows. 8936 */ 8937 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8938 return -EIO; 8939 8940 r = kvm_x86_vendor_init(&vt_init_ops); 8941 if (r) 8942 return r; 8943 8944 /* Must be called after common x86 init so enable_ept is setup. */ 8945 r = vmx_setup_l1d_flush(); 8946 if (r) 8947 goto err_l1d_flush; 8948 8949 for_each_possible_cpu(cpu) { 8950 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8951 8952 pi_init_cpu(cpu); 8953 } 8954 8955 vmx_check_vmcs12_offsets(); 8956 8957 return 0; 8958 8959 err_l1d_flush: 8960 kvm_x86_vendor_exit(); 8961 return r; 8962 } 8963