1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 32 #include <asm/apic.h> 33 #include <asm/asm.h> 34 #include <asm/cpu.h> 35 #include <asm/cpu_device_id.h> 36 #include <asm/debugreg.h> 37 #include <asm/desc.h> 38 #include <asm/fpu/api.h> 39 #include <asm/fpu/xstate.h> 40 #include <asm/fred.h> 41 #include <asm/idtentry.h> 42 #include <asm/io.h> 43 #include <asm/irq_remapping.h> 44 #include <asm/reboot.h> 45 #include <asm/perf_event.h> 46 #include <asm/mmu_context.h> 47 #include <asm/mshyperv.h> 48 #include <asm/msr.h> 49 #include <asm/mwait.h> 50 #include <asm/spec-ctrl.h> 51 #include <asm/virt.h> 52 #include <asm/vmx.h> 53 54 #include <trace/events/ipi.h> 55 56 #include "capabilities.h" 57 #include "common.h" 58 #include "cpuid.h" 59 #include "hyperv.h" 60 #include "kvm_onhyperv.h" 61 #include "irq.h" 62 #include "kvm_cache_regs.h" 63 #include "lapic.h" 64 #include "mmu.h" 65 #include "nested.h" 66 #include "pmu.h" 67 #include "sgx.h" 68 #include "trace.h" 69 #include "vmcs.h" 70 #include "vmcs12.h" 71 #include "vmx.h" 72 #include "x86.h" 73 #include "x86_ops.h" 74 #include "smm.h" 75 #include "vmx_onhyperv.h" 76 #include 
"posted_intr.h" 77 78 #include "mmu/spte.h" 79 80 MODULE_AUTHOR("Qumranet"); 81 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); 82 MODULE_LICENSE("GPL"); 83 84 #ifdef MODULE 85 static const struct x86_cpu_id vmx_cpu_id[] = { 86 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 87 {} 88 }; 89 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 90 #endif 91 92 bool __read_mostly enable_vpid = 1; 93 module_param_named(vpid, enable_vpid, bool, 0444); 94 95 static bool __read_mostly enable_vnmi = 1; 96 module_param_named(vnmi, enable_vnmi, bool, 0444); 97 98 bool __read_mostly flexpriority_enabled = 1; 99 module_param_named(flexpriority, flexpriority_enabled, bool, 0444); 100 101 bool __read_mostly enable_ept = 1; 102 module_param_named(ept, enable_ept, bool, 0444); 103 104 bool __read_mostly enable_unrestricted_guest = 1; 105 module_param_named(unrestricted_guest, 106 enable_unrestricted_guest, bool, 0444); 107 108 bool __read_mostly enable_ept_ad_bits = 1; 109 module_param_named(eptad, enable_ept_ad_bits, bool, 0444); 110 111 static bool __read_mostly emulate_invalid_guest_state = true; 112 module_param(emulate_invalid_guest_state, bool, 0444); 113 114 static bool __read_mostly fasteoi = 1; 115 module_param(fasteoi, bool, 0444); 116 117 module_param(enable_apicv, bool, 0444); 118 module_param(enable_ipiv, bool, 0444); 119 120 module_param(enable_device_posted_irqs, bool, 0444); 121 122 /* 123 * If nested=1, nested virtualization is supported, i.e., guests may use 124 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 125 * use VMX instructions. 
126 */ 127 static bool __read_mostly nested = 1; 128 module_param(nested, bool, 0444); 129 130 bool __read_mostly enable_pml = 1; 131 module_param_named(pml, enable_pml, bool, 0444); 132 133 static bool __read_mostly error_on_inconsistent_vmcs_config = true; 134 module_param(error_on_inconsistent_vmcs_config, bool, 0444); 135 136 static bool __read_mostly dump_invalid_vmcs = 0; 137 module_param(dump_invalid_vmcs, bool, 0644); 138 139 #define MSR_BITMAP_MODE_X2APIC 1 140 #define MSR_BITMAP_MODE_X2APIC_APICV 2 141 142 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 143 144 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ 145 static int __read_mostly cpu_preemption_timer_multi; 146 static bool __read_mostly enable_preemption_timer = 1; 147 #ifdef CONFIG_X86_64 148 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 149 #endif 150 151 extern bool __read_mostly allow_smaller_maxphyaddr; 152 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 153 154 module_param(enable_mediated_pmu, bool, 0444); 155 156 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 157 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 158 #define KVM_VM_CR0_ALWAYS_ON \ 159 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 160 161 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 162 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 163 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 164 165 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 166 167 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 168 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 169 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 170 RTIT_STATUS_BYTECNT)) 171 172 /* 173 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 174 * ple_gap: upper bound on the amount of time between two successive 175 * executions of PAUSE in a loop. 
Also indicate if ple enabled. 176 * According to test, this time is usually smaller than 128 cycles. 177 * ple_window: upper bound on the amount of time a guest is allowed to execute 178 * in a PAUSE loop. Tests indicate that most spinlocks are held for 179 * less than 2^12 cycles 180 * Time is measured based on a counter that runs at the same rate as the TSC, 181 * refer SDM volume 3b section 21.6.13 & 22.1.3. 182 */ 183 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 184 module_param(ple_gap, uint, 0444); 185 186 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 187 module_param(ple_window, uint, 0444); 188 189 /* Default doubles per-vcpu window every exit. */ 190 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 191 module_param(ple_window_grow, uint, 0444); 192 193 /* Default resets per-vcpu window every exit to ple_window. */ 194 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 195 module_param(ple_window_shrink, uint, 0444); 196 197 /* Default is to compute the maximum so we can never overflow. 
*/ 198 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 199 module_param(ple_window_max, uint, 0444); 200 201 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 202 int __read_mostly pt_mode = PT_MODE_SYSTEM; 203 #ifdef CONFIG_BROKEN 204 module_param(pt_mode, int, S_IRUGO); 205 #endif 206 207 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 208 209 #ifdef CONFIG_CPU_MITIGATIONS 210 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 211 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 212 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 213 214 /* Storage for pre module init parameter parsing */ 215 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 216 217 static const struct { 218 const char *option; 219 bool for_parse; 220 } vmentry_l1d_param[] = { 221 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 222 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 223 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 224 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 225 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 226 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 227 }; 228 229 #define L1D_CACHE_ORDER 4 230 static void *vmx_l1d_flush_pages; 231 232 static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 233 { 234 struct page *page; 235 unsigned int i; 236 237 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 238 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 239 return 0; 240 } 241 242 if (!enable_ept) { 243 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 244 return 0; 245 } 246 247 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 248 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 249 return 0; 250 } 251 252 /* If set to auto use the default l1tf mitigation method */ 253 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 254 switch (l1tf_mitigation) { 255 case L1TF_MITIGATION_OFF: 256 l1tf = VMENTER_L1D_FLUSH_NEVER; 257 break; 258 case L1TF_MITIGATION_AUTO: 
259 case L1TF_MITIGATION_FLUSH_NOWARN: 260 case L1TF_MITIGATION_FLUSH: 261 case L1TF_MITIGATION_FLUSH_NOSMT: 262 l1tf = VMENTER_L1D_FLUSH_COND; 263 break; 264 case L1TF_MITIGATION_FULL: 265 case L1TF_MITIGATION_FULL_FORCE: 266 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 267 break; 268 } 269 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 270 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 271 } 272 273 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 274 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 275 /* 276 * This allocation for vmx_l1d_flush_pages is not tied to a VM 277 * lifetime and so should not be charged to a memcg. 278 */ 279 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 280 if (!page) 281 return -ENOMEM; 282 vmx_l1d_flush_pages = page_address(page); 283 284 /* 285 * Initialize each page with a different pattern in 286 * order to protect against KSM in the nested 287 * virtualization case. 288 */ 289 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 290 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 291 PAGE_SIZE); 292 } 293 } 294 295 l1tf_vmx_mitigation = l1tf; 296 297 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 298 static_branch_enable(&vmx_l1d_should_flush); 299 else 300 static_branch_disable(&vmx_l1d_should_flush); 301 302 if (l1tf == VMENTER_L1D_FLUSH_COND) 303 static_branch_enable(&vmx_l1d_flush_cond); 304 else 305 static_branch_disable(&vmx_l1d_flush_cond); 306 return 0; 307 } 308 309 static int vmx_setup_l1d_flush(void) 310 { 311 /* 312 * Hand the parameter mitigation value in which was stored in the pre 313 * module init parser. If no parameter was given, it will contain 314 * 'auto' which will be turned into the default 'cond' mitigation mode. 
	 */
	return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
}

/*
 * Undo vmx_setup_l1d_flush(): free the software-flush pages (if any) and
 * reset l1tf_vmx_mitigation to AUTO so the sysfs vulnerability reporting
 * treats VMX as not loaded.
 */
static void vmx_cleanup_l1d_flush(void)
{
	if (vmx_l1d_flush_pages) {
		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
		vmx_l1d_flush_pages = NULL;
	}
	/* Restore state so sysfs ignores VMX */
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}

/*
 * Map the module parameter string onto a vmx_l1d_flush_state index.
 * Only entries marked for_parse (auto/never/cond/always) are accepted;
 * returns the enum index, or -EINVAL for NULL/unrecognized input.
 */
static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

/*
 * 'set' handler for the vmentry_l1d_flush module parameter.  Accepts the
 * new mode both before module init (value is stashed for vmx_init()) and
 * at runtime via sysfs (reconfigures the static branches under the mutex).
 */
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	/* Nothing to reconfigure if the CPU isn't affected by L1TF. */
	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	/* Serialize concurrent sysfs writers reconfiguring the flush mode. */
	mutex_lock(&vmx_l1d_flush_mutex);
	ret = __vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

/*
 * 'get' handler for the vmentry_l1d_flush module parameter: report the
 * currently active mitigation state as its human-readable option string.
 */
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sysfs_emit(s, "???\n");

	return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/* Fast path out when the mitigation mode is 'never'. */
	if (!static_branch_unlikely(&vmx_l1d_should_flush))
		return;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		/*
		 * Clear the per-cpu flush bit, it gets set again if the vCPU
		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
		 * exits to userspace, or if KVM reaches one of the unsafe
		 * VMEXIT handlers, e.g. if KVM calls into the emulator,
		 * or from the interrupt handlers.
		 */
		if (!kvm_get_cpu_l1tf_flush_l1d())
			return;
		kvm_clear_cpu_l1tf_flush_l1d();
	}

	vcpu->stat.l1d_flush++;

	/* Prefer the microcode-provided MSR flush when available. */
	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lpopulate_tlb\n\t"
		/* CPUID between the two passes; presumably acts as a
		 * serializing barrier — matches upstream usage, confirm. */
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}

#else /* CONFIG_CPU_MITIGATIONS*/
static int vmx_setup_l1d_flush(void)
{
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; 447 return 0; 448 } 449 static void vmx_cleanup_l1d_flush(void) 450 { 451 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 452 } 453 static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) 454 { 455 456 } 457 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 458 { 459 pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); 460 return 0; 461 } 462 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 463 { 464 return sysfs_emit(s, "never\n"); 465 } 466 #endif 467 468 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 469 .set = vmentry_l1d_flush_set, 470 .get = vmentry_l1d_flush_get, 471 }; 472 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 473 474 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 475 { 476 u64 msr; 477 478 if (!vmx->disable_fb_clear) 479 return; 480 481 msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); 482 msr |= FB_CLEAR_DIS; 483 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); 484 /* Cache the MSR value to avoid reading it later */ 485 vmx->msr_ia32_mcu_opt_ctrl = msr; 486 } 487 488 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 489 { 490 if (!vmx->disable_fb_clear) 491 return; 492 493 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 494 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 495 } 496 497 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 498 { 499 /* 500 * Disable VERW's behavior of clearing CPU buffers for the guest if the 501 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 502 * the mitigation. Disabling the clearing behavior provides a 503 * performance boost for guests that aren't aware that manually clearing 504 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 505 * and VM-Exit. 
506 */ 507 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 508 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 509 !boot_cpu_has_bug(X86_BUG_MDS) && 510 !boot_cpu_has_bug(X86_BUG_TAA); 511 512 /* 513 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 514 * at VMEntry. Skip the MSR read/write when a guest has no use case to 515 * execute VERW. 516 */ 517 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 518 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 519 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 520 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 521 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 522 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 523 vmx->disable_fb_clear = false; 524 } 525 526 static u32 vmx_segment_access_rights(struct kvm_segment *var); 527 528 void vmx_vmexit(void); 529 530 #define vmx_insn_failed(fmt...) \ 531 do { \ 532 WARN_ONCE(1, fmt); \ 533 pr_warn_ratelimited(fmt); \ 534 } while (0) 535 536 noinline void vmread_error(unsigned long field) 537 { 538 vmx_insn_failed("vmread failed: field=%lx\n", field); 539 } 540 541 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 542 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 543 { 544 if (fault) { 545 kvm_spurious_fault(); 546 } else { 547 instrumentation_begin(); 548 vmread_error(field); 549 instrumentation_end(); 550 } 551 } 552 #endif 553 554 noinline void vmwrite_error(unsigned long field, unsigned long value) 555 { 556 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 557 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 558 } 559 560 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 561 { 562 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 563 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 564 } 565 566 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 567 { 568 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 569 vmcs, phys_addr, 
vmcs_read32(VM_INSTRUCTION_ERROR)); 570 } 571 572 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 573 { 574 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 575 ext, vpid, gva); 576 } 577 578 noinline void invept_error(unsigned long ext, u64 eptp) 579 { 580 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 581 } 582 583 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 584 /* 585 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 586 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 587 */ 588 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 589 590 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 591 static DEFINE_SPINLOCK(vmx_vpid_lock); 592 593 struct vmcs_config vmcs_config __ro_after_init; 594 struct vmx_capability vmx_capability __ro_after_init; 595 596 #define VMX_SEGMENT_FIELD(seg) \ 597 [VCPU_SREG_##seg] = { \ 598 .selector = GUEST_##seg##_SELECTOR, \ 599 .base = GUEST_##seg##_BASE, \ 600 .limit = GUEST_##seg##_LIMIT, \ 601 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 602 } 603 604 static const struct kvm_vmx_segment_field { 605 unsigned selector; 606 unsigned base; 607 unsigned limit; 608 unsigned ar_bytes; 609 } kvm_vmx_segment_fields[] = { 610 VMX_SEGMENT_FIELD(CS), 611 VMX_SEGMENT_FIELD(DS), 612 VMX_SEGMENT_FIELD(ES), 613 VMX_SEGMENT_FIELD(FS), 614 VMX_SEGMENT_FIELD(GS), 615 VMX_SEGMENT_FIELD(SS), 616 VMX_SEGMENT_FIELD(TR), 617 VMX_SEGMENT_FIELD(LDTR), 618 }; 619 620 621 static unsigned long host_idt_base; 622 623 #if IS_ENABLED(CONFIG_HYPERV) 624 static bool __read_mostly enlightened_vmcs = true; 625 module_param(enlightened_vmcs, bool, 0444); 626 627 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 628 { 629 struct hv_enlightened_vmcs *evmcs; 630 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 631 632 if (partition_assist_page == INVALID_PAGE) 633 return -ENOMEM; 634 635 evmcs = (struct hv_enlightened_vmcs 
	       *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page = partition_assist_page;
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

/*
 * Probe for Hyper-V Enlightened VMCS support at module init.  Enables the
 * eVMCS static key and, when direct flush is offered, wires up the L2 TLB
 * flush hook; otherwise clears enlightened_vmcs so the rest of the driver
 * uses plain VMCS accesses.
 */
static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above.
	 */
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	     KVM_EVMCS_VERSION) {

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("Using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&__kvm_is_using_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vt_x86_ops.enable_l2_tlb_flush
				= hv_enable_l2_tlb_flush;
	} else {
		enlightened_vmcs = false;
	}
}

/*
 * Tear down eVMCS state on this CPU so later (non-enlightened) VMCS use
 * starts from a clean VP assist page.  No-op if eVMCS was never enabled.
 */
static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *vp_ap;

	if (!kvm_is_using_evmcs())
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled the CPU
	 * doesn't have a VP assist page allocated.
	 */
	vp_ap = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!vp_ap))
		return;

	/*
	 * Reset everything to support using non-enlightened VMCS access later
	 * (e.g.
when we reload the module with enlightened_vmcs=0) 699 */ 700 vp_ap->nested_control.features.directhypercall = 0; 701 vp_ap->current_nested_vmcs = 0; 702 vp_ap->enlighten_vmentry = 0; 703 } 704 705 #else /* IS_ENABLED(CONFIG_HYPERV) */ 706 static void hv_init_evmcs(void) {} 707 static void hv_reset_evmcs(void) {} 708 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 709 710 /* 711 * Comment's format: document - errata name - stepping - processor name. 712 * Refer from 713 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 714 */ 715 static u32 vmx_preemption_cpu_tfms[] = { 716 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 717 0x000206E6, 718 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 719 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 720 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 721 0x00020652, 722 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 723 0x00020655, 724 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 725 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 726 /* 727 * 320767.pdf - AAP86 - B1 - 728 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 729 */ 730 0x000106E5, 731 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 732 0x000106A0, 733 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 734 0x000106A1, 735 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 736 0x000106A4, 737 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 738 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 739 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 740 0x000106A5, 741 /* Xeon E3-1220 V2 */ 742 0x000306A8, 743 }; 744 745 static inline bool cpu_has_broken_vmx_preemption_timer(void) 746 { 747 u32 eax = cpuid_eax(0x00000001), i; 748 749 /* Clear the reserved bits */ 750 eax &= ~(0x3U << 14 | 0xfU << 28); 751 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 752 if (eax == vmx_preemption_cpu_tfms[i]) 753 return true; 754 755 return false; 756 } 757 758 
/* APIC-access virtualization requires flexpriority and an in-kernel LAPIC. */
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

/*
 * Look up the guest user-return MSR slot for @msr.  Returns a pointer into
 * vmx->guest_uret_msrs, or NULL if @msr is not a user-return MSR.
 */
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}

/*
 * Update the cached guest value of a user-return MSR, writing it to hardware
 * first when the slot is loaded into hardware.  The cache is only updated if
 * the hardware write (when required) succeeds.  Returns 0 on success.
 */
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		/*
		 * Disable preemption so the MSR write and the per-CPU
		 * user-return state stay on the same CPU.
		 */
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}

/*
 * Emergency (reboot/crash) path: VMCLEAR every VMCS loaded on this CPU,
 * including shadow VMCSes, without taking locks or touching other state.
 */
void vmx_emergency_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link) {
		vmcs_clear(v->vmcs);
		if (v->shadow_vmcs)
			vmcs_clear(v->shadow_vmcs);
	}
}

/*
 * IPI callback: clear @arg (a struct loaded_vmcs) on the CPU it is loaded
 * on, detaching it from that CPU's loaded-VMCS list.  Bails out if the VMCS
 * has already migrated to a different CPU.
 */
static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
	 * and add loaded_vmcs to its percpu list before it's deleted from this
	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

/*
 * Clear @loaded_vmcs on whatever CPU it is currently loaded on, using a
 * synchronous cross-CPU call.  A cpu of -1 means it isn't loaded anywhere.
 */
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
					 __loaded_vmcs_clear, loaded_vmcs, 1);
}

/*
 * Test-and-set a bit in the segment cache validity bitmask.  Returns true
 * if the (seg, field) entry was already cached, false if the caller must
 * (re)read it from the VMCS.  Invalidates the whole cache first if the
 * SEGMENTS register group is stale.
 */
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

/* Cached read of a guest segment selector (VMCS read on cache miss). */
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

/* Cached read of a guest segment base (VMCS read on cache miss). */
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

/* Cached read of a guest segment limit (VMCS read on cache miss). */
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

/* Cached read of guest segment access rights (VMCS read on cache miss). */
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

/*
 * Recompute and write the vCPU's exception bitmap, and for L1 also the #PF
 * error-code mask/match fields.
 */
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u <<
PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 896 (1u << DB_VECTOR) | (1u << AC_VECTOR); 897 /* 898 * #VE isn't used for VMX. To test against unexpected changes 899 * related to #VE for VMX, intercept unexpected #VE and warn on it. 900 */ 901 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 902 eb |= 1u << VE_VECTOR; 903 /* 904 * Guest access to VMware backdoor ports could legitimately 905 * trigger #GP because of TSS I/O permission bitmap. 906 * We intercept those #GP and allow access to them anyway 907 * as VMware does. 908 */ 909 if (enable_vmware_backdoor) 910 eb |= (1u << GP_VECTOR); 911 if ((vcpu->guest_debug & 912 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 913 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 914 eb |= 1u << BP_VECTOR; 915 if (to_vmx(vcpu)->rmode.vm86_active) 916 eb = ~0; 917 if (!vmx_need_pf_intercept(vcpu)) 918 eb &= ~(1u << PF_VECTOR); 919 920 /* When we are running a nested L2 guest and L1 specified for it a 921 * certain exception bitmap, we must trap the same exceptions and pass 922 * them to L1. When running L2, we will only handle the exceptions 923 * specified above if L1 did not want them. 924 */ 925 if (is_guest_mode(vcpu)) 926 eb |= get_vmcs12(vcpu)->exception_bitmap; 927 else { 928 int mask = 0, match = 0; 929 930 if (enable_ept && (eb & (1u << PF_VECTOR))) { 931 /* 932 * If EPT is enabled, #PF is currently only intercepted 933 * if MAXPHYADDR is smaller on the guest than on the 934 * host. In that case we only care about present, 935 * non-reserved faults. For vmcs02, however, PFEC_MASK 936 * and PFEC_MATCH are set in prepare_vmcs02_rare. 937 */ 938 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 939 match = PFERR_PRESENT_MASK; 940 } 941 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 942 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 943 } 944 945 /* 946 * Disabling xfd interception indicates that dynamic xfeatures 947 * might be used in the guest. Always trap #NM in this case 948 * to save guest xfd_err timely. 
949 */ 950 if (vcpu->arch.xfd_no_write_intercept) 951 eb |= (1u << NM_VECTOR); 952 953 vmcs_write32(EXCEPTION_BITMAP, eb); 954 } 955 956 /* 957 * Check if MSR is intercepted for currently loaded MSR bitmap. 958 */ 959 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 960 { 961 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 962 return true; 963 964 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 965 } 966 967 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 968 { 969 unsigned int flags = 0; 970 971 if (vmx->loaded_vmcs->launched) 972 flags |= VMX_RUN_VMRESUME; 973 974 /* 975 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 976 * to change it directly without causing a vmexit. In that case read 977 * it after vmexit and store it in vmx->spec_ctrl. 978 */ 979 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 980 flags |= VMX_RUN_SAVE_SPEC_CTRL; 981 982 if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 983 kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) 984 flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; 985 986 return flags; 987 } 988 989 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 990 unsigned long entry, unsigned long exit) 991 { 992 vm_entry_controls_clearbit(vmx, entry); 993 vm_exit_controls_clearbit(vmx, exit); 994 } 995 996 static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 997 { 998 unsigned int i; 999 1000 for (i = 0; i < m->nr; ++i) { 1001 if (m->val[i].index == msr) 1002 return i; 1003 } 1004 return -ENOENT; 1005 } 1006 1007 static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, 1008 unsigned long vmcs_count_field) 1009 { 1010 int i; 1011 1012 i = vmx_find_loadstore_msr_slot(m, msr); 1013 if (i < 0) 1014 return; 1015 1016 --m->nr; 1017 m->val[i] = m->val[m->nr]; 1018 vmcs_write32(vmcs_count_field, m->nr); 1019 } 1020 1021 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1022 { 1023 struct 
	       msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		/* Prefer the dedicated VM-entry/exit EFER controls. */
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		/* Likewise for the PERF_GLOBAL_CTRL controls. */
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}

	/* Fall back to dropping the MSR from the autoload lists. */
	vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT);
	vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT);
}

/*
 * Program an MSR switch via dedicated VMCS controls: write the guest (and,
 * except for EFER, the host) value fields and set the corresponding
 * VM-entry/VM-exit control bits.  The host EFER field is skipped here;
 * presumably it is kept up to date elsewhere — confirm against the VMCS
 * host-state setup.
 */
static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

/*
 * Add (or update) an entry in an MSR load/store list.  A new slot is
 * appended and the VMCS count field updated; overflowing the list is a
 * KVM bug and marks the VM as bugged.
 */
static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value,
			     unsigned long vmcs_count_field, struct kvm *kvm)
{
	int i;

	i = vmx_find_loadstore_msr_slot(m, msr);
	if (i < 0) {
		if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm))
			return;

		i = m->nr++;
		m->val[i].index = msr;
		vmcs_write32(vmcs_count_field, m->nr);
	}
	m->val[i].value = value;
}

/*
 * Arrange for @msr to be switched atomically on VM-entry/VM-exit, using
 * dedicated VMCS controls when the CPU supports them and the generic
 * autoload lists otherwise.
 */
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val)
{
	struct msr_autoload *m = &vmx->msr_autoload;
	struct kvm *kvm = vmx->vcpu.kvm;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
1092 return; 1093 } 1094 break; 1095 case MSR_CORE_PERF_GLOBAL_CTRL: 1096 if (cpu_has_load_perf_global_ctrl()) { 1097 add_atomic_switch_msr_special(vmx, 1098 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1099 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1100 GUEST_IA32_PERF_GLOBAL_CTRL, 1101 HOST_IA32_PERF_GLOBAL_CTRL, 1102 guest_val, host_val); 1103 return; 1104 } 1105 break; 1106 case MSR_IA32_PEBS_ENABLE: 1107 /* PEBS needs a quiescent period after being disabled (to write 1108 * a record). Disabling PEBS through VMX MSR swapping doesn't 1109 * provide that period, so a CPU could write host's record into 1110 * guest's memory. 1111 */ 1112 wrmsrq(MSR_IA32_PEBS_ENABLE, 0); 1113 } 1114 1115 vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm); 1116 vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm); 1117 } 1118 1119 static bool update_transition_efer(struct vcpu_vmx *vmx) 1120 { 1121 u64 guest_efer = vmx->vcpu.arch.efer; 1122 u64 ignore_bits = 0; 1123 int i; 1124 1125 /* Shadow paging assumes NX to be available. */ 1126 if (!enable_ept) 1127 guest_efer |= EFER_NX; 1128 1129 /* 1130 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1131 */ 1132 ignore_bits |= EFER_SCE; 1133 #ifdef CONFIG_X86_64 1134 ignore_bits |= EFER_LMA | EFER_LME; 1135 /* SCE is meaningful only in long mode on Intel */ 1136 if (guest_efer & EFER_LMA) 1137 ignore_bits &= ~(u64)EFER_SCE; 1138 #endif 1139 1140 /* 1141 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1142 * On CPUs that support "load IA32_EFER", always switch EFER 1143 * atomically, since it's faster than switching it manually. 
1144 */ 1145 if (cpu_has_load_ia32_efer() || 1146 (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { 1147 if (!(guest_efer & EFER_LMA)) 1148 guest_efer &= ~EFER_LME; 1149 if (guest_efer != kvm_host.efer) 1150 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer); 1151 else 1152 clear_atomic_switch_msr(vmx, MSR_EFER); 1153 return false; 1154 } 1155 1156 i = kvm_find_user_return_msr(MSR_EFER); 1157 if (i < 0) 1158 return false; 1159 1160 clear_atomic_switch_msr(vmx, MSR_EFER); 1161 1162 guest_efer &= ~ignore_bits; 1163 guest_efer |= kvm_host.efer & ignore_bits; 1164 1165 vmx->guest_uret_msrs[i].data = guest_efer; 1166 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1167 1168 return true; 1169 } 1170 1171 static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr) 1172 { 1173 vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT, 1174 vmx->vcpu.kvm); 1175 } 1176 1177 static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr) 1178 { 1179 vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT); 1180 } 1181 1182 #ifdef CONFIG_X86_32 1183 /* 1184 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1185 * VMCS rather than the segment table. KVM uses this helper to figure 1186 * out the current bases to poke them into the VMCS before entry. 
1187 */ 1188 static unsigned long segment_base(u16 selector) 1189 { 1190 struct desc_struct *table; 1191 unsigned long v; 1192 1193 if (!(selector & ~SEGMENT_RPL_MASK)) 1194 return 0; 1195 1196 table = get_current_gdt_ro(); 1197 1198 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1199 u16 ldt_selector = kvm_read_ldt(); 1200 1201 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1202 return 0; 1203 1204 table = (struct desc_struct *)segment_base(ldt_selector); 1205 } 1206 v = get_desc_base(&table[selector >> 3]); 1207 return v; 1208 } 1209 #endif 1210 1211 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1212 { 1213 return vmx_pt_mode_is_host_guest() && 1214 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1215 } 1216 1217 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1218 { 1219 /* The base must be 128-byte aligned and a legal physical address. */ 1220 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1221 } 1222 1223 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1224 { 1225 u32 i; 1226 1227 wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); 1228 wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1229 wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1230 wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1231 for (i = 0; i < addr_range; i++) { 1232 wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1233 wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1234 } 1235 } 1236 1237 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1238 { 1239 u32 i; 1240 1241 rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); 1242 rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1243 rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1244 rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1245 for (i = 0; i < addr_range; i++) { 1246 rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1247 rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1248 } 1249 } 1250 1251 static void pt_guest_enter(struct vcpu_vmx *vmx) 
1252 { 1253 if (vmx_pt_mode_is_system()) 1254 return; 1255 1256 /* 1257 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1258 * Save host state before VM entry. 1259 */ 1260 rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1261 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1262 wrmsrq(MSR_IA32_RTIT_CTL, 0); 1263 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1264 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1265 } 1266 } 1267 1268 static void pt_guest_exit(struct vcpu_vmx *vmx) 1269 { 1270 if (vmx_pt_mode_is_system()) 1271 return; 1272 1273 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1274 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1275 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1276 } 1277 1278 /* 1279 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1280 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 1281 */ 1282 if (vmx->pt_desc.host.ctl) 1283 wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1284 } 1285 1286 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1287 unsigned long fs_base, unsigned long gs_base) 1288 { 1289 if (unlikely(fs_sel != host->fs_sel)) { 1290 if (!(fs_sel & 7)) 1291 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1292 else 1293 vmcs_write16(HOST_FS_SELECTOR, 0); 1294 host->fs_sel = fs_sel; 1295 } 1296 if (unlikely(gs_sel != host->gs_sel)) { 1297 if (!(gs_sel & 7)) 1298 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1299 else 1300 vmcs_write16(HOST_GS_SELECTOR, 0); 1301 host->gs_sel = gs_sel; 1302 } 1303 if (unlikely(fs_base != host->fs_base)) { 1304 vmcs_writel(HOST_FS_BASE, fs_base); 1305 host->fs_base = fs_base; 1306 } 1307 if (unlikely(gs_base != host->gs_base)) { 1308 vmcs_writel(HOST_GS_BASE, gs_base); 1309 host->gs_base = gs_base; 1310 } 1311 } 1312 1313 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1314 { 1315 struct vcpu_vmx *vmx = to_vmx(vcpu); 1316 struct vcpu_vt *vt = 
to_vt(vcpu); 1317 struct vmcs_host_state *host_state; 1318 #ifdef CONFIG_X86_64 1319 int cpu = raw_smp_processor_id(); 1320 #endif 1321 unsigned long fs_base, gs_base; 1322 u16 fs_sel, gs_sel; 1323 int i; 1324 1325 /* 1326 * Note that guest MSRs to be saved/restored can also be changed 1327 * when guest state is loaded. This happens when guest transitions 1328 * to/from long-mode by setting MSR_EFER.LMA. 1329 */ 1330 if (!vmx->guest_uret_msrs_loaded) { 1331 vmx->guest_uret_msrs_loaded = true; 1332 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1333 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1334 continue; 1335 1336 kvm_set_user_return_msr(i, 1337 vmx->guest_uret_msrs[i].data, 1338 vmx->guest_uret_msrs[i].mask); 1339 } 1340 } 1341 1342 if (vmx->nested.need_vmcs12_to_shadow_sync) 1343 nested_sync_vmcs12_to_shadow(vcpu); 1344 1345 if (vt->guest_state_loaded) 1346 return; 1347 1348 host_state = &vmx->loaded_vmcs->host_state; 1349 1350 /* 1351 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1352 * allow segment selectors with cpl > 0 or ti == 1. 
1353 */ 1354 host_state->ldt_sel = kvm_read_ldt(); 1355 1356 #ifdef CONFIG_X86_64 1357 savesegment(ds, host_state->ds_sel); 1358 savesegment(es, host_state->es_sel); 1359 1360 gs_base = cpu_kernelmode_gs_base(cpu); 1361 if (likely(is_64bit_mm(current->mm))) { 1362 current_save_fsgs(); 1363 fs_sel = current->thread.fsindex; 1364 gs_sel = current->thread.gsindex; 1365 fs_base = current->thread.fsbase; 1366 vt->msr_host_kernel_gs_base = current->thread.gsbase; 1367 } else { 1368 savesegment(fs, fs_sel); 1369 savesegment(gs, gs_sel); 1370 fs_base = read_msr(MSR_FS_BASE); 1371 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1372 } 1373 1374 wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1375 #else 1376 savesegment(fs, fs_sel); 1377 savesegment(gs, gs_sel); 1378 fs_base = segment_base(fs_sel); 1379 gs_base = segment_base(gs_sel); 1380 #endif 1381 1382 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1383 vt->guest_state_loaded = true; 1384 } 1385 1386 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1387 { 1388 struct vmcs_host_state *host_state; 1389 1390 if (!vmx->vt.guest_state_loaded) 1391 return; 1392 1393 host_state = &vmx->loaded_vmcs->host_state; 1394 1395 ++vmx->vcpu.stat.host_state_reload; 1396 1397 #ifdef CONFIG_X86_64 1398 rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1399 #endif 1400 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1401 kvm_load_ldt(host_state->ldt_sel); 1402 #ifdef CONFIG_X86_64 1403 load_gs_index(host_state->gs_sel); 1404 #else 1405 loadsegment(gs, host_state->gs_sel); 1406 #endif 1407 } 1408 if (host_state->fs_sel & 7) 1409 loadsegment(fs, host_state->fs_sel); 1410 #ifdef CONFIG_X86_64 1411 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1412 loadsegment(ds, host_state->ds_sel); 1413 loadsegment(es, host_state->es_sel); 1414 } 1415 #endif 1416 invalidate_tss_limit(); 1417 #ifdef CONFIG_X86_64 1418 wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); 
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->vt.guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
/*
 * Read an MSR whose guest value may currently be live in hardware: refresh
 * the cache from hardware if guest state is loaded, else return the cache.
 */
static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
{
	preempt_disable();
	if (vmx->vt.guest_state_loaded)
		*cache = read_msr(msr);
	preempt_enable();
	return *cache;
}

/*
 * Write an MSR whose guest value may currently be live in hardware: update
 * hardware if guest state is loaded, and always update the cache.
 */
static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
				     u64 *cache)
{
	preempt_disable();
	if (vmx->vt.guest_state_loaded)
		wrmsrns(msr, data);
	preempt_enable();
	*cache = data;
}

static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
				       &vmx->msr_guest_kernel_gs_base);
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
				 &vmx->msr_guest_kernel_gs_base);
}
#endif

/* Grow the PLE (pause-loop exiting) window, tracing any change. */
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old, ple_window,
					    ple_window_grow,
					    ple_window_max);

	if (vmx->ple_window != old) {
		/* Dirty flag defers the VMCS update to the next VM-Entry. */
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

/* Shrink the PLE window (floor is ple_window), tracing any change. */
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old, ple_window,
					      ple_window_shrink,
					      ple_window);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

/*
 * Make the vCPU's loaded_vmcs current on @cpu: migrate it onto this CPU's
 * per-cpu list if it moved, VMPTRLD it if it isn't already current, and
 * refresh the per-cpu host state (TSS/GDT bases) on migration.
 */
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
1549 */ 1550 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1551 { 1552 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1553 shrink_ple_window(vcpu); 1554 1555 vmx_vcpu_load_vmcs(vcpu, cpu); 1556 1557 vmx_vcpu_pi_load(vcpu, cpu); 1558 } 1559 1560 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1561 { 1562 vmx_vcpu_pi_put(vcpu); 1563 1564 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1565 } 1566 1567 static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu, 1568 struct loaded_vmcs *vmcs) 1569 { 1570 struct vcpu_vmx *vmx = to_vmx(vcpu); 1571 int cpu; 1572 1573 cpu = get_cpu(); 1574 vmx->loaded_vmcs = vmcs; 1575 vmx_vcpu_load_vmcs(vcpu, cpu); 1576 put_cpu(); 1577 } 1578 1579 static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) 1580 { 1581 struct vcpu_vmx *vmx = to_vmx(vcpu); 1582 1583 if (!is_guest_mode(vcpu)) { 1584 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 1585 return; 1586 } 1587 1588 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02); 1589 vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01); 1590 } 1591 1592 static void vmx_put_vmcs01(struct kvm_vcpu *vcpu) 1593 { 1594 if (!is_guest_mode(vcpu)) 1595 return; 1596 1597 vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02); 1598 } 1599 DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *, 1600 vmx_load_vmcs01(_T), vmx_put_vmcs01(_T)) 1601 1602 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1603 { 1604 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1605 } 1606 1607 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1608 { 1609 struct vcpu_vmx *vmx = to_vmx(vcpu); 1610 unsigned long rflags, save_rflags; 1611 1612 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1613 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1614 rflags = vmcs_readl(GUEST_RFLAGS); 1615 if (vmx->rmode.vm86_active) { 1616 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1617 save_rflags = vmx->rmode.save_rflags; 1618 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1619 } 1620 vmx->rflags = rflags; 1621 } 1622 
return vmx->rflags; 1623 } 1624 1625 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1626 { 1627 struct vcpu_vmx *vmx = to_vmx(vcpu); 1628 unsigned long old_rflags; 1629 1630 /* 1631 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1632 * is an unrestricted guest in order to mark L2 as needing emulation 1633 * if L1 runs L2 as a restricted guest. 1634 */ 1635 if (is_unrestricted_guest(vcpu)) { 1636 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1637 vmx->rflags = rflags; 1638 vmcs_writel(GUEST_RFLAGS, rflags); 1639 return; 1640 } 1641 1642 old_rflags = vmx_get_rflags(vcpu); 1643 vmx->rflags = rflags; 1644 if (vmx->rmode.vm86_active) { 1645 vmx->rmode.save_rflags = rflags; 1646 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1647 } 1648 vmcs_writel(GUEST_RFLAGS, rflags); 1649 1650 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1651 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 1652 } 1653 1654 bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1655 { 1656 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1657 } 1658 1659 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1660 { 1661 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1662 int ret = 0; 1663 1664 if (interruptibility & GUEST_INTR_STATE_STI) 1665 ret |= KVM_X86_SHADOW_INT_STI; 1666 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1667 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1668 1669 return ret; 1670 } 1671 1672 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1673 { 1674 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1675 u32 interruptibility = interruptibility_old; 1676 1677 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1678 1679 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1680 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1681 else if (mask & KVM_X86_SHADOW_INT_STI) 1682 interruptibility |= GUEST_INTR_STATE_STI; 1683 1684 if ((interruptibility != interruptibility_old)) 1685 
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1686 } 1687 1688 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1689 { 1690 struct vcpu_vmx *vmx = to_vmx(vcpu); 1691 unsigned long value; 1692 1693 /* 1694 * Any MSR write that attempts to change bits marked reserved will 1695 * case a #GP fault. 1696 */ 1697 if (data & vmx->pt_desc.ctl_bitmask) 1698 return 1; 1699 1700 /* 1701 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1702 * result in a #GP unless the same write also clears TraceEn. 1703 */ 1704 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1705 (data & RTIT_CTL_TRACEEN) && 1706 data != vmx->pt_desc.guest.ctl) 1707 return 1; 1708 1709 /* 1710 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1711 * and FabricEn would cause #GP, if 1712 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1713 */ 1714 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1715 !(data & RTIT_CTL_FABRIC_EN) && 1716 !intel_pt_validate_cap(vmx->pt_desc.caps, 1717 PT_CAP_single_range_output)) 1718 return 1; 1719 1720 /* 1721 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1722 * utilize encodings marked reserved will cause a #GP fault. 
1723 */ 1724 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1725 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1726 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1727 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1728 return 1; 1729 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1730 PT_CAP_cycle_thresholds); 1731 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1732 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1733 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1734 return 1; 1735 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1736 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1737 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1738 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1739 return 1; 1740 1741 /* 1742 * If ADDRx_CFG is reserved or the encodings is >2 will 1743 * cause a #GP fault. 1744 */ 1745 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1746 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1747 return 1; 1748 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1749 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1750 return 1; 1751 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1752 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1753 return 1; 1754 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1755 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1756 return 1; 1757 1758 return 0; 1759 } 1760 1761 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1762 void *insn, int insn_len) 1763 { 1764 /* 1765 * Emulation of instructions in SGX enclaves is impossible as RIP does 1766 * not point at the failing instruction, and even if it did, the code 1767 * stream is inaccessible. Inject #UD instead of exiting to userspace 1768 * so that guest userspace can't DoS the guest simply by triggering 1769 * emulation (enclaves are CPL3 only). 
1770 */ 1771 if (vmx_get_exit_reason(vcpu).enclave_mode) { 1772 kvm_queue_exception(vcpu, UD_VECTOR); 1773 return X86EMUL_PROPAGATE_FAULT; 1774 } 1775 1776 /* Check that emulation is possible during event vectoring */ 1777 if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 1778 !kvm_can_emulate_event_vectoring(emul_type)) 1779 return X86EMUL_UNHANDLEABLE_VECTORING; 1780 1781 return X86EMUL_CONTINUE; 1782 } 1783 1784 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1785 { 1786 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 1787 unsigned long rip, orig_rip; 1788 u32 instr_len; 1789 1790 /* 1791 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1792 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1793 * set when EPT misconfig occurs. In practice, real hardware updates 1794 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1795 * (namely Hyper-V) don't set it due to it being undefined behavior, 1796 * i.e. we end up advancing IP with some random value. 1797 */ 1798 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1799 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1800 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1801 1802 /* 1803 * Emulating an enclave's instructions isn't supported as KVM 1804 * cannot access the enclave's memory or its true RIP, e.g. the 1805 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1806 * the RIP that actually triggered the VM-Exit. But, because 1807 * most instructions that cause VM-Exit will #UD in an enclave, 1808 * most instruction-based VM-Exits simply do not occur. 1809 * 1810 * There are a few exceptions, notably the debug instructions 1811 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1812 * and generate #DB/#BP as expected, which KVM might intercept. 1813 * But again, the CPU does the dirty work and saves an instr 1814 * length of zero so VMMs don't shoot themselves in the foot. 
1815 * WARN if KVM tries to skip a non-zero length instruction on 1816 * a VM-Exit from an enclave. 1817 */ 1818 if (!instr_len) 1819 goto rip_updated; 1820 1821 WARN_ONCE(exit_reason.enclave_mode, 1822 "skipping instruction after SGX enclave VM-Exit"); 1823 1824 orig_rip = kvm_rip_read(vcpu); 1825 rip = orig_rip + instr_len; 1826 #ifdef CONFIG_X86_64 1827 /* 1828 * We need to mask out the high 32 bits of RIP if not in 64-bit 1829 * mode, but just finding out that we are in 64-bit mode is 1830 * quite expensive. Only do it if there was a carry. 1831 */ 1832 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1833 rip = (u32)rip; 1834 #endif 1835 kvm_rip_write(vcpu, rip); 1836 } else { 1837 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1838 return 0; 1839 } 1840 1841 rip_updated: 1842 /* skipping an emulated instruction also counts */ 1843 vmx_set_interrupt_shadow(vcpu, 0); 1844 1845 return 1; 1846 } 1847 1848 /* 1849 * Recognizes a pending MTF VM-exit and records the nested state for later 1850 * delivery. 1851 */ 1852 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1853 { 1854 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1855 struct vcpu_vmx *vmx = to_vmx(vcpu); 1856 1857 if (!is_guest_mode(vcpu)) 1858 return; 1859 1860 /* 1861 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1862 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1863 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1864 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1865 * as ICEBP is higher priority than both. As instruction emulation is 1866 * completed at this point (i.e. KVM is at the instruction boundary), 1867 * any #DB exception pending delivery must be a debug-trap of lower 1868 * priority than MTF. Record the pending MTF state to be delivered in 1869 * vmx_check_nested_events(). 
1870 */ 1871 if (nested_cpu_has_mtf(vmcs12) && 1872 (!vcpu->arch.exception.pending || 1873 vcpu->arch.exception.vector == DB_VECTOR) && 1874 (!vcpu->arch.exception_vmexit.pending || 1875 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1876 vmx->nested.mtf_pending = true; 1877 kvm_make_request(KVM_REQ_EVENT, vcpu); 1878 } else { 1879 vmx->nested.mtf_pending = false; 1880 } 1881 } 1882 1883 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1884 { 1885 vmx_update_emulated_instruction(vcpu); 1886 return skip_emulated_instruction(vcpu); 1887 } 1888 1889 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1890 { 1891 /* 1892 * Ensure that we clear the HLT state in the VMCS. We don't need to 1893 * explicitly skip the instruction because if the HLT state is set, 1894 * then the instruction is already executing and RIP has already been 1895 * advanced. 1896 */ 1897 if (kvm_hlt_in_guest(vcpu->kvm) && 1898 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1899 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1900 } 1901 1902 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1903 { 1904 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1905 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1906 struct vcpu_vmx *vmx = to_vmx(vcpu); 1907 1908 kvm_deliver_exception_payload(vcpu, ex); 1909 1910 if (ex->has_error_code) { 1911 /* 1912 * Despite the error code being architecturally defined as 32 1913 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1914 * VMX don't actually supporting setting bits 31:16. Hardware 1915 * will (should) never provide a bogus error code, but AMD CPUs 1916 * do generate error codes with bits 31:16 set, and so KVM's 1917 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1918 * the upper bits to avoid VM-Fail, losing information that 1919 * doesn't really exist is preferable to killing the VM. 
1920 */ 1921 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1922 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1923 } 1924 1925 if (vmx->rmode.vm86_active) { 1926 int inc_eip = 0; 1927 if (kvm_exception_is_soft(ex->vector)) 1928 inc_eip = vcpu->arch.event_exit_inst_len; 1929 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1930 return; 1931 } 1932 1933 WARN_ON_ONCE(vmx->vt.emulation_required); 1934 1935 if (kvm_exception_is_soft(ex->vector)) { 1936 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1937 vmx->vcpu.arch.event_exit_inst_len); 1938 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1939 } else 1940 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1941 1942 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1943 1944 vmx_clear_hlt(vcpu); 1945 } 1946 1947 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1948 bool load_into_hardware) 1949 { 1950 struct vmx_uret_msr *uret_msr; 1951 1952 uret_msr = vmx_find_uret_msr(vmx, msr); 1953 if (!uret_msr) 1954 return; 1955 1956 uret_msr->load_into_hardware = load_into_hardware; 1957 } 1958 1959 /* 1960 * Configuring user return MSRs to automatically save, load, and restore MSRs 1961 * that need to be shoved into hardware when running the guest. Note, omitting 1962 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1963 * loaded into hardware when running the guest. 1964 */ 1965 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1966 { 1967 #ifdef CONFIG_X86_64 1968 bool load_syscall_msrs; 1969 1970 /* 1971 * The SYSCALL MSRs are only needed on long mode guests, and only 1972 * when EFER.SCE is set. 
1973 */ 1974 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1975 (vmx->vcpu.arch.efer & EFER_SCE); 1976 1977 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1978 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1979 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1980 #endif 1981 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1982 1983 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1984 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1985 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1986 1987 /* 1988 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1989 * kernel and old userspace. If those guests run on a tsx=off host, do 1990 * allow guests to use TSX_CTRL, but don't change the value in hardware 1991 * so that TSX remains always disabled. 1992 */ 1993 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1994 1995 /* 1996 * The set of MSRs to load may have changed, reload MSRs before the 1997 * next VM-Enter. 
1998 */ 1999 vmx->guest_uret_msrs_loaded = false; 2000 } 2001 2002 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 2003 { 2004 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2005 2006 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 2007 return vmcs12->tsc_offset; 2008 2009 return 0; 2010 } 2011 2012 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 2013 { 2014 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2015 2016 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 2017 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 2018 return vmcs12->tsc_multiplier; 2019 2020 return kvm_caps.default_tsc_scaling_ratio; 2021 } 2022 2023 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 2024 { 2025 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2026 } 2027 2028 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 2029 { 2030 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2031 } 2032 2033 /* 2034 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 2035 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 2036 * backwards compatibility even though KVM doesn't support emulating SMX. And 2037 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 2038 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 2039 */ 2040 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 2041 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 2042 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 2043 FEAT_CTL_SGX_LC_ENABLED | \ 2044 FEAT_CTL_SGX_ENABLED | \ 2045 FEAT_CTL_LMCE_ENABLED) 2046 2047 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 2048 struct msr_data *msr) 2049 { 2050 uint64_t valid_bits; 2051 2052 /* 2053 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 2054 * exposed to the guest. 
2055 */ 2056 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 2057 ~KVM_SUPPORTED_FEATURE_CONTROL); 2058 2059 if (!msr->host_initiated && 2060 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 2061 return false; 2062 2063 if (msr->host_initiated) 2064 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 2065 else 2066 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 2067 2068 return !(msr->data & ~valid_bits); 2069 } 2070 2071 int vmx_get_feature_msr(u32 msr, u64 *data) 2072 { 2073 switch (msr) { 2074 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2075 if (!nested) 2076 return 1; 2077 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2078 default: 2079 return KVM_MSR_RET_UNSUPPORTED; 2080 } 2081 } 2082 2083 /* 2084 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 2085 * Returns 0 on success, non-0 otherwise. 2086 * Assumes vcpu_load() was already called. 2087 */ 2088 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2089 { 2090 struct vcpu_vmx *vmx = to_vmx(vcpu); 2091 struct vmx_uret_msr *msr; 2092 u32 index; 2093 2094 switch (msr_info->index) { 2095 #ifdef CONFIG_X86_64 2096 case MSR_FS_BASE: 2097 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2098 break; 2099 case MSR_GS_BASE: 2100 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2101 break; 2102 case MSR_KERNEL_GS_BASE: 2103 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2104 break; 2105 #endif 2106 case MSR_EFER: 2107 return kvm_get_msr_common(vcpu, msr_info); 2108 case MSR_IA32_TSX_CTRL: 2109 if (!msr_info->host_initiated && 2110 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2111 return 1; 2112 goto find_uret_msr; 2113 case MSR_IA32_UMWAIT_CONTROL: 2114 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2115 return 1; 2116 2117 msr_info->data = vmx->msr_ia32_umwait_control; 2118 break; 2119 case MSR_IA32_SPEC_CTRL: 2120 if (!msr_info->host_initiated && 2121 !guest_has_spec_ctrl_msr(vcpu)) 2122 return 1; 2123 2124 msr_info->data = 
to_vmx(vcpu)->spec_ctrl; 2125 break; 2126 case MSR_IA32_SYSENTER_CS: 2127 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2128 break; 2129 case MSR_IA32_SYSENTER_EIP: 2130 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2131 break; 2132 case MSR_IA32_SYSENTER_ESP: 2133 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2134 break; 2135 case MSR_IA32_BNDCFGS: 2136 if (!kvm_mpx_supported() || 2137 (!msr_info->host_initiated && 2138 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2139 return 1; 2140 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2141 break; 2142 case MSR_IA32_MCG_EXT_CTL: 2143 if (!msr_info->host_initiated && 2144 !(vmx->msr_ia32_feature_control & 2145 FEAT_CTL_LMCE_ENABLED)) 2146 return 1; 2147 msr_info->data = vcpu->arch.mcg_ext_ctl; 2148 break; 2149 case MSR_IA32_FEAT_CTL: 2150 msr_info->data = vmx->msr_ia32_feature_control; 2151 break; 2152 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2153 if (!msr_info->host_initiated && 2154 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 2155 return 1; 2156 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2157 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2158 break; 2159 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2160 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2161 return 1; 2162 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2163 &msr_info->data)) 2164 return 1; 2165 #ifdef CONFIG_KVM_HYPERV 2166 /* 2167 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2168 * instead of just ignoring the features, different Hyper-V 2169 * versions are either trying to use them and fail or do some 2170 * sanity checking and refuse to boot. Filter all unsupported 2171 * features out. 
2172 */ 2173 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) 2174 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2175 &msr_info->data); 2176 #endif 2177 break; 2178 case MSR_IA32_RTIT_CTL: 2179 if (!vmx_pt_mode_is_host_guest()) 2180 return 1; 2181 msr_info->data = vmx->pt_desc.guest.ctl; 2182 break; 2183 case MSR_IA32_RTIT_STATUS: 2184 if (!vmx_pt_mode_is_host_guest()) 2185 return 1; 2186 msr_info->data = vmx->pt_desc.guest.status; 2187 break; 2188 case MSR_IA32_RTIT_CR3_MATCH: 2189 if (!vmx_pt_mode_is_host_guest() || 2190 !intel_pt_validate_cap(vmx->pt_desc.caps, 2191 PT_CAP_cr3_filtering)) 2192 return 1; 2193 msr_info->data = vmx->pt_desc.guest.cr3_match; 2194 break; 2195 case MSR_IA32_RTIT_OUTPUT_BASE: 2196 if (!vmx_pt_mode_is_host_guest() || 2197 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2198 PT_CAP_topa_output) && 2199 !intel_pt_validate_cap(vmx->pt_desc.caps, 2200 PT_CAP_single_range_output))) 2201 return 1; 2202 msr_info->data = vmx->pt_desc.guest.output_base; 2203 break; 2204 case MSR_IA32_RTIT_OUTPUT_MASK: 2205 if (!vmx_pt_mode_is_host_guest() || 2206 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2207 PT_CAP_topa_output) && 2208 !intel_pt_validate_cap(vmx->pt_desc.caps, 2209 PT_CAP_single_range_output))) 2210 return 1; 2211 msr_info->data = vmx->pt_desc.guest.output_mask; 2212 break; 2213 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2214 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2215 if (!vmx_pt_mode_is_host_guest() || 2216 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2217 return 1; 2218 if (index % 2) 2219 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2220 else 2221 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2222 break; 2223 case MSR_IA32_S_CET: 2224 msr_info->data = vmcs_readl(GUEST_S_CET); 2225 break; 2226 case MSR_KVM_INTERNAL_GUEST_SSP: 2227 msr_info->data = vmcs_readl(GUEST_SSP); 2228 break; 2229 case MSR_IA32_INT_SSP_TAB: 2230 msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); 2231 break; 2232 case MSR_IA32_DEBUGCTLMSR: 2233 msr_info->data = vmx_guest_debugctl_read(); 2234 break; 2235 default: 2236 find_uret_msr: 2237 msr = vmx_find_uret_msr(vmx, msr_info->index); 2238 if (msr) { 2239 msr_info->data = msr->data; 2240 break; 2241 } 2242 return kvm_get_msr_common(vcpu, msr_info); 2243 } 2244 2245 return 0; 2246 } 2247 2248 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2249 u64 data) 2250 { 2251 #ifdef CONFIG_X86_64 2252 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2253 return (u32)data; 2254 #endif 2255 return (unsigned long)data; 2256 } 2257 2258 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2259 { 2260 u64 debugctl = 0; 2261 2262 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2263 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2264 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2265 2266 if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && 2267 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2268 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2269 2270 if (boot_cpu_has(X86_FEATURE_RTM) && 2271 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) 2272 debugctl |= DEBUGCTLMSR_RTM_DEBUG; 2273 2274 return debugctl; 2275 } 2276 2277 bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) 2278 { 2279 u64 
invalid; 2280 2281 invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); 2282 if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { 2283 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 2284 invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); 2285 } 2286 return !invalid; 2287 } 2288 2289 /* 2290 * Writes msr value into the appropriate "register". 2291 * Returns 0 on success, non-0 otherwise. 2292 * Assumes vcpu_load() was already called. 2293 */ 2294 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2295 { 2296 struct vcpu_vmx *vmx = to_vmx(vcpu); 2297 struct vmx_uret_msr *msr; 2298 int ret = 0; 2299 u32 msr_index = msr_info->index; 2300 u64 data = msr_info->data; 2301 u32 index; 2302 2303 switch (msr_index) { 2304 case MSR_EFER: 2305 ret = kvm_set_msr_common(vcpu, msr_info); 2306 break; 2307 #ifdef CONFIG_X86_64 2308 case MSR_FS_BASE: 2309 vmx_segment_cache_clear(vmx); 2310 vmcs_writel(GUEST_FS_BASE, data); 2311 break; 2312 case MSR_GS_BASE: 2313 vmx_segment_cache_clear(vmx); 2314 vmcs_writel(GUEST_GS_BASE, data); 2315 break; 2316 case MSR_KERNEL_GS_BASE: 2317 vmx_write_guest_kernel_gs_base(vmx, data); 2318 break; 2319 case MSR_IA32_XFD: 2320 ret = kvm_set_msr_common(vcpu, msr_info); 2321 /* 2322 * Always intercepting WRMSR could incur non-negligible 2323 * overhead given xfd might be changed frequently in 2324 * guest context switch. Disable write interception 2325 * upon the first write with a non-zero value (indicating 2326 * potential usage on dynamic xfeatures). Also update 2327 * exception bitmap to trap #NM for proper virtualization 2328 * of guest xfd_err. 
2329 */ 2330 if (!ret && data) { 2331 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2332 MSR_TYPE_RW); 2333 vcpu->arch.xfd_no_write_intercept = true; 2334 vmx_update_exception_bitmap(vcpu); 2335 } 2336 break; 2337 #endif 2338 case MSR_IA32_SYSENTER_CS: 2339 if (is_guest_mode(vcpu)) 2340 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2341 vmcs_write32(GUEST_SYSENTER_CS, data); 2342 break; 2343 case MSR_IA32_SYSENTER_EIP: 2344 if (is_guest_mode(vcpu)) { 2345 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2346 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2347 } 2348 vmcs_writel(GUEST_SYSENTER_EIP, data); 2349 break; 2350 case MSR_IA32_SYSENTER_ESP: 2351 if (is_guest_mode(vcpu)) { 2352 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2353 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2354 } 2355 vmcs_writel(GUEST_SYSENTER_ESP, data); 2356 break; 2357 case MSR_IA32_DEBUGCTLMSR: 2358 if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) 2359 return 1; 2360 2361 data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2362 2363 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2364 VM_EXIT_SAVE_DEBUG_CONTROLS) 2365 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2366 2367 vmx_guest_debugctl_write(vcpu, data); 2368 2369 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2370 (data & DEBUGCTLMSR_LBR)) 2371 intel_pmu_create_guest_lbr_event(vcpu); 2372 return 0; 2373 case MSR_IA32_BNDCFGS: 2374 if (!kvm_mpx_supported() || 2375 (!msr_info->host_initiated && 2376 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2377 return 1; 2378 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2379 (data & MSR_IA32_BNDCFGS_RSVD)) 2380 return 1; 2381 2382 if (is_guest_mode(vcpu) && 2383 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2384 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2385 get_vmcs12(vcpu)->guest_bndcfgs = data; 2386 2387 vmcs_write64(GUEST_BNDCFGS, data); 2388 break; 2389 case 
MSR_IA32_UMWAIT_CONTROL: 2390 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2391 return 1; 2392 2393 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2394 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2395 return 1; 2396 2397 vmx->msr_ia32_umwait_control = data; 2398 break; 2399 case MSR_IA32_SPEC_CTRL: 2400 if (!msr_info->host_initiated && 2401 !guest_has_spec_ctrl_msr(vcpu)) 2402 return 1; 2403 2404 if (kvm_spec_ctrl_test_value(data)) 2405 return 1; 2406 2407 vmx->spec_ctrl = data; 2408 if (!data) 2409 break; 2410 2411 /* 2412 * For non-nested: 2413 * When it's written (to non-zero) for the first time, pass 2414 * it through. 2415 * 2416 * For nested: 2417 * The handling of the MSR bitmap for L2 guests is done in 2418 * nested_vmx_prepare_msr_bitmap. We should not touch the 2419 * vmcs02.msr_bitmap here since it gets completely overwritten 2420 * in the merging. We update the vmcs01 here for L1 as well 2421 * since it will end up touching the MSR anyway now. 2422 */ 2423 vmx_disable_intercept_for_msr(vcpu, 2424 MSR_IA32_SPEC_CTRL, 2425 MSR_TYPE_RW); 2426 break; 2427 case MSR_IA32_TSX_CTRL: 2428 if (!msr_info->host_initiated && 2429 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2430 return 1; 2431 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2432 return 1; 2433 goto find_uret_msr; 2434 case MSR_IA32_CR_PAT: 2435 ret = kvm_set_msr_common(vcpu, msr_info); 2436 if (ret) 2437 break; 2438 2439 if (is_guest_mode(vcpu) && 2440 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2441 get_vmcs12(vcpu)->guest_ia32_pat = data; 2442 2443 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2444 vmcs_write64(GUEST_IA32_PAT, data); 2445 break; 2446 case MSR_IA32_MCG_EXT_CTL: 2447 if ((!msr_info->host_initiated && 2448 !(to_vmx(vcpu)->msr_ia32_feature_control & 2449 FEAT_CTL_LMCE_ENABLED)) || 2450 (data & ~MCG_EXT_CTL_LMCE_EN)) 2451 return 1; 2452 vcpu->arch.mcg_ext_ctl = data; 2453 break; 2454 case MSR_IA32_FEAT_CTL: 
2455 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2456 return 1; 2457 2458 vmx->msr_ia32_feature_control = data; 2459 if (msr_info->host_initiated && data == 0) 2460 vmx_leave_nested(vcpu); 2461 2462 /* SGX may be enabled/disabled by guest's firmware */ 2463 vmx_write_encls_bitmap(vcpu, NULL); 2464 break; 2465 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2466 /* 2467 * On real hardware, the LE hash MSRs are writable before 2468 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2469 * at which point SGX related bits in IA32_FEATURE_CONTROL 2470 * become writable. 2471 * 2472 * KVM does not emulate SGX activation for simplicity, so 2473 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2474 * is unlocked. This is technically not architectural 2475 * behavior, but it's close enough. 2476 */ 2477 if (!msr_info->host_initiated && 2478 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2479 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2480 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2481 return 1; 2482 vmx->msr_ia32_sgxlepubkeyhash 2483 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2484 break; 2485 case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: 2486 if (!msr_info->host_initiated) 2487 return 1; /* they are read-only */ 2488 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2489 return 1; 2490 return vmx_set_vmx_msr(vcpu, msr_index, data); 2491 case MSR_IA32_RTIT_CTL: 2492 if (!vmx_pt_mode_is_host_guest() || 2493 vmx_rtit_ctl_check(vcpu, data) || 2494 vmx->nested.vmxon) 2495 return 1; 2496 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2497 vmx->pt_desc.guest.ctl = data; 2498 pt_update_intercept_for_msr(vcpu); 2499 break; 2500 case MSR_IA32_RTIT_STATUS: 2501 if (!pt_can_write_msr(vmx)) 2502 return 1; 2503 if (data & MSR_IA32_RTIT_STATUS_MASK) 2504 return 1; 2505 vmx->pt_desc.guest.status = data; 2506 break; 2507 case MSR_IA32_RTIT_CR3_MATCH: 2508 if (!pt_can_write_msr(vmx)) 2509 return 1; 2510 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2511 PT_CAP_cr3_filtering)) 2512 return 1; 2513 vmx->pt_desc.guest.cr3_match = data; 2514 break; 2515 case MSR_IA32_RTIT_OUTPUT_BASE: 2516 if (!pt_can_write_msr(vmx)) 2517 return 1; 2518 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2519 PT_CAP_topa_output) && 2520 !intel_pt_validate_cap(vmx->pt_desc.caps, 2521 PT_CAP_single_range_output)) 2522 return 1; 2523 if (!pt_output_base_valid(vcpu, data)) 2524 return 1; 2525 vmx->pt_desc.guest.output_base = data; 2526 break; 2527 case MSR_IA32_RTIT_OUTPUT_MASK: 2528 if (!pt_can_write_msr(vmx)) 2529 return 1; 2530 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2531 PT_CAP_topa_output) && 2532 !intel_pt_validate_cap(vmx->pt_desc.caps, 2533 PT_CAP_single_range_output)) 2534 return 1; 2535 vmx->pt_desc.guest.output_mask = data; 2536 break; 2537 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2538 if (!pt_can_write_msr(vmx)) 2539 return 1; 2540 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2541 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2542 return 1; 2543 if (is_noncanonical_msr_address(data, vcpu)) 2544 return 1; 2545 if (index % 2) 2546 vmx->pt_desc.guest.addr_b[index / 2] = data; 2547 else 2548 vmx->pt_desc.guest.addr_a[index / 2] = data; 2549 break; 2550 case MSR_IA32_S_CET: 2551 vmcs_writel(GUEST_S_CET, data); 2552 break; 2553 case MSR_KVM_INTERNAL_GUEST_SSP: 2554 vmcs_writel(GUEST_SSP, data); 2555 break; 2556 case MSR_IA32_INT_SSP_TAB: 2557 vmcs_writel(GUEST_INTR_SSP_TABLE, data); 2558 break; 2559 case MSR_IA32_PERF_CAPABILITIES: 2560 if (data & PERF_CAP_LBR_FMT) { 2561 if ((data & PERF_CAP_LBR_FMT) != 2562 (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) 2563 return 1; 2564 if (!cpuid_model_is_consistent(vcpu)) 2565 return 1; 2566 } 2567 if (data & PERF_CAP_PEBS_FORMAT) { 2568 if ((data & PERF_CAP_PEBS_MASK) != 2569 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2570 return 1; 2571 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2572 return 1; 2573 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2574 return 1; 2575 if (!cpuid_model_is_consistent(vcpu)) 2576 return 1; 2577 } 2578 ret = kvm_set_msr_common(vcpu, msr_info); 2579 break; 2580 2581 default: 2582 find_uret_msr: 2583 msr = vmx_find_uret_msr(vmx, msr_index); 2584 if (msr) 2585 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2586 else 2587 ret = kvm_set_msr_common(vcpu, msr_info); 2588 } 2589 2590 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2591 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2592 vmx_update_fb_clear_dis(vcpu, vmx); 2593 2594 return ret; 2595 } 2596 2597 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2598 { 2599 unsigned long guest_owned_bits; 2600 2601 kvm_register_mark_available(vcpu, reg); 2602 2603 switch (reg) { 2604 case VCPU_REGS_RSP: 2605 vcpu->arch.regs[VCPU_REGS_RSP] = 
vmcs_readl(GUEST_RSP); 2606 break; 2607 case VCPU_REGS_RIP: 2608 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2609 break; 2610 case VCPU_EXREG_PDPTR: 2611 if (enable_ept) 2612 ept_save_pdptrs(vcpu); 2613 break; 2614 case VCPU_EXREG_CR0: 2615 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2616 2617 vcpu->arch.cr0 &= ~guest_owned_bits; 2618 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2619 break; 2620 case VCPU_EXREG_CR3: 2621 /* 2622 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2623 * CR3 is loaded into hardware, not the guest's CR3. 2624 */ 2625 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2626 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2627 break; 2628 case VCPU_EXREG_CR4: 2629 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2630 2631 vcpu->arch.cr4 &= ~guest_owned_bits; 2632 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2633 break; 2634 default: 2635 KVM_BUG_ON(1, vcpu->kvm); 2636 break; 2637 } 2638 } 2639 2640 /* 2641 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2642 * directly instead of going through cpu_has(), to ensure KVM is trapping 2643 * ENCLS whenever it's supported in hardware. It does not matter whether 2644 * the host OS supports or has enabled SGX. 2645 */ 2646 static bool cpu_has_sgx(void) 2647 { 2648 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2649 } 2650 2651 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2652 { 2653 u32 vmx_msr_low, vmx_msr_high; 2654 u32 ctl = ctl_min | ctl_opt; 2655 2656 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2657 2658 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2659 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2660 2661 /* Ensure minimum (required) set of control bits are supported. 
*/ 2662 if (ctl_min & ~ctl) 2663 return -EIO; 2664 2665 *result = ctl; 2666 return 0; 2667 } 2668 2669 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2670 { 2671 u64 allowed; 2672 2673 rdmsrq(msr, allowed); 2674 2675 return ctl_opt & allowed; 2676 } 2677 2678 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ 2679 ({ \ 2680 int i, r = 0; \ 2681 \ 2682 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ 2683 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ 2684 \ 2685 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ 2686 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ 2687 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ 2688 \ 2689 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ 2690 continue; \ 2691 \ 2692 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ 2693 "entry = %llx (%llx), exit = %llx (%llx)\n", \ 2694 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ 2695 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ 2696 \ 2697 if (error_on_inconsistent_vmcs_config) \ 2698 r = -EIO; \ 2699 \ 2700 entry_controls &= ~n_ctrl; \ 2701 exit_controls &= ~x_ctrl; \ 2702 } \ 2703 r; \ 2704 }) 2705 2706 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2707 struct vmx_capability *vmx_cap) 2708 { 2709 u32 _pin_based_exec_control = 0; 2710 u32 _cpu_based_exec_control = 0; 2711 u32 _cpu_based_2nd_exec_control = 0; 2712 u64 _cpu_based_3rd_exec_control = 0; 2713 u32 _vmexit_control = 0; 2714 u32 _vmentry_control = 0; 2715 u64 basic_msr; 2716 u64 misc_msr; 2717 2718 /* 2719 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2720 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2721 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
2722 */ 2723 struct { 2724 u32 entry_control; 2725 u32 exit_control; 2726 } const vmcs_entry_exit_pairs[] = { 2727 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2728 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2729 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2730 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2731 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2732 { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, 2733 }; 2734 2735 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2736 2737 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2738 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2739 MSR_IA32_VMX_PROCBASED_CTLS, 2740 &_cpu_based_exec_control)) 2741 return -EIO; 2742 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2743 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2744 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2745 MSR_IA32_VMX_PROCBASED_CTLS2, 2746 &_cpu_based_2nd_exec_control)) 2747 return -EIO; 2748 } 2749 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2750 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2751 2752 #ifndef CONFIG_X86_64 2753 if (!(_cpu_based_2nd_exec_control & 2754 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2755 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2756 #endif 2757 2758 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2759 _cpu_based_2nd_exec_control &= ~( 2760 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2761 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2762 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2763 2764 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2765 &vmx_cap->ept, &vmx_cap->vpid); 2766 2767 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2768 vmx_cap->ept) { 2769 pr_warn_once("EPT CAP should not exist if not support " 2770 "1-setting enable EPT VM-execution control\n"); 2771 2772 if (error_on_inconsistent_vmcs_config) 2773 return -EIO; 2774 2775 vmx_cap->ept = 0; 2776 
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2777 } 2778 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2779 vmx_cap->vpid) { 2780 pr_warn_once("VPID CAP should not exist if not support " 2781 "1-setting enable VPID VM-execution control\n"); 2782 2783 if (error_on_inconsistent_vmcs_config) 2784 return -EIO; 2785 2786 vmx_cap->vpid = 0; 2787 } 2788 2789 if (!cpu_has_sgx()) 2790 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2791 2792 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2793 _cpu_based_3rd_exec_control = 2794 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2795 MSR_IA32_VMX_PROCBASED_CTLS3); 2796 2797 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2798 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2799 MSR_IA32_VMX_EXIT_CTLS, 2800 &_vmexit_control)) 2801 return -EIO; 2802 2803 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2804 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2805 MSR_IA32_VMX_PINBASED_CTLS, 2806 &_pin_based_exec_control)) 2807 return -EIO; 2808 2809 if (cpu_has_broken_vmx_preemption_timer()) 2810 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2811 if (!(_cpu_based_2nd_exec_control & 2812 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2813 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2814 2815 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2816 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2817 MSR_IA32_VMX_ENTRY_CTLS, 2818 &_vmentry_control)) 2819 return -EIO; 2820 2821 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, 2822 _vmentry_control, _vmexit_control)) 2823 return -EIO; 2824 2825 /* 2826 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2827 * can't be used due to an errata where VM Exit may incorrectly clear 2828 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2829 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 
2830 */ 2831 switch (boot_cpu_data.x86_vfm) { 2832 case INTEL_NEHALEM_EP: /* AAK155 */ 2833 case INTEL_NEHALEM: /* AAP115 */ 2834 case INTEL_WESTMERE: /* AAT100 */ 2835 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2836 case INTEL_NEHALEM_EX: /* BA97 */ 2837 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2838 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2839 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2840 "does not work properly. Using workaround\n"); 2841 break; 2842 default: 2843 break; 2844 } 2845 2846 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 2847 2848 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2849 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2850 return -EIO; 2851 2852 #ifdef CONFIG_X86_64 2853 /* 2854 * KVM expects to be able to shove all legal physical addresses into 2855 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2856 * 0 for processors that support Intel 64 architecture". 2857 */ 2858 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2859 return -EIO; 2860 #endif 2861 2862 /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ 2863 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2864 return -EIO; 2865 2866 rdmsrq(MSR_IA32_VMX_MISC, misc_msr); 2867 2868 vmcs_conf->basic = basic_msr; 2869 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2870 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2871 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2872 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2873 vmcs_conf->vmexit_ctrl = _vmexit_control; 2874 vmcs_conf->vmentry_ctrl = _vmentry_control; 2875 vmcs_conf->misc = misc_msr; 2876 2877 #if IS_ENABLED(CONFIG_HYPERV) 2878 if (enlightened_vmcs) 2879 evmcs_sanitize_exec_ctrls(vmcs_conf); 2880 #endif 2881 2882 return 0; 2883 } 2884 2885 static bool __kvm_is_vmx_supported(void) 2886 { 2887 int cpu = smp_processor_id(); 2888 2889 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2890 pr_err("VMX not supported by CPU %d\n", cpu); 2891 return false; 2892 } 2893 2894 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL)) { 2895 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2896 return false; 2897 } 2898 2899 if (!this_cpu_has(X86_FEATURE_VMX)) { 2900 pr_err("VMX not fully enabled on CPU %d. 
Check kernel logs and/or BIOS\n", cpu); 2901 return false; 2902 } 2903 2904 return true; 2905 } 2906 2907 static bool kvm_is_vmx_supported(void) 2908 { 2909 bool supported; 2910 2911 migrate_disable(); 2912 supported = __kvm_is_vmx_supported(); 2913 migrate_enable(); 2914 2915 return supported; 2916 } 2917 2918 int vmx_check_processor_compat(void) 2919 { 2920 int cpu = raw_smp_processor_id(); 2921 struct vmcs_config vmcs_conf; 2922 struct vmx_capability vmx_cap; 2923 2924 if (!__kvm_is_vmx_supported()) 2925 return -EIO; 2926 2927 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2928 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2929 return -EIO; 2930 } 2931 if (nested) 2932 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2933 2934 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2935 u32 *gold = (void *)&vmcs_config; 2936 u32 *mine = (void *)&vmcs_conf; 2937 int i; 2938 2939 BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32)); 2940 2941 pr_err("VMCS config on CPU %d doesn't match reference config:", cpu); 2942 for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) { 2943 if (gold[i] == mine[i]) 2944 continue; 2945 2946 pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x", 2947 i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]); 2948 } 2949 pr_cont("\n"); 2950 return -EIO; 2951 } 2952 return 0; 2953 } 2954 2955 int vmx_enable_virtualization_cpu(void) 2956 { 2957 int cpu = raw_smp_processor_id(); 2958 2959 /* 2960 * This can happen if we hot-added a CPU but failed to allocate 2961 * VP assist page for it. 
2962 */ 2963 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2964 return -EFAULT; 2965 2966 return x86_virt_get_ref(X86_FEATURE_VMX); 2967 } 2968 2969 static void vmclear_local_loaded_vmcss(void) 2970 { 2971 int cpu = raw_smp_processor_id(); 2972 struct loaded_vmcs *v, *n; 2973 2974 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2975 loaded_vmcss_on_cpu_link) 2976 __loaded_vmcs_clear(v); 2977 } 2978 2979 void vmx_disable_virtualization_cpu(void) 2980 { 2981 vmclear_local_loaded_vmcss(); 2982 2983 x86_virt_put_ref(X86_FEATURE_VMX); 2984 2985 hv_reset_evmcs(); 2986 } 2987 2988 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2989 { 2990 int node = cpu_to_node(cpu); 2991 struct page *pages; 2992 struct vmcs *vmcs; 2993 2994 pages = __alloc_pages_node(node, flags, 0); 2995 if (!pages) 2996 return NULL; 2997 vmcs = page_address(pages); 2998 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2999 3000 /* KVM supports Enlightened VMCS v1 only */ 3001 if (kvm_is_using_evmcs()) 3002 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 3003 else 3004 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3005 3006 if (shadow) 3007 vmcs->hdr.shadow_vmcs = 1; 3008 return vmcs; 3009 } 3010 3011 void free_vmcs(struct vmcs *vmcs) 3012 { 3013 free_page((unsigned long)vmcs); 3014 } 3015 3016 /* 3017 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3018 */ 3019 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3020 { 3021 if (!loaded_vmcs->vmcs) 3022 return; 3023 loaded_vmcs_clear(loaded_vmcs); 3024 free_vmcs(loaded_vmcs->vmcs); 3025 loaded_vmcs->vmcs = NULL; 3026 if (loaded_vmcs->msr_bitmap) 3027 free_page((unsigned long)loaded_vmcs->msr_bitmap); 3028 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 3029 } 3030 3031 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3032 { 3033 loaded_vmcs->vmcs = alloc_vmcs(false); 3034 if (!loaded_vmcs->vmcs) 3035 return -ENOMEM; 3036 3037 
	vmcs_clear(loaded_vmcs->vmcs);

	loaded_vmcs->shadow_vmcs = NULL;
	loaded_vmcs->hv_timer_soft_disabled = false;
	/* cpu == -1 means "not currently loaded on any physical CPU". */
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;

	if (cpu_has_vmx_msr_bitmap()) {
		loaded_vmcs->msr_bitmap = (unsigned long *)
				__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!loaded_vmcs->msr_bitmap)
			goto out_vmcs;
		/* All bits set => intercept every MSR by default. */
		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
	}

	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
	memset(&loaded_vmcs->controls_shadow, 0,
	       sizeof(struct vmcs_controls_shadow));

	return 0;

out_vmcs:
	free_loaded_vmcs(loaded_vmcs);
	return -ENOMEM;
}

/*
 * Sanitize a segment register that was cached while the vCPU was in VM86
 * Real Mode emulation before it is loaded for protected mode.
 */
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
			  struct kvm_segment *save)
{
	if (!emulate_invalid_guest_state) {
		/*
		 * CS and SS RPL should be equal during guest entry according
		 * to VMX spec, but in reality it is not always so. Since vcpu
		 * is in the middle of the transition from real mode to
		 * protected mode it is safe to assume that RPL 0 is a good
		 * default value.
		 */
		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
			save->selector &= ~SEGMENT_RPL_MASK;
		save->dpl = save->selector & SEGMENT_RPL_MASK;
		save->s = 1;
	}
	__vmx_set_segment(vcpu, save, seg);
}

/* Leave VM86-based Real Mode emulation and return to protected mode. */
static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Update real mode segment cache. It may be not up-to-date if segment
	 * register was written while vcpu was in a guest mode.
	 */
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 0;

	__vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

	/* Restore the RFLAGS bits that were saved when Real Mode was entered. */
	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
	vmcs_writel(GUEST_RFLAGS, flags);

	/* Restore the guest's view of CR4.VME from the read shadow. */
	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	vmx_update_exception_bitmap(vcpu);

	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}

/*
 * Convert a cached segment into the form VM86 requires: DPL 3 and, unless
 * invalid guest state is emulated, selector = base >> 4 with a 64KiB limit.
 */
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	struct kvm_segment var = *save;

	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;

	if (!emulate_invalid_guest_state) {
		var.selector = var.base >> 4;
		var.base = var.base & 0xffff0;
		var.limit = 0xffff;
		var.g = 0;
		var.db = 0;
		var.present = 1;
		var.s = 1;
		var.l = 0;
		var.unusable = 0;
		var.type = 0x3;
		var.avl = 0;
		if (save->base & 0xf)
			pr_warn_once("segment base is not paragraph aligned "
				     "when entering protected mode (seg=%d)", seg);
	}

	vmcs_write16(sf->selector, var.selector);
	vmcs_writel(sf->base, var.base);
	vmcs_write32(sf->limit, var.limit);
	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}

/* Enter VM86-based Real Mode emulation (no unrestricted guest support). */
static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);

	/*
	 * KVM should never use VM86 to virtualize Real Mode when L2 is active,
	 * as using VM86 is unnecessary if unrestricted guest is enabled, and
	 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
	 * should VM-Fail and KVM should reject userspace attempts to stuff
	 * CR0.PG=0 when L2 is active.
	 */
	WARN_ON_ONCE(is_guest_mode(vcpu));

	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 1;

	vmx_segment_cache_clear(vmx);

	/* Point TR at the VM86 TSS that lives at tss_addr. */
	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	/* Save guest RFLAGS; IOPL and VM are forced while in VM86. */
	flags = vmcs_readl(GUEST_RFLAGS);
	vmx->rmode.save_rflags = flags;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
	vmx_update_exception_bitmap(vcpu);

	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
}

/*
 * Update the guest's EFER and the corresponding VM-Entry "IA-32e mode guest"
 * control.  Returns 0 on success, 1 if EFER.LMA is set on a 32-bit host.
 */
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Nothing to do if hardware doesn't support EFER. */
	if (!vmx_find_uret_msr(vmx, MSR_EFER))
		return 0;

	vcpu->arch.efer = efer;
#ifdef CONFIG_X86_64
	/* Keep the VM-Entry long-mode control in sync with EFER.LMA. */
	if (efer & EFER_LMA)
		vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
	else
		vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
#else
	if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
		return 1;
#endif

	vmx_setup_uret_msrs(vmx);
	return 0;
}

#ifdef CONFIG_X86_64

/* Switch the guest into long mode; TR must be a busy 64-bit TSS. */
static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	vmx_segment_cache_clear(to_vmx(vcpu));

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
				     __func__);
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
			     | VMX_AR_TYPE_BUSY_64_TSS);
	}
	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}

/* Leave long mode by clearing EFER.LMA. */
static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}

#endif

void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
	 * the CPU is not required to invalidate guest-physical mappings on
	 * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
	 * associated with the root EPT structure and not any particular VPID
	 * (INVVPID also isn't required to invalidate guest-physical mappings).
	 */
	if (enable_ept) {
		ept_sync_global();
	} else if (enable_vpid) {
		if (cpu_has_vmx_invvpid_global()) {
			vpid_sync_vcpu_global();
		} else {
			/* No global INVVPID; flush L1's and L2's VPIDs singly. */
			vpid_sync_vcpu_single(vmx->vpid);
			vpid_sync_vcpu_single(vmx->nested.vpid02);
		}
	}
}

/* Return the VPID currently in use: vpid02 when L2 is active with VPID. */
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
		return nested_get_vpid02(vcpu);
	return to_vmx(vcpu)->vpid;
}

/* Build an EPT pointer (root HPA, WB memtype, page-walk level, A/D bits). */
static u64 construct_eptp(hpa_t root_hpa)
{
	u64 eptp = root_hpa | VMX_EPTP_MT_WB;
	struct kvm_mmu_page *root;

	if (kvm_mmu_is_dummy_root(root_hpa))
		return eptp | VMX_EPTP_PWL_4;

	/*
	 * EPT roots should always have an associated MMU page.  Return a "bad"
	 * EPTP to induce VM-Fail instead of continuing on in a unknown state.
	 */
	root = root_to_sp(root_hpa);
	if (WARN_ON_ONCE(!root))
		return INVALID_PAGE;

	eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (enable_ept_ad_bits && !root->role.ad_disabled)
		eptp |= VMX_EPTP_AD_ENABLE_BIT;

	return eptp;
}

/* INVEPT the context for @root_hpa; fall back to a global flush on bad EPTP. */
static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
{
	u64 eptp = construct_eptp(root_hpa);

	if (VALID_PAGE(eptp))
		ept_sync_context(eptp);
	else
		ept_sync_global();
}

void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	u64 root_hpa = mmu->root.hpa;

	/* No flush required if the current context is invalid.
*/ 3319 if (!VALID_PAGE(root_hpa)) 3320 return; 3321 3322 if (enable_ept) 3323 vmx_flush_tlb_ept_root(root_hpa); 3324 else 3325 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3326 } 3327 3328 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3329 { 3330 /* 3331 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3332 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3333 */ 3334 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3335 } 3336 3337 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3338 { 3339 /* 3340 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3341 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3342 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3343 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3344 * i.e. no explicit INVVPID is necessary. 3345 */ 3346 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3347 } 3348 3349 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3350 { 3351 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3352 3353 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3354 return; 3355 3356 if (is_pae_paging(vcpu)) { 3357 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3358 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3359 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3360 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3361 } 3362 } 3363 3364 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3365 { 3366 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3367 3368 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3369 return; 3370 3371 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3372 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3373 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3374 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3375 3376 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3377 } 3378 3379 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3380 CPU_BASED_CR3_STORE_EXITING) 3381 3382 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned 
long cr0)
{
	if (is_guest_mode(vcpu))
		return nested_guest_cr0_valid(vcpu, cr0);

	if (to_vmx(vcpu)->nested.vmxon)
		return nested_host_cr0_valid(vcpu, cr0);

	return true;
}

/*
 * Load guest CR0: compute the hardware CR0 (forcing KVM-owned bits), handle
 * real<->protected and long-mode transitions, and keep CR3 interception in
 * sync when running with EPT but without unrestricted guest.
 */
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr0, old_cr0_pg;
	u32 tmp;

	old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);

	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
	if (enable_unrestricted_guest)
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
	else {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
		if (!enable_ept)
			hw_cr0 |= X86_CR0_WP;

		/* Toggle VM86 Real Mode emulation on CR0.PE transitions. */
		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
			enter_pmode(vcpu);

		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
			enter_rmode(vcpu);
	}

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, hw_cr0);
	vcpu->arch.cr0 = cr0;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME) {
		if (!old_cr0_pg && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	if (enable_ept && !enable_unrestricted_guest) {
		/*
		 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
		 * KVM's CR3 is installed.
		 */
		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);

		/*
		 * When running with EPT but not unrestricted guest, KVM must
		 * intercept CR3 accesses when paging is _disabled_.  This is
		 * necessary because restricted guests can't actually run with
		 * paging disabled, and so KVM stuffs its own CR3 in order to
		 * run the guest when identity mapped page tables.
		 *
		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
		 * update, it may be stale with respect to CR3 interception,
		 * e.g. after nested VM-Enter.
		 *
		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
		 * stores to forward them to L1, even if KVM does not need to
		 * intercept them to preserve its identity mapped page tables.
		 */
		if (!(cr0 & X86_CR0_PG)) {
			exec_controls_setbit(vmx, CR3_EXITING_BITS);
		} else if (!is_guest_mode(vcpu)) {
			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
		} else {
			tmp = exec_controls_get(vmx);
			tmp &= ~CR3_EXITING_BITS;
			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
			exec_controls_set(vmx, tmp);
		}

		/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
		if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

		/*
		 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
		 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
		 */
		if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
			kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
	}

	/* depends on vcpu->arch.cr0 to be set to a new value */
	vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}

/* Maximum EPT page-walk level supported by the CPU (5 or 4). */
static int vmx_get_max_ept_level(void)
{
	if (cpu_has_vmx_ept_5levels())
		return 5;
	return 4;
}

/* Load the MMU root: EPT pointer when EPT is on, otherwise guest CR3. */
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;

	if (enable_ept) {
		KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
				root_level != root_to_sp(root_hpa)->role.level);
		vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));

		hv_track_root_tdp(vcpu, root_hpa);

		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs.GUEST_CR3 is already up-to-date. */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);
	} else {
		guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
			    kvm_get_active_cr3_lam_bits(vcpu);
	}

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}

bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	/*
	 * We operate under the default treatment of SMM, so VMX cannot be
	 * enabled under SMM.  Note, whether or not VMXE is allowed at all,
	 * i.e. is a reserved bit, is handled by common x86 code.
	 */
	if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
		return false;

	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
		return false;

	return true;
}

/*
 * Load guest CR4: compute the hardware CR4 (forced bits, UMIP emulation,
 * non-paging adjustments when !unrestricted guest) and the read shadow.
 */
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr4;

	/*
	 * Pass through host's Machine Check Enable value to hw_cr4, which
	 * is in force while we are in guest mode.  Do not let guests control
	 * this bit, even if host CR4.MCE == 0.
	 */
	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
	if (enable_unrestricted_guest)
		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
	else if (vmx->rmode.vm86_active)
		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
	else
		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

	/* Emulate UMIP via descriptor-table exiting when hardware lacks it. */
	if (vmx_umip_emulated()) {
		if (cr4 & X86_CR4_UMIP) {
			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
			hw_cr4 &= ~X86_CR4_UMIP;
		} else if (!is_guest_mode(vcpu) ||
			   !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
		}
	}

	vcpu->arch.cr4 = cr4;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

	if (!enable_unrestricted_guest) {
		if (enable_ept) {
			if (!is_paging(vcpu)) {
				hw_cr4 &= ~X86_CR4_PAE;
				hw_cr4 |= X86_CR4_PSE;
			} else if (!(cr4 & X86_CR4_PAE)) {
				hw_cr4 &= ~X86_CR4_PAE;
			}
		}

		/*
		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
		 * to be manually disabled when guest switches to non-paging
		 * mode.
		 *
		 * If !enable_unrestricted_guest, the CPU is always running
		 * with CR0.PG=1 and CR4 needs to be modified.
		 * If enable_unrestricted_guest, the CPU automatically
		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
		 */
		if (!is_paging(vcpu))
			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}

	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);

	/* OSXSAVE/PKE changes are reflected in guest-visible CPUID bits. */
	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		vcpu->arch.cpuid_dynamic_bits_dirty = true;
}

/* Read a guest segment register, preferring the VM86 cache when active. */
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 ar;

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		*var = vmx->rmode.segs[seg];
		if (seg == VCPU_SREG_TR
		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
			return;
		var->base = vmx_read_guest_seg_base(vmx, seg);
		var->selector = vmx_read_guest_seg_selector(vmx, seg);
		return;
	}
	var->base = vmx_read_guest_seg_base(vmx, seg);
	var->limit = vmx_read_guest_seg_limit(vmx, seg);
	var->selector = vmx_read_guest_seg_selector(vmx, seg);
	ar = vmx_read_guest_seg_ar(vmx, seg);
	var->unusable = (ar >> 16) & 1;
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
	/*
	 * Some userspaces do not preserve unusable property. Since usable
	 * segment has to be present according to VMX spec we can use present
	 * property to amend userspace bug by making unusable segment always
	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
	 * segment as unusable.
3626 */ 3627 var->present = !var->unusable; 3628 var->avl = (ar >> 12) & 1; 3629 var->l = (ar >> 13) & 1; 3630 var->db = (ar >> 14) & 1; 3631 var->g = (ar >> 15) & 1; 3632 } 3633 3634 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3635 { 3636 struct kvm_segment s; 3637 3638 if (to_vmx(vcpu)->rmode.vm86_active) { 3639 vmx_get_segment(vcpu, &s, seg); 3640 return s.base; 3641 } 3642 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3643 } 3644 3645 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3646 { 3647 struct vcpu_vmx *vmx = to_vmx(vcpu); 3648 int ar; 3649 3650 if (unlikely(vmx->rmode.vm86_active)) 3651 return 0; 3652 3653 if (no_cache) 3654 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3655 else 3656 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3657 return VMX_AR_DPL(ar); 3658 } 3659 3660 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3661 { 3662 return __vmx_get_cpl(vcpu, false); 3663 } 3664 3665 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3666 { 3667 return __vmx_get_cpl(vcpu, true); 3668 } 3669 3670 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3671 { 3672 u32 ar; 3673 3674 ar = var->type & 15; 3675 ar |= (var->s & 1) << 4; 3676 ar |= (var->dpl & 3) << 5; 3677 ar |= (var->present & 1) << 7; 3678 ar |= (var->avl & 1) << 12; 3679 ar |= (var->l & 1) << 13; 3680 ar |= (var->db & 1) << 14; 3681 ar |= (var->g & 1) << 15; 3682 ar |= (var->unusable || !var->present) << 16; 3683 3684 return ar; 3685 } 3686 3687 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3688 { 3689 struct vcpu_vmx *vmx = to_vmx(vcpu); 3690 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3691 3692 vmx_segment_cache_clear(vmx); 3693 3694 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3695 vmx->rmode.segs[seg] = *var; 3696 if (seg == VCPU_SREG_TR) 3697 vmcs_write16(sf->selector, var->selector); 3698 else if (var->s) 3699 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3700 return; 3701 } 3702 3703 
vmcs_writel(sf->base, var->base); 3704 vmcs_write32(sf->limit, var->limit); 3705 vmcs_write16(sf->selector, var->selector); 3706 3707 /* 3708 * Fix the "Accessed" bit in AR field of segment registers for older 3709 * qemu binaries. 3710 * IA32 arch specifies that at the time of processor reset the 3711 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3712 * is setting it to 0 in the userland code. This causes invalid guest 3713 * state vmexit when "unrestricted guest" mode is turned on. 3714 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3715 * tree. Newer qemu binaries with that qemu fix would not need this 3716 * kvm hack. 3717 */ 3718 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3719 var->type |= 0x1; /* Accessed */ 3720 3721 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3722 } 3723 3724 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3725 { 3726 __vmx_set_segment(vcpu, var, seg); 3727 3728 to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); 3729 } 3730 3731 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3732 { 3733 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3734 3735 *db = (ar >> 14) & 1; 3736 *l = (ar >> 13) & 1; 3737 } 3738 3739 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3740 { 3741 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3742 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3743 } 3744 3745 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3746 { 3747 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3748 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3749 } 3750 3751 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3752 { 3753 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3754 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3755 } 3756 3757 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3758 { 3759 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3760 vmcs_writel(GUEST_GDTR_BASE, 
		    dt->address);
}

/*
 * A VM86-compatible segment: base == selector << 4, 64KiB limit, AR 0xf3
 * (present, DPL 3, read/write data or the forced CS type).
 */
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment var;
	u32 ar;

	vmx_get_segment(vcpu, &var, seg);
	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;
	ar = vmx_segment_access_rights(&var);

	if (var.base != (var.selector << 4))
		return false;
	if (var.limit != 0xffff)
		return false;
	if (ar != 0xf3)
		return false;

	return true;
}

/* Protected-mode CS checks (type, S bit, DPL vs. RPL, present). */
static bool code_segment_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs;
	unsigned int cs_rpl;

	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
	cs_rpl = cs.selector & SEGMENT_RPL_MASK;

	if (cs.unusable)
		return false;
	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
		return false;
	if (!cs.s)
		return false;
	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
		if (cs.dpl > cs_rpl)
			return false;
	} else {
		if (cs.dpl != cs_rpl)
			return false;
	}
	if (!cs.present)
		return false;

	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
	return true;
}

/* Protected-mode SS checks; an unusable SS is unconditionally OK. */
static bool stack_segment_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment ss;
	unsigned int ss_rpl;

	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
	ss_rpl = ss.selector & SEGMENT_RPL_MASK;

	if (ss.unusable)
		return true;
	if (ss.type != 3 && ss.type != 7)
		return false;
	if (!ss.s)
		return false;
	if (ss.dpl != ss_rpl) /* DPL != RPL */
		return false;
	if (!ss.present)
		return false;

	return true;
}

/* Protected-mode DS/ES/FS/GS checks; an unusable segment is OK. */
static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_segment var;
	unsigned int rpl;

	vmx_get_segment(vcpu, &var, seg);
	rpl = var.selector & SEGMENT_RPL_MASK;

	if (var.unusable)
		return true;
	if (!var.s)
		return false;
	if (!var.present)
		return false;
	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
		if (var.dpl < rpl) /* DPL < RPL */
			return false;
	}

	/* TODO: Add other members to kvm_segment_field to allow checking for other access
	 * rights flags
	 */
	return true;
}

/* TR must be present, in the GDT, and a (busy) TSS type. */
static bool tr_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment tr;

	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);

	if (tr.unusable)
		return false;
	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
		return false;
	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
		return false;
	if (!tr.present)
		return false;

	return true;
}

/* LDTR must be an LDT descriptor in the GDT; unusable LDTR is OK. */
static bool ldtr_valid(struct kvm_vcpu *vcpu)
{
	struct kvm_segment ldtr;

	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);

	if (ldtr.unusable)
		return true;
	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
		return false;
	if (ldtr.type != 2)
		return false;
	if (!ldtr.present)
		return false;

	return true;
}

/* CS.RPL must equal SS.RPL for VM-Entry's protected-mode checks. */
static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs, ss;

	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);

	return ((cs.selector & SEGMENT_RPL_MASK) ==
		 (ss.selector & SEGMENT_RPL_MASK));
}

/*
 * Check if guest state is valid. Returns true if valid, false if
 * not.
3909 * We assume that registers are always usable 3910 */ 3911 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3912 { 3913 /* real mode guest state checks */ 3914 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3915 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3916 return false; 3917 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3918 return false; 3919 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3920 return false; 3921 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3922 return false; 3923 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3924 return false; 3925 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3926 return false; 3927 } else { 3928 /* protected mode guest state checks */ 3929 if (!cs_ss_rpl_check(vcpu)) 3930 return false; 3931 if (!code_segment_valid(vcpu)) 3932 return false; 3933 if (!stack_segment_valid(vcpu)) 3934 return false; 3935 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3936 return false; 3937 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3938 return false; 3939 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3940 return false; 3941 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3942 return false; 3943 if (!tr_valid(vcpu)) 3944 return false; 3945 if (!ldtr_valid(vcpu)) 3946 return false; 3947 } 3948 /* TODO: 3949 * - Add checks on RIP 3950 * - Add checks on RFLAGS 3951 */ 3952 3953 return true; 3954 } 3955 3956 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3957 { 3958 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3959 u16 data; 3960 int i; 3961 3962 for (i = 0; i < 3; i++) { 3963 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3964 return -EFAULT; 3965 } 3966 3967 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3968 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3969 return -EFAULT; 3970 3971 data = ~0; 3972 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3973 return -EFAULT; 3974 3975 return 0; 3976 } 3977 3978 static int 
init_rmode_identity_map(struct kvm *kvm)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
	int i, r = 0;
	void __user *uaddr;
	u32 tmp;

	/* Protect kvm_vmx->ept_identity_pagetable_done. */
	mutex_lock(&kvm->slots_lock);

	if (likely(kvm_vmx->ept_identity_pagetable_done))
		goto out;

	if (!kvm_vmx->ept_identity_map_addr)
		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;

	uaddr = __x86_set_memory_region(kvm,
					IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
					kvm_vmx->ept_identity_map_addr,
					PAGE_SIZE);
	if (IS_ERR(uaddr)) {
		r = PTR_ERR(uaddr);
		goto out;
	}

	/* Set up identity-mapping pagetable for EPT in real mode */
	for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
		/* One 4MiB PSE entry per slot: identity mapped, writable. */
		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
		if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
			r = -EFAULT;
			goto out;
		}
	}
	kvm_vmx->ept_identity_pagetable_done = true;

out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

/* Program a segment register with real-mode reset-style defaults. */
static void seg_setup(int seg)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	unsigned int ar;

	vmcs_write16(sf->selector, 0);
	vmcs_writel(sf->base, 0);
	vmcs_write32(sf->limit, 0xffff);
	ar = 0x93;
	if (seg == VCPU_SREG_CS)
		ar |= 0x08; /* code segment */

	vmcs_write32(sf->ar_bytes, ar);
}

/* Allocate a VPID from the global bitmap; 0 means "no VPID available". */
int allocate_vpid(void)
{
	int vpid;

	if (!enable_vpid)
		return 0;
	spin_lock(&vmx_vpid_lock);
	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
	if (vpid < VMX_NR_VPIDS)
		__set_bit(vpid, vmx_vpid_bitmap);
	else
		vpid = 0;
	spin_unlock(&vmx_vpid_lock);
	return vpid;
}

/* Return a VPID to the global bitmap (vpid 0 is never allocated). */
void free_vpid(int vpid)
{
	if (!enable_vpid || vpid == 0)
		return;
	spin_lock(&vmx_vpid_lock);
	__clear_bit(vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
}

static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
	/*
	 * When KVM is a nested hypervisor on top of Hyper-V and uses
	 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
	 * bitmap has changed.
	 */
	if (kvm_is_using_evmcs()) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		if (evmcs->hv_enlightenments_control.msr_bitmap)
			evmcs->hv_clean_fields &=
				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
	}

	vmx->nested.force_msr_bitmap_recalc = true;
}

/* Set or clear read/write intercepts for @msr in the vmcs01 MSR bitmap. */
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	vmx_msr_bitmap_l01_changed(vmx);

	/* Userspace MSR filters can force interception even when !set. */
	if (type & MSR_TYPE_R) {
		if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			vmx_clear_msr_bitmap_read(msr_bitmap, msr);
		else
			vmx_set_msr_bitmap_read(msr_bitmap, msr);
	}

	if (type & MSR_TYPE_W) {
		if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			vmx_clear_msr_bitmap_write(msr_bitmap, msr);
		else
			vmx_set_msr_bitmap_write(msr_bitmap, msr);
	}
}

static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
	/*
	 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
	 * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
	 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
	 */
	const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
	const int write_idx = read_idx + (0x800 / sizeof(u64));
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
	u8 mode;

	if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
		return;

	/* Derive the desired bitmap mode from the current exec controls. */
	if (cpu_has_secondary_exec_ctrls() &&
	    (secondary_exec_controls_get(vmx) &
	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		mode = MSR_BITMAP_MODE_X2APIC;
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	} else {
		mode = 0;
	}

	/* Nothing to do if the x2APIC bitmap mode is unchanged. */
	if (mode == vmx->x2apic_msr_bitmap_mode)
		return;

	vmx->x2apic_msr_bitmap_mode = mode;

	/*
	 * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
	 * registers (0x840 and above) intercepted, KVM doesn't support them.
	 * Intercept all writes by default and poke holes as needed.  Pass
	 * through reads for all valid registers by default in x2APIC+APICv
	 * mode, only the current timer count needs on-demand emulation by KVM.
	 */
	if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
		msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
	else
		msr_bitmap[read_idx] = ~0ull;
	msr_bitmap[write_idx] = ~0ull;

	/*
	 * TPR reads and writes can be virtualized even if virtual interrupt
	 * delivery is not in use.
	 */
	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
				  !(mode & MSR_BITMAP_MODE_X2APIC));

	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
		vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
		if (enable_ipiv)
			vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
	}
}

/* Intercept PT MSRs while guest tracing is off (RTIT_CTL.TraceEn clear). */
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
	u32 i;

	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
		/* Each address range has an A (start) and B (end) MSR pair. */
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
	}
}

static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
{
	u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
				    VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
	bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intercept = !has_mediated_pmu;
	int i;

	if (!enable_mediated_pmu)
		return;

	/* Without VM-Exit save support, save PERF_GLOBAL_CTRL via autostore. */
	if (!cpu_has_save_perf_global_ctrl()) {
		vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;

		if (has_mediated_pmu)
			vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
		else
			vmx_remove_autostore_msr(vmx,
MSR_CORE_PERF_GLOBAL_CTRL); 4198 } 4199 4200 vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 4201 has_mediated_pmu); 4202 4203 vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); 4204 4205 for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 4206 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4207 MSR_TYPE_RW, intercept); 4208 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, 4209 intercept || !fw_writes_is_enabled(vcpu)); 4210 } 4211 for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { 4212 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4213 MSR_TYPE_RW, true); 4214 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, 4215 MSR_TYPE_RW, true); 4216 } 4217 4218 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 4219 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4220 MSR_TYPE_RW, intercept); 4221 for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) 4222 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4223 MSR_TYPE_RW, true); 4224 4225 intercept = kvm_need_perf_global_ctrl_intercept(vcpu); 4226 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, 4227 MSR_TYPE_RW, intercept); 4228 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4229 MSR_TYPE_RW, intercept); 4230 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 4231 MSR_TYPE_RW, intercept); 4232 } 4233 4234 static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4235 { 4236 bool intercept; 4237 4238 if (!cpu_has_vmx_msr_bitmap()) 4239 return; 4240 4241 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 4242 #ifdef CONFIG_X86_64 4243 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 4244 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 4245 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 4246 #endif 4247 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 4248 vmx_disable_intercept_for_msr(vcpu, 
MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 4249 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 4250 if (kvm_cstate_in_guest(vcpu->kvm)) { 4251 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 4252 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 4253 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 4254 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 4255 } 4256 if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 4257 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 4258 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 4259 } 4260 4261 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4262 if (vmx_pt_mode_is_host_guest()) 4263 pt_update_intercept_for_msr(vcpu); 4264 4265 if (vcpu->arch.xfd_no_write_intercept) 4266 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); 4267 4268 vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 4269 !to_vmx(vcpu)->spec_ctrl); 4270 4271 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 4272 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 4273 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 4274 4275 if (cpu_feature_enabled(X86_FEATURE_IBPB)) 4276 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 4277 !guest_has_pred_cmd_msr(vcpu)); 4278 4279 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 4280 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 4281 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4282 4283 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 4284 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4285 4286 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); 4287 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); 4288 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); 4289 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); 4290 } 4291 
4292 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { 4293 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && 4294 !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 4295 4296 vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); 4297 vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); 4298 } 4299 4300 vmx_recalc_pmu_msr_intercepts(vcpu); 4301 4302 /* 4303 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be 4304 * filtered by userspace. 4305 */ 4306 } 4307 4308 static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 4309 { 4310 exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING, 4311 kvm_need_rdpmc_intercept(vcpu)); 4312 } 4313 4314 void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) 4315 { 4316 vmx_recalc_instruction_intercepts(vcpu); 4317 vmx_recalc_msr_intercepts(vcpu); 4318 } 4319 4320 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4321 int vector) 4322 { 4323 struct vcpu_vmx *vmx = to_vmx(vcpu); 4324 4325 /* 4326 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4327 * and freed, and must not be accessed outside of vcpu->mutex. The 4328 * vCPU's cached PI NV is valid if and only if posted interrupts 4329 * enabled in its vmcs12, i.e. checking the vector also checks that 4330 * L1 has enabled posted interrupts for L2. 4331 */ 4332 if (is_guest_mode(vcpu) && 4333 vector == vmx->nested.posted_intr_nv) { 4334 /* 4335 * If a posted intr is not recognized by hardware, 4336 * we will accomplish it in the next vmentry. 4337 */ 4338 vmx->nested.pi_pending = true; 4339 kvm_make_request(KVM_REQ_EVENT, vcpu); 4340 4341 /* 4342 * This pairs with the smp_mb_*() after setting vcpu->mode in 4343 * vcpu_enter_guest() to guarantee the vCPU sees the event 4344 * request if triggering a posted interrupt "fails" because 4345 * vcpu->mode != IN_GUEST_MODE. 
The extra barrier is needed as 4346 * the smb_wmb() in kvm_make_request() only ensures everything 4347 * done before making the request is visible when the request 4348 * is visible, it doesn't ensure ordering between the store to 4349 * vcpu->requests and the load from vcpu->mode. 4350 */ 4351 smp_mb__after_atomic(); 4352 4353 /* the PIR and ON have been set by L1. */ 4354 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4355 return 0; 4356 } 4357 return -1; 4358 } 4359 /* 4360 * Send interrupt to vcpu via posted interrupt way. 4361 * 1. If target vcpu is running(non-root mode), send posted interrupt 4362 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4363 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4364 * interrupt from PIR in next vmentry. 4365 */ 4366 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4367 { 4368 struct vcpu_vt *vt = to_vt(vcpu); 4369 int r; 4370 4371 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4372 if (!r) 4373 return 0; 4374 4375 /* Note, this is called iff the local APIC is in-kernel. */ 4376 if (!vcpu->arch.apic->apicv_active) 4377 return -1; 4378 4379 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); 4380 return 0; 4381 } 4382 4383 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4384 int trig_mode, int vector) 4385 { 4386 struct kvm_vcpu *vcpu = apic->vcpu; 4387 4388 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4389 kvm_lapic_set_irr(vector, apic); 4390 kvm_make_request(KVM_REQ_EVENT, vcpu); 4391 kvm_vcpu_kick(vcpu); 4392 } else { 4393 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4394 trig_mode, vector); 4395 } 4396 } 4397 4398 /* 4399 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4400 * will not change in the lifetime of the guest. 4401 * Note that host-state that does change is set elsewhere. 
 * E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
	u32 low32, high32;
	unsigned long tmpl;
	unsigned long cr0, cr3, cr4;

	cr0 = read_cr0();
	WARN_ON(cr0 & X86_CR0_TS);
	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

	/*
	 * Save the most likely value for this task's CR3 in the VMCS.
	 * We can't use __get_current_cr3_fast() because we're not atomic.
	 */
	cr3 = __read_cr3();
	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
	vmx->loaded_vmcs->host_state.cr3 = cr3;

	/* Save the most likely value for this task's CR4 in the VMCS. */
	cr4 = cr4_read_shadow();
	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
	vmx->loaded_vmcs->host_state.cr4 = cr4;

	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	/*
	 * Load null selectors, so we can avoid reloading them in
	 * vmx_prepare_switch_to_host(), in case userspace uses
	 * the null selectors too (the expected case).
	 */
	vmcs_write16(HOST_DS_SELECTOR, 0);
	vmcs_write16(HOST_ES_SELECTOR, 0);
#else
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */

	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);

	/*
	 * SYSENTER is used for 32-bit system calls on either 32-bit or
	 * 64-bit kernels.  It is always zero if neither is allowed, otherwise
	 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
	 * have already done so!).
	 */
	if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
		vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);

	rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
		rdmsr(MSR_IA32_CR_PAT, low32, high32);
		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
	}

	if (cpu_has_load_ia32_efer())
		vmcs_write64(HOST_IA32_EFER, kvm_host.efer);

	/*
	 * Supervisor shadow stack is not enabled on host side, i.e.,
	 * host IA32_S_CET.SHSTK_EN bit is guaranteed to be 0 now, per SDM
	 * description (RDSSP instruction), SSP is not readable in CPL0,
	 * so resetting the two registers to 0s at VM-Exit does no harm
	 * to kernel execution. When execution flow exits to userspace,
	 * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
	 * 3 and 4 for details.
	 */
	if (cpu_has_load_cet_ctrl()) {
		vmcs_writel(HOST_S_CET, kvm_host.s_cet);
		vmcs_writel(HOST_SSP, 0);
		vmcs_writel(HOST_INTR_SSP_TABLE, 0);
	}

	/*
	 * When running a guest with a mediated PMU, guest state is resident in
	 * hardware after VM-Exit.  Zero PERF_GLOBAL_CTRL on exit so that host
	 * activity doesn't bleed into the guest counters.  When running with
	 * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every
	 * entry/exit to merge guest and host PMU usage.
	 */
	if (enable_mediated_pmu)
		vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0);
}

/*
 * Compute and write the CR4 guest/host mask: CR4 bits the guest may own
 * directly vs. bits whose writes must be intercepted by KVM (or by L1 when
 * running a nested guest).
 */
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
					  ~vcpu->arch.cr4_guest_rsvd_bits;
	/* Without EPT, KVM must intercept TLB-flushing and PDPTR-loading bits. */
	if (!enable_ept) {
		vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
		vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
	}
	if (is_guest_mode(&vmx->vcpu))
		vcpu->arch.cr4_guest_owned_bits &=
			~get_vmcs12(vcpu)->cr4_guest_host_mask;
	vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}

/* Compute the pin-based VM-execution controls for the current vCPU state. */
static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;

	if (!enable_vnmi)
		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

	if (!enable_preemption_timer)
		pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;

	return pin_based_exec_ctrl;
}

/* Compute the initial VM-Entry controls written to a fresh VMCS. */
static u32 vmx_get_initial_vmentry_ctrl(void)
{
	u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;

	if (vmx_pt_mode_is_system())
		vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
				  VM_ENTRY_LOAD_IA32_RTIT_CTL);
	/*
	 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
	 */
	vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
			  VM_ENTRY_LOAD_IA32_EFER |
			  VM_ENTRY_IA32E_MODE);

	return vmentry_ctrl;
}

/* Compute the initial VM-Exit controls written to a fresh VMCS. */
static u32 vmx_get_initial_vmexit_ctrl(void)
{
	u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;

	/*
	 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
	 * nested virtualization and thus allowed to be set in vmcs12.
	 */
	vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
			 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);

	if (vmx_pt_mode_is_system())
		vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
				 VM_EXIT_CLEAR_IA32_RTIT_CTL);
	/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
	return vmexit_ctrl &
		~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER |
		  VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL);
}

/*
 * Refresh the APICv-related execution controls (and the x2APIC MSR bitmap)
 * after APICv is activated or inhibited for the vCPU.  Operates on vmcs01.
 */
void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	guard(vmx_vmcs01)(vcpu);

	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

	secondary_exec_controls_changebit(vmx,
					  SECONDARY_EXEC_APIC_REGISTER_VIRT |
					  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
					  kvm_vcpu_apicv_active(vcpu));
	if (enable_ipiv)
		tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
						 kvm_vcpu_apicv_active(vcpu));

	vmx_update_msr_bitmap_x2apic(vcpu);
}

/* Compute the primary processor-based VM-execution controls. */
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

	/*
	 * Not used by KVM, but fully supported for nesting, i.e. are allowed in
	 * vmcs12 and propagated to vmcs02 when set in vmcs12.
	 */
	exec_control &= ~(CPU_BASED_RDTSC_EXITING |
			  CPU_BASED_USE_IO_BITMAPS |
			  CPU_BASED_MONITOR_TRAP_FLAG |
			  CPU_BASED_PAUSE_EXITING);

	/* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
	exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
			  CPU_BASED_NMI_WINDOW_EXITING);

	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
		exec_control &= ~CPU_BASED_MOV_DR_EXITING;

	if (!cpu_need_tpr_shadow(&vmx->vcpu))
		exec_control &= ~CPU_BASED_TPR_SHADOW;

#ifdef CONFIG_X86_64
	if (exec_control & CPU_BASED_TPR_SHADOW)
		exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
				  CPU_BASED_CR8_STORE_EXITING);
	else
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	/* No need to intercept CR3 access or INVLPG when using EPT. */
	if (enable_ept)
		exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
				  CPU_BASED_CR3_STORE_EXITING |
				  CPU_BASED_INVLPG_EXITING);
	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
				  CPU_BASED_MONITOR_EXITING);
	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
		exec_control &= ~CPU_BASED_HLT_EXITING;
	return exec_control;
}

/* Compute the tertiary processor-based VM-execution controls. */
static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
{
	u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;

	/*
	 * IPI virtualization relies on APICv. Disable IPI virtualization if
	 * APICv is inhibited.
	 */
	if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
		exec_control &= ~TERTIARY_EXEC_IPI_VIRT;

	return exec_control;
}

/*
 * Adjust a single secondary execution control bit to intercept/allow an
 * instruction in the guest.  This is usually done based on whether or not a
 * feature has been exposed to the guest in order to correctly emulate faults.
4646 */ 4647 static inline void 4648 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4649 u32 control, bool enabled, bool exiting) 4650 { 4651 /* 4652 * If the control is for an opt-in feature, clear the control if the 4653 * feature is not exposed to the guest, i.e. not enabled. If the 4654 * control is opt-out, i.e. an exiting control, clear the control if 4655 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4656 * disabled for the associated instruction. Note, the caller is 4657 * responsible presetting exec_control to set all supported bits. 4658 */ 4659 if (enabled == exiting) 4660 *exec_control &= ~control; 4661 4662 /* 4663 * Update the nested MSR settings so that a nested VMM can/can't set 4664 * controls for features that are/aren't exposed to the guest. 4665 */ 4666 if (nested && 4667 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4668 /* 4669 * All features that can be added or removed to VMX MSRs must 4670 * be supported in the first place for nested virtualization. 4671 */ 4672 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4673 enabled = false; 4674 4675 if (enabled) 4676 vmx->nested.msrs.secondary_ctls_high |= control; 4677 else 4678 vmx->nested.msrs.secondary_ctls_high &= ~control; 4679 } 4680 } 4681 4682 /* 4683 * Wrapper macro for the common case of adjusting a secondary execution control 4684 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4685 * verifies that the control is actually supported by KVM and hardware. 
4686 */ 4687 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4688 ({ \ 4689 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4690 bool __enabled; \ 4691 \ 4692 if (cpu_has_vmx_##name()) { \ 4693 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ 4694 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4695 __enabled, exiting); \ 4696 } \ 4697 }) 4698 4699 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4700 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4701 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4702 4703 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4704 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4705 4706 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4707 { 4708 struct kvm_vcpu *vcpu = &vmx->vcpu; 4709 4710 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4711 4712 if (vmx_pt_mode_is_system()) 4713 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4714 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4715 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4716 if (vmx->vpid == 0) 4717 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4718 if (!enable_ept) { 4719 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4720 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4721 enable_unrestricted_guest = 0; 4722 } 4723 if (!enable_unrestricted_guest) 4724 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4725 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4726 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4727 if (!kvm_vcpu_apicv_active(vcpu)) 4728 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4729 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4730 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4731 4732 /* 4733 * KVM doesn't support VMFUNC for L1, but the control is set in 
KVM's 4734 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4735 */ 4736 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4737 4738 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4739 * in vmx_set_cr4. */ 4740 exec_control &= ~SECONDARY_EXEC_DESC; 4741 4742 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4743 (handle_vmptrld). 4744 We can NOT enable shadow_vmcs here because we don't have yet 4745 a current VMCS12 4746 */ 4747 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4748 4749 /* 4750 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4751 * it needs to be set here when dirty logging is already active, e.g. 4752 * if this vCPU was created after dirty logging was enabled. 4753 */ 4754 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4755 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4756 4757 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4758 4759 /* 4760 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4761 * feature is exposed to the guest. This creates a virtualization hole 4762 * if both are supported in hardware but only one is exposed to the 4763 * guest, but letting the guest execute RDTSCP or RDPID when either one 4764 * is advertised is preferable to emulating the advertised instruction 4765 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 
4766 */ 4767 if (cpu_has_vmx_rdtscp()) { 4768 bool rdpid_or_rdtscp_enabled = 4769 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4770 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4771 4772 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4773 SECONDARY_EXEC_ENABLE_RDTSCP, 4774 rdpid_or_rdtscp_enabled, false); 4775 } 4776 4777 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4778 4779 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4780 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4781 4782 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4783 ENABLE_USR_WAIT_PAUSE, false); 4784 4785 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4786 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4787 4788 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4789 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4790 4791 return exec_control; 4792 } 4793 4794 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4795 { 4796 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4797 } 4798 4799 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4800 { 4801 struct page *pages; 4802 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4803 4804 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4805 return 0; 4806 4807 if (kvm_vmx->pid_table) 4808 return 0; 4809 4810 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4811 vmx_get_pid_table_order(kvm)); 4812 if (!pages) 4813 return -ENOMEM; 4814 4815 kvm_vmx->pid_table = (void *)page_address(pages); 4816 return 0; 4817 } 4818 4819 int vmx_vcpu_precreate(struct kvm *kvm) 4820 { 4821 return vmx_alloc_ipiv_pid_table(kvm); 4822 } 4823 4824 #define VMX_XSS_EXIT_BITMAP 0 4825 4826 static void init_vmcs(struct vcpu_vmx *vmx) 4827 { 4828 struct kvm *kvm = vmx->vcpu.kvm; 4829 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4830 4831 if (nested) 4832 nested_vmx_set_vmcs_shadowing_bitmap(); 4833 4834 if (cpu_has_vmx_msr_bitmap()) 4835 vmcs_write64(MSR_BITMAP, 
__pa(vmx->vmcs01.msr_bitmap)); 4836 4837 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4838 4839 /* Control */ 4840 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4841 4842 exec_controls_set(vmx, vmx_exec_control(vmx)); 4843 4844 if (cpu_has_secondary_exec_ctrls()) { 4845 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4846 if (vmx->ve_info) 4847 vmcs_write64(VE_INFORMATION_ADDRESS, 4848 __pa(vmx->ve_info)); 4849 } 4850 4851 if (cpu_has_tertiary_exec_ctrls()) 4852 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4853 4854 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4855 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4856 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4857 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4858 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4859 4860 vmcs_write16(GUEST_INTR_STATUS, 0); 4861 4862 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4863 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); 4864 } 4865 4866 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4867 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4868 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4869 } 4870 4871 if (!kvm_pause_in_guest(kvm)) { 4872 vmcs_write32(PLE_GAP, ple_gap); 4873 vmx->ple_window = ple_window; 4874 vmx->ple_window_dirty = true; 4875 } 4876 4877 if (kvm_notify_vmexit_enabled(kvm)) 4878 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4879 4880 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4881 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4882 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4883 4884 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4885 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4886 vmx_set_constant_host_state(vmx); 4887 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4888 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4889 4890 if (cpu_has_vmx_vmfunc()) 4891 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4892 4893 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4894 
vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 4895 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4896 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4897 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4898 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4899 4900 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4901 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4902 4903 vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); 4904 4905 /* 22.2.1, 20.8.1 */ 4906 vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); 4907 4908 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4909 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4910 4911 set_cr4_guest_host_mask(vmx); 4912 4913 if (vmx->vpid != 0) 4914 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4915 4916 if (cpu_has_vmx_xsaves()) 4917 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4918 4919 if (enable_pml) { 4920 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4921 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 4922 } 4923 4924 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4925 4926 if (vmx_pt_mode_is_host_guest()) { 4927 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4928 /* Bit[6~0] are forced to 1, writes are ignored. 
*/ 4929 vmx->pt_desc.guest.output_mask = 0x7F; 4930 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4931 } 4932 4933 vmcs_write32(GUEST_SYSENTER_CS, 0); 4934 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4935 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4936 4937 vmx_guest_debugctl_write(&vmx->vcpu, 0); 4938 4939 if (cpu_has_vmx_tpr_shadow()) { 4940 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4941 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4942 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4943 __pa(vmx->vcpu.arch.apic->regs)); 4944 vmcs_write32(TPR_THRESHOLD, 0); 4945 } 4946 4947 vmx_setup_uret_msrs(vmx); 4948 } 4949 4950 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4951 { 4952 struct vcpu_vmx *vmx = to_vmx(vcpu); 4953 4954 init_vmcs(vmx); 4955 4956 if (nested && 4957 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4958 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4959 4960 vcpu_setup_sgx_lepubkeyhash(vcpu); 4961 4962 vmx->nested.posted_intr_nv = -1; 4963 vmx->nested.vmxon_ptr = INVALID_GPA; 4964 vmx->nested.current_vmptr = INVALID_GPA; 4965 4966 #ifdef CONFIG_KVM_HYPERV 4967 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4968 #endif 4969 4970 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4971 vcpu->arch.microcode_version = 0x100000000ULL; 4972 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4973 4974 /* 4975 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4976 * or POSTED_INTR_WAKEUP_VECTOR. 
4977 */ 4978 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 4979 __pi_set_sn(&vmx->vt.pi_desc); 4980 } 4981 4982 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4983 { 4984 struct vcpu_vmx *vmx = to_vmx(vcpu); 4985 4986 if (!init_event) 4987 __vmx_vcpu_reset(vcpu); 4988 4989 vmx->rmode.vm86_active = 0; 4990 vmx->spec_ctrl = 0; 4991 4992 vmx->msr_ia32_umwait_control = 0; 4993 4994 vmx->hv_deadline_tsc = -1; 4995 kvm_set_cr8(vcpu, 0); 4996 4997 seg_setup(VCPU_SREG_CS); 4998 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4999 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 5000 5001 seg_setup(VCPU_SREG_DS); 5002 seg_setup(VCPU_SREG_ES); 5003 seg_setup(VCPU_SREG_FS); 5004 seg_setup(VCPU_SREG_GS); 5005 seg_setup(VCPU_SREG_SS); 5006 5007 vmcs_write16(GUEST_TR_SELECTOR, 0); 5008 vmcs_writel(GUEST_TR_BASE, 0); 5009 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 5010 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 5011 5012 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 5013 vmcs_writel(GUEST_LDTR_BASE, 0); 5014 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 5015 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 5016 5017 vmcs_writel(GUEST_GDTR_BASE, 0); 5018 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 5019 5020 vmcs_writel(GUEST_IDTR_BASE, 0); 5021 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 5022 5023 vmx_segment_cache_clear(vmx); 5024 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 5025 5026 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 5027 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 5028 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 5029 if (kvm_mpx_supported()) 5030 vmcs_write64(GUEST_BNDCFGS, 0); 5031 5032 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 5033 5034 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 5035 vmcs_writel(GUEST_SSP, 0); 5036 vmcs_writel(GUEST_INTR_SSP_TABLE, 0); 5037 } 5038 if (kvm_cpu_cap_has(X86_FEATURE_IBT) || 5039 kvm_cpu_cap_has(X86_FEATURE_SHSTK)) 5040 vmcs_writel(GUEST_S_CET, 0); 5041 5042 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5043 5044 
vpid_sync_context(vmx->vpid); 5045 5046 vmx_update_fb_clear_dis(vcpu, vmx); 5047 } 5048 5049 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 5050 { 5051 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5052 } 5053 5054 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 5055 { 5056 if (!enable_vnmi || 5057 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 5058 vmx_enable_irq_window(vcpu); 5059 return; 5060 } 5061 5062 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5063 } 5064 5065 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 5066 { 5067 struct vcpu_vmx *vmx = to_vmx(vcpu); 5068 uint32_t intr; 5069 int irq = vcpu->arch.interrupt.nr; 5070 5071 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 5072 5073 ++vcpu->stat.irq_injections; 5074 if (vmx->rmode.vm86_active) { 5075 int inc_eip = 0; 5076 if (vcpu->arch.interrupt.soft) 5077 inc_eip = vcpu->arch.event_exit_inst_len; 5078 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 5079 return; 5080 } 5081 intr = irq | INTR_INFO_VALID_MASK; 5082 if (vcpu->arch.interrupt.soft) { 5083 intr |= INTR_TYPE_SOFT_INTR; 5084 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 5085 vmx->vcpu.arch.event_exit_inst_len); 5086 } else 5087 intr |= INTR_TYPE_EXT_INTR; 5088 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 5089 5090 vmx_clear_hlt(vcpu); 5091 } 5092 5093 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 5094 { 5095 struct vcpu_vmx *vmx = to_vmx(vcpu); 5096 5097 if (!enable_vnmi) { 5098 /* 5099 * Tracking the NMI-blocked state in software is built upon 5100 * finding the next open IRQ window. This, in turn, depends on 5101 * well-behaving guests: They have to keep IRQs disabled at 5102 * least as long as the NMI handler runs. Otherwise we may 5103 * cause NMI nesting, maybe breaking the guest. But as this is 5104 * highly unlikely, we can live with the residual risk. 
		 */
		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
		vmx->loaded_vmcs->vnmi_blocked_time = 0;
	}

	++vcpu->stat.nmi_injections;
	vmx->loaded_vmcs->nmi_known_unmasked = false;

	/* Real mode: emulate NMI delivery through the IVT. */
	if (vmx->rmode.vm86_active) {
		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
		return;
	}

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

	vmx_clear_hlt(vcpu);
}

/*
 * Return whether NMIs are currently masked for the guest.  Uses the software
 * model when vNMI is disabled, otherwise reads (and caches the "unmasked"
 * result of) the hardware interruptibility state.
 */
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool masked;

	if (!enable_vnmi)
		return vmx->loaded_vmcs->soft_vnmi_blocked;
	if (vmx->loaded_vmcs->nmi_known_unmasked)
		return false;
	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	return masked;
}

/*
 * Set or clear NMI masking for the guest, either in the software model
 * (!vNMI) or in the hardware interruptibility-state field.
 */
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
			vmx->loaded_vmcs->vnmi_blocked_time = 0;
		}
	} else {
		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
		if (masked)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
					GUEST_INTR_STATE_NMI);
	}
}

/*
 * Return true if NMI delivery to the vCPU is currently blocked.  An NMI that
 * would cause a nested VM-Exit is never considered blocked.
 */
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
		return false;

	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
		return true;

	return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
		 GUEST_INTR_STATE_NMI));
}

/*
 * Can an NMI be injected/delivered right now?  Returns -EBUSY while a nested
 * VM-Enter is pending, 0 if blocked, non-zero if allowed.
 */
int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	if (vcpu->arch.nested_run_pending)
		return -EBUSY;

	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
		return -EBUSY;

	return !vmx_nmi_blocked(vcpu);
}

/* IRQ blocked by RFLAGS.IF=0 or STI/MOV-SS interrupt shadow. */
bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}

/* As above, but an IRQ that would trigger a nested VM-Exit isn't blocked. */
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
		return false;

	return __vmx_interrupt_blocked(vcpu);
}

/*
 * Can an IRQ be injected/delivered right now?  Returns -EBUSY while a nested
 * VM-Enter is pending, 0 if blocked, non-zero if allowed.
 */
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	if (vcpu->arch.nested_run_pending)
		return -EBUSY;

	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
		return -EBUSY;

	return !vmx_interrupt_blocked(vcpu);
}

/*
 * Create the 3-page private memslot backing the TSS used for real-mode
 * emulation and record its address.  Unnecessary (and skipped) when
 * unrestricted guest mode is available.
 */
int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	void __user *ret;

	if (enable_unrestricted_guest)
		return 0;

	mutex_lock(&kvm->slots_lock);
	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	mutex_unlock(&kvm->slots_lock);

	if (IS_ERR(ret))
		return PTR_ERR(ret);

	to_kvm_vmx(kvm)->tss_addr = addr;

	return init_rmode_tss(kvm, ret);
}

/* Record the GPA of the EPT identity-map page; used later during setup. */
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
	return 0;
}

/*
 * Return true if the exception vector @vec should be forwarded to the
 * real-mode exception handling path (vs. reflected to userspace debug).
 */
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
	switch (vec) {
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject the exception
		 * from user space while in guest debugging mode.
		 */
		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		fallthrough;
	case DB_VECTOR:
		return !(vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
	case DE_VECTOR:
	case OF_VECTOR:
	case BR_VECTOR:
	case UD_VECTOR:
	case DF_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
	case MF_VECTOR:
		return true;
	}
	return false;
}

/*
 * Handle an exception that occurred while emulating real mode (VM86).
 * Returns 1 if handled (possibly by queueing the exception back into the
 * guest), 0 if emulation failed and the exit must go to userspace.
 */
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * An instruction with the 0x67 address-size override prefix causes
	 * a #SS fault with a 0 error code in VM86 mode; emulate it.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
		if (kvm_emulate_instruction(vcpu, 0)) {
			if (vcpu->arch.halt_request) {
				vcpu->arch.halt_request = 0;
				return kvm_emulate_halt_noskip(vcpu);
			}
			return 1;
		}
		return 0;
	}

	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
}

static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	/* handled by vmx_vcpu_run() */
	return 1;
}

/*
 * If the host has split lock detection disabled, then #AC is
 * unconditionally injected into the guest, which is the pre split lock
 * detection behaviour.
 *
 * If the host has split lock detection enabled then #AC is
 * only injected into the guest when:
 *  - Guest CPL == 3 (user mode)
 *  - Guest has #AC detection enabled in CR0
 *  - Guest EFLAGS has AC bit set
 */
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{
	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
		return true;

	return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
	       (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}

/* #NM due to XFD (extended feature disable), as opposed to CR0.TS. */
static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.guest_fpu.fpstate->xfd &&
	       !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
}

/*
 * Handle an intercepted #PF.  With EPT enabled, #PF is intercepted only for
 * async page faults and for illegal-GPA detection when the guest's
 * MAXPHYADDR is smaller than the host's; SGX EPCM violations are always
 * reflected back into the guest.
 */
static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code)
{
	/* For #PF, the exit qualification holds the faulting address (CR2). */
	unsigned long cr2 = vmx_get_exit_qual(vcpu);

	if (vcpu->arch.apf.host_apf_flags)
		goto handle_pf;

	/* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */
	WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr);

	/*
	 * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX
	 * flag set in the error code (SGX1 hardware generates #GP(0)).  EPCM
	 * violations have nothing to do with shadow paging and can never be
	 * resolved by KVM; always reflect them into the guest.
	 */
	if (error_code & PFERR_SGX_MASK) {
		WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) ||
			     !cpu_feature_enabled(X86_FEATURE_SGX2));

		if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
		else
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	/*
	 * If EPT is enabled, fixup and inject the #PF.  KVM intercepts #PFs
	 * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due
	 * to the GPA being legal with respect to host.MAXPHYADDR).
	 */
	if (enable_ept) {
		kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
		return 1;
	}

handle_pf:
	return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}

/*
 * Main handler for the EXCEPTION_NMI exit reason: dispatches intercepted
 * exceptions (#NM, #UD, #VE, #GP, #PF, #DB, #BP, #AC, ...) to emulation,
 * guest re-injection, or a userspace exit as appropriate.
 */
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_run *kvm_run = vcpu->run;
	u32 intr_info, ex_no, error_code;
	unsigned long dr6;
	u32 vect_info;

	vect_info = vmx->idt_vectoring_info;
	intr_info = vmx_get_intr_info(vcpu);

	/*
	 * Machine checks are handled by handle_exception_irqoff(), or by
	 * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
	 * vmx_vcpu_enter_exit().
	 */
	if (is_machine_check(intr_info) || is_nmi(intr_info))
		return 1;

	/*
	 * Queue the exception here instead of in handle_nm_fault_irqoff().
	 * This ensures the nested_vmx check is not skipped so vmexit can
	 * be reflected to L1 (when it intercepts #NM) before reaching this
	 * point.
	 */
	if (is_nm_fault(intr_info)) {
		kvm_queue_exception_p(vcpu, NM_VECTOR,
				      is_xfd_nm_fault(vcpu) ?
				      vcpu->arch.guest_fpu.xfd_err : 0);
		return 1;
	}

	if (is_invalid_opcode(intr_info))
		return handle_ud(vcpu);

	/*
	 * #VE should never reach KVM: the CPU delivers it directly to the
	 * guest.  If one is intercepted anyway, dump state for debugging.
	 */
	if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
		struct vmx_ve_information *ve_info = vmx->ve_info;

		WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
			  "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
		dump_vmcs(vcpu);
		kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
		return 1;
	}

	error_code = 0;
	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

	/* #GP is intercepted only for the VMware backdoor (outside VM86). */
	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
		WARN_ON_ONCE(!enable_vmware_backdoor);

		/*
		 * VMware backdoor emulation on #GP interception only handles
		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
		 * error code on #GP.
		 */
		if (error_code) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
			return 1;
		}
		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
	}

	/*
	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
	 * MMIO, it is better to report an internal error.
	 * See the comments in vmx_handle_exit.
	 */
	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
		vcpu->run->internal.ndata = 4;
		vcpu->run->internal.data[0] = vect_info;
		vcpu->run->internal.data[1] = intr_info;
		vcpu->run->internal.data[2] = error_code;
		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (is_page_fault(intr_info))
		return vmx_handle_page_fault(vcpu, error_code);

	ex_no = intr_info & INTR_INFO_VECTOR_MASK;

	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
		return handle_rmode_exception(vcpu, ex_no, error_code);

	switch (ex_no) {
	case DB_VECTOR:
		/* For #DB, the exit qualification carries the DR6 payload. */
		dr6 = vmx_get_exit_qual(vcpu);
		if (!(vcpu->guest_debug &
		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
			/*
			 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
			 * instruction.  ICEBP generates a trap-like #DB, but
			 * despite its interception control being tied to #DB,
			 * is an instruction intercept, i.e. the VM-Exit occurs
			 * on the ICEBP itself.  Use the inner "skip" helper to
			 * avoid single-step #DB and MTF updates, as ICEBP is
			 * higher priority.  Note, skipping ICEBP still clears
			 * STI and MOVSS blocking.
			 *
			 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
			 * if single-step is enabled in RFLAGS and STI or MOVSS
			 * blocking is active, as the CPU doesn't set the bit
			 * on VM-Exit due to #DB interception.  VM-Entry has a
			 * consistency check that a single-step #DB is pending
			 * in this scenario as the previous instruction cannot
			 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
			 * don't modify RFLAGS), therefore the one instruction
			 * delay when activating single-step breakpoints must
			 * have already expired.  Note, the CPU sets/clears BS
			 * as appropriate for all other VM-Exits types.
			 */
			if (is_icebp(intr_info))
				WARN_ON(!skip_emulated_instruction(vcpu));
			else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
				 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
				  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
				vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
					    vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);

			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
			return 1;
		}
		kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
		fallthrough;
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject #BP from
		 * user space while in guest debugging mode. Reading it for
		 * #DB as well causes no harm, it is not used in that case.
		 */
		vmx->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
		kvm_run->debug.arch.exception = ex_no;
		break;
	case AC_VECTOR:
		if (vmx_guest_inject_ac(vcpu)) {
			kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
			return 1;
		}

		/*
		 * Handle split lock.  Depending on detection mode this will
		 * either warn and disable split lock detection for this
		 * task or force SIGBUS on it.
		 */
		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
			return 1;
		fallthrough;
	default:
		/* Unhandled vector: hand the exception to userspace. */
		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
		kvm_run->ex.exception = ex_no;
		kvm_run->ex.error_code = error_code;
		break;
	}
	return 0;
}

/* Host IRQ caused the exit; the interrupt itself was handled in host context. */
static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.irq_exits;
	return 1;
}

/* Guest triple fault: report shutdown to userspace. */
static int handle_triple_fault(struct kvm_vcpu *vcpu)
{
	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
	vcpu->mmio_needed = 0;
	return 0;
}

/*
 * Handle an I/O instruction exit.  String ops go through full emulation;
 * simple IN/OUT take the fast PIO path.  Size, direction, and port are
 * decoded from the exit qualification.
 */
static int handle_io(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	int size, in, string;
	unsigned port;

	exit_qualification = vmx_get_exit_qual(vcpu);
	string = (exit_qualification & 16) != 0;

	++vcpu->stat.io_exits;

	if (string)
		return kvm_emulate_instruction(vcpu, 0);

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;
	in = (exit_qualification & 8) != 0;

	return kvm_fast_pio(vcpu, size, port, in);
}

/* Write the VMX hypercall opcode (VMCALL, 0f 01 c1) into @hypercall. */
void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xc1;
}

/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
	if (is_guest_mode(vcpu)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		unsigned long orig_val = val;

		/*
		 * We get here when L2 changed cr0 in a way that did not change
		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
		 * but did change L0 shadowed bits. So we first calculate the
		 * effective cr0 value that L1 would like to write into the
		 * hardware. It consists of the L2-owned bits from the new
		 * value combined with the L1-owned bits from L1's guest_cr0.
		 */
		val = (val & ~vmcs12->cr0_guest_host_mask) |
			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);

		if (kvm_set_cr0(vcpu, val))
			return 1;
		/* The read shadow reflects the value L2 wrote, unmodified. */
		vmcs_writel(CR0_READ_SHADOW, orig_val);
		return 0;
	} else {
		return kvm_set_cr0(vcpu, val);
	}
}

/* Set CR4 on a mov-to-cr4 exit; nested handling mirrors handle_set_cr0(). */
static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
{
	if (is_guest_mode(vcpu)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		unsigned long orig_val = val;

		/* analogously to handle_set_cr0 */
		val = (val & ~vmcs12->cr4_guest_host_mask) |
			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
		if (kvm_set_cr4(vcpu, val))
			return 1;
		vmcs_writel(CR4_READ_SHADOW, orig_val);
		return 0;
	} else
		return kvm_set_cr4(vcpu, val);
}

/* Emulate descriptor-table accesses (intercepted for UMIP emulation). */
static int handle_desc(struct kvm_vcpu *vcpu)
{
	/*
	 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
	 * and other code needs to be updated if UMIP can be guest owned.
	 */
	BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);

	WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
	return kvm_emulate_instruction(vcpu, 0);
}

/*
 * Handle a control-register access exit.  The exit qualification encodes
 * the CR number (bits 3:0), access type (bits 5:4: mov-to, mov-from, CLTS,
 * LMSW) and the GPR involved (bits 11:8).
 */
static int handle_cr(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification, val;
	int cr;
	int reg;
	int err;
	int ret;

	exit_qualification = vmx_get_exit_qual(vcpu);
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		val = kvm_register_read(vcpu, reg);
		trace_kvm_cr_write(cr, val);
		switch (cr) {
		case 0:
			err = handle_set_cr0(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 3:
			/* CR3 exits only occur without unrestricted guest. */
			WARN_ON_ONCE(enable_unrestricted_guest);

			err = kvm_set_cr3(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 4:
			err = handle_set_cr4(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 8: {
			u8 cr8_prev = kvm_get_cr8(vcpu);
			u8 cr8 = (u8)val;
			err = kvm_set_cr8(vcpu, cr8);
			ret = kvm_complete_insn_gp(vcpu, err);
			if (lapic_in_kernel(vcpu))
				return ret;
			if (cr8_prev <= cr8)
				return ret;
			/*
			 * TODO: we might be squashing a
			 * KVM_GUESTDBG_SINGLESTEP-triggered
			 * KVM_EXIT_DEBUG here.
			 */
			vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
			return 0;
		}
		}
		break;
	case 2: /* clts */
		/* CLTS never exits: CR0.TS is always guest-owned under KVM. */
		KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
		return -EIO;
	case 1: /*mov from cr*/
		switch (cr) {
		case 3:
			WARN_ON_ONCE(enable_unrestricted_guest);

			val = kvm_read_cr3(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
			return kvm_skip_emulated_instruction(vcpu);
		case 8:
			val = kvm_get_cr8(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
			return kvm_skip_emulated_instruction(vcpu);
		}
		break;
	case 3: /* lmsw */
		/* LMSW only writes the low 4 bits of CR0. */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
		kvm_lmsw(vcpu, val);

		return kvm_skip_emulated_instruction(vcpu);
	default:
		break;
	}
	vcpu->run->exit_reason = 0;
	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
		    (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}

/*
 * Handle a debug-register access exit: check DR validity and CPL, emulate
 * the DR7.GD debug trap, and either perform the MOV to/from DR or disable
 * further DR exits once the guest owns the debug registers.
 */
static int handle_dr(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	int dr, dr7, reg;
	int err = 1;	/* default: inject #GP via kvm_complete_insn_gp() */

	exit_qualification = vmx_get_exit_qual(vcpu);
	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

	/* First, if DR does not exist, trigger UD */
	if (!kvm_require_dr(vcpu, dr))
		return 1;

	/* DR accesses are privileged; CPL > 0 gets #GP (err stays 1). */
	if (vmx_get_cpl(vcpu) > 0)
		goto out;

	dr7 = vmcs_readl(GUEST_DR7);
	if (dr7 & DR7_GD) {
		/*
		 * As the vm-exit takes precedence over the debug trap, we
		 * need to emulate the latter, either for the host or the
		 * guest debugging itself.
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
			vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
			vcpu->run->debug.arch.dr7 = dr7;
			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
			vcpu->run->debug.arch.exception = DB_VECTOR;
			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
			return 0;
		} else {
			kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
			return 1;
		}
	}

	if (vcpu->guest_debug == 0) {
		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction.  The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
	if (exit_qualification & TYPE_MOV_FROM_DR) {
		kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
		err = 0;
	} else {
		err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
	}

out:
	return kvm_complete_insn_gp(vcpu, err);
}

/*
 * Read the guest's live debug registers back into vcpu->arch and re-enable
 * MOV-DR exiting (the counterpart of handle_dr() dropping it).
 */
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	get_debugreg(vcpu->arch.dr6, 6);
	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

	/*
	 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
	 * a stale dr6 from the guest.
	 */
	set_debugreg(DR6_RESERVED, 6);
}

/* Propagate a new guest DR7 value into the VMCS. */
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
	vmcs_writel(GUEST_DR7, val);
}

/* TPR fell below the threshold: re-evaluate pending interrupt priority. */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
	kvm_apic_update_ppr(vcpu);
	return 1;
}

/* The requested IRQ window opened: stop exiting and re-check events. */
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	++vcpu->stat.irq_window_exits;
	return 1;
}

/* Emulate INVLPG; the exit qualification holds the linear address. */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

	kvm_mmu_invlpg(vcpu, exit_qualification);
	return kvm_skip_emulated_instruction(vcpu);
}

/*
 * Handle an APIC-access exit.  EOI writes get a fast path that bypasses the
 * instruction emulator; everything else is emulated.
 */
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
	if (likely(fasteoi)) {
		unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
		int access_type, offset;

		access_type = exit_qualification & APIC_ACCESS_TYPE;
		offset = exit_qualification & APIC_ACCESS_OFFSET;
		/*
		 * Sane guest uses MOV to write EOI, with written value
		 * not cared. So make a short-circuit here by avoiding
		 * heavy instruction emulation.
		 */
		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
		    (offset == APIC_EOI)) {
			kvm_lapic_set_eoi(vcpu);
			return kvm_skip_emulated_instruction(vcpu);
		}
	}
	return kvm_emulate_instruction(vcpu, 0);
}

/* Virtualized EOI exit: vector is in the low byte of the qualification. */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	int vector = exit_qualification & 0xff;

	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_set_eoi_accelerated(vcpu, vector);
	return 1;
}

/* APIC-write exit (APIC-register virtualization active). */
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

	/*
	 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
	 * hardware has done any necessary aliasing, offset adjustments, etc...
	 * for the access.  I.e. the correct value has already been written to
	 * the vAPIC page for the correct 16-byte chunk.  KVM needs only to
	 * retrieve the register value and emulate the access.
	 */
	u32 offset = exit_qualification & 0xff0;

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

/*
 * Handle a task-switch exit: clean up any event that was being delivered
 * through a task gate (NMI/IRQ/exception), then emulate the switch itself
 * via kvm_task_switch().
 */
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	bool has_error_code = false;
	u32 error_code = 0;
	u16 tss_selector;
	int reason, type, idt_v, idt_index;

	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

	exit_qualification = vmx_get_exit_qual(vcpu);

	reason = (u32)exit_qualification >> 30;
	if (reason == TASK_SWITCH_GATE && idt_v) {
		/* The event will be re-delivered by the emulated switch. */
		switch (type) {
		case INTR_TYPE_NMI_INTR:
			vcpu->arch.nmi_injected = false;
			vmx_set_nmi_mask(vcpu, true);
			break;
		case INTR_TYPE_EXT_INTR:
		case INTR_TYPE_SOFT_INTR:
			kvm_clear_interrupt_queue(vcpu);
			break;
		case INTR_TYPE_HARD_EXCEPTION:
			if (vmx->idt_vectoring_info &
			    VECTORING_INFO_DELIVER_CODE_MASK) {
				has_error_code = true;
				error_code =
					vmcs_read32(IDT_VECTORING_ERROR_CODE);
			}
			fallthrough;
		case INTR_TYPE_SOFT_EXCEPTION:
			kvm_clear_exception_queue(vcpu);
			break;
		default:
			break;
		}
	}
	tss_selector = exit_qualification;

	/* Only instruction-initiated switches need RIP advanced past them. */
	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
		       type != INTR_TYPE_EXT_INTR &&
		       type != INTR_TYPE_NMI_INTR))
		WARN_ON(!skip_emulated_instruction(vcpu));

	/*
	 * TODO: What about debug traps on tss switch?
	 *       Are we supposed to inject them and update dr6?
	 */
	return kvm_task_switch(vcpu, tss_selector,
			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
			       reason, has_error_code, error_code);
}

/*
 * Handle an EPT violation: work around NMI-unblocking errata, detect
 * guest-illegal GPAs when allow_smaller_maxphyaddr, and otherwise hand the
 * fault to the common EPT-violation path.
 */
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	gpa_t gpa;

	/*
	 * EPT violation happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 * There are errata that may cause this bit to not be set:
	 * AAK134, BY25.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    enable_vnmi &&
	    (exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	trace_kvm_page_fault(vcpu, gpa, exit_qualification);

	/*
	 * Check that the GPA doesn't exceed physical memory limits, as that is
	 * a guest page fault.  We have to emulate the instruction here, because
	 * if the illegal address is that of a paging structure, then
	 * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
	 * would also use advanced VM-exit information for EPT violations to
	 * reconstruct the page fault error code.
	 */
	if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
		return kvm_emulate_instruction(vcpu, 0);

	return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}

/*
 * EPT misconfiguration normally means an MMIO access; try the fast MMIO
 * path first, else fall back to the MMU page-fault path with PFERR_RSVD.
 */
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
	gpa_t gpa;

	if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
		return 1;

	/*
	 * A nested guest cannot optimize MMIO vmexits, because we have an
	 * nGPA here instead of the required GPA.
	 */
	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	if (!is_guest_mode(vcpu) &&
	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return kvm_skip_emulated_instruction(vcpu);
	}

	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}

/* The requested NMI window opened: stop exiting and re-check events. */
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
	/* NMI-window exiting is only ever enabled with virtual NMIs. */
	if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
		return -EIO;

	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
	++vcpu->stat.nmi_window_exits;
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 1;
}

/*
 * Returns true if emulation is required (due to the vCPU having invalid state
 * with unrestricted guest mode disabled) and KVM can't faithfully emulate the
 * current vCPU state.
 */
static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->vt.emulation_required)
		return false;

	/*
	 * It is architecturally impossible for emulation to be required when a
	 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
	 * guest state is invalid and unrestricted guest is disabled, i.e. KVM
	 * should synthesize VM-Fail instead emulation L2 code.  This path is
	 * only reachable if userspace modifies L2 guest state after KVM has
	 * performed the nested VM-Enter consistency checks.
	 */
	if (vcpu->arch.nested_run_pending)
		return true;

	/*
	 * KVM only supports emulating exceptions if the vCPU is in Real Mode.
	 * If emulation is required, KVM can't perform a successful VM-Enter to
	 * inject the exception.
	 */
	return !vmx->rmode.vm86_active &&
	       (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}

/*
 * Emulate instructions one at a time (bounded per call) until the vCPU's
 * state becomes valid for VM-Entry, while honoring pending events, halt
 * requests, and work that must transfer control back to vcpu_run().
 */
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intr_window_requested;
	unsigned count = 130;	/* cap emulation burst per exit */

	intr_window_requested = exec_controls_get(vmx) &
				CPU_BASED_INTR_WINDOW_EXITING;

	while (vmx->vt.emulation_required && count-- != 0) {
		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
			return handle_interrupt_window(&vmx->vcpu);

		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
			return 1;

		/*
		 * Ensure that any updates to kvm->buses[] observed by the
		 * previous instruction (emulated or otherwise) are also
		 * visible to the instruction KVM is about to emulate.
		 */
		smp_rmb();

		if (!kvm_emulate_instruction(vcpu, 0))
			return 0;

		if (vmx_unhandleable_emulation_required(vcpu)) {
			kvm_prepare_emulation_failure_exit(vcpu);
			return 0;
		}

		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}

		/*
		 * Note, return 1 and not 0, vcpu_run() will invoke
		 * xfer_to_guest_mode() which will create a proper return
		 * code.
		 */
		if (__xfer_to_guest_mode_work_pending())
			return 1;
	}

	return 1;
}

/* Last-chance check before VM-Entry; bail to userspace if entry can't work. */
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (vmx_unhandleable_emulation_required(vcpu)) {
		kvm_prepare_emulation_failure_exit(vcpu);
		return 0;
	}

	return 1;
}

/*
 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		grow_ple_window(vcpu);

	/*
	 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
	 * never set PAUSE_EXITING and just set PLE if supported,
	 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
	 */
	kvm_vcpu_on_spin(vcpu, true);
	return kvm_skip_emulated_instruction(vcpu);
}

/* Monitor trap flag exit: nothing to do, resume the guest. */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}

/*
 * Handle an INVPCID exit: decode type and memory operand from the
 * instruction-information field and defer to the common handler.
 */
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
	u32 vmx_instruction_info;
	unsigned long type;
	gva_t gva;
	struct {
		u64 pcid;
		u64 gla;
	} operand;
	int gpr_index;

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type = kvm_register_read(vcpu, gpr_index);

	/* According to the Intel instruction reference, the memory operand
	 * is read even if it isn't needed (e.g., for type==all)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
				vmx_instruction_info, false,
				sizeof(operand), &gva))
		return 1;

	return kvm_handle_invpcid(vcpu, type, gva);
}

/* Page-modification log filled up; the buffer was already flushed. */
static int handle_pml_full(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;

	trace_kvm_pml_full(vcpu->vcpu_id);

	exit_qualification = vmx_get_exit_qual(vcpu);

	/*
	 * PML buffer FULL happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    enable_vnmi &&
	    (exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
			      GUEST_INTR_STATE_NMI);

	/*
	 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
	 * here, and there's no userspace involvement needed for PML.
	 */
	return 1;
}

/*
 * Fastpath handler for VMX-preemption-timer exits: re-enter the guest
 * directly when possible, and punt to the slow path only when L2 is active
 * and a nested VM-Exit may need to be synthesized.
 */
static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
						   bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * In the *extremely* unlikely scenario that this is a spurious VM-Exit
	 * due to the timer expiring while it was "soft" disabled, just eat the
	 * exit and re-enter the guest.
	 */
	if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
		return EXIT_FASTPATH_REENTER_GUEST;

	/*
	 * If the timer expired because KVM used it to force an immediate exit,
	 * then mission accomplished.
	 */
	if (force_immediate_exit)
		return EXIT_FASTPATH_EXIT_HANDLED;

	/*
	 * If L2 is active, go down the slow path as emulating the guest timer
	 * expiration likely requires synthesizing a nested VM-Exit.
	 */
	if (is_guest_mode(vcpu))
		return EXIT_FASTPATH_NONE;

	kvm_lapic_expired_hv_timer(vcpu);
	return EXIT_FASTPATH_REENTER_GUEST;
}

static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	/*
	 * This non-fastpath handler is reached if and only if the preemption
	 * timer was being used to emulate a guest timer while L2 is active.
	 * All other scenarios are supposed to be handled in the fastpath.
	 */
	WARN_ON_ONCE(!is_guest_mode(vcpu));
	kvm_lapic_expired_hv_timer(vcpu);
	return 1;
}

/*
 * When nested=0, all VMX instruction VM Exits filter here. The handlers
 * are overwritten by nested_vmx_hardware_setup() when nested=1.
 */
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

/* SEAMCALL/TDCALL are never valid for a VMX guest, inject #UD. */
static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
	/*
	 * SGX virtualization is disabled. There is no software enable bit for
	 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
	 * the guest from executing ENCLS (when SGX is supported by hardware).
	 */
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}
#endif /* CONFIG_X86_SGX_KVM */

static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
	/*
	 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
	 * VM-Exits. Unconditionally set the flag here and leave the handling to
	 * vmx_handle_exit().
	 */
	to_vt(vcpu)->exit_reason.bus_lock_detected = true;
	return 1;
}

/*
 * Handle a NOTIFY VM-Exit.  Exits to userspace when the VM context was
 * invalidated or when userspace asked to be notified; otherwise just resumes
 * the guest.
 */
static int handle_notify(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
	bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;

	++vcpu->stat.notify_window_exits;

	/*
	 * Notify VM exit happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 */
	if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
			      GUEST_INTR_STATE_NMI);

	if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
	    context_invalid) {
		vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
		vcpu->run->notify.flags = context_invalid ?
					  KVM_NOTIFY_CONTEXT_INVALID : 0;
		return 0;
	}

	return 1;
}

/* Extract the GPR operand of an immediate-form MSR access instruction. */
static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
{
	return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
}

/* The MSR index of an immediate-form RDMSR lives in the exit qualification. */
static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
				     vmx_get_msr_imm_reg(vcpu));
}

static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
				     vmx_get_msr_imm_reg(vcpu));
}

/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
	[EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
	[EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = kvm_emulate_halt,
	[EXIT_REASON_INVD]                    = kvm_emulate_invd,
	[EXIT_REASON_INVLPG]                  = handle_invlpg,
	[EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
	[EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
	[EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
	[EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
	[EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
	[EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
	[EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
	[EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
	[EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
	[EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
	[EXIT_REASON_VMON]                    = handle_vmx_instruction,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
	[EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
	[EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
	[EXIT_REASON_GDTR_IDTR]               = handle_desc,
	[EXIT_REASON_LDTR_TR]                 = handle_desc,
	[EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
	[EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
	[EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
	[EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
	[EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
	[EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
	[EXIT_REASON_PML_FULL]                = handle_pml_full,
	[EXIT_REASON_INVPCID]                 = handle_invpcid,
	[EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
	[EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
	[EXIT_REASON_ENCLS]                   = handle_encls,
	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
	[EXIT_REASON_NOTIFY]                  = handle_notify,
	[EXIT_REASON_SEAMCALL]                = handle_tdx_instruction,
	[EXIT_REASON_TDCALL]                  = handle_tdx_instruction,
	[EXIT_REASON_MSR_READ_IMM]            = handle_rdmsr_imm,
	[EXIT_REASON_MSR_WRITE_IMM]           = handle_wrmsr_imm,
};

static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);

/*
 * Report exit reason, qualification, IDT vectoring info, interruption info and
 * error code for the last VM-Exit; the latter three are zeroed on a failed
 * VM-Entry as they are not meaningful in that case.
 */
void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
		       u64 *info1, u64 *info2, u32
		       *intr_info, u32 *error_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	*reason = vmx->vt.exit_reason.full;
	*info1 = vmx_get_exit_qual(vcpu);
	if (!(vmx->vt.exit_reason.failed_vmentry)) {
		*info2 = vmx->idt_vectoring_info;
		*intr_info = vmx_get_intr_info(vcpu);
		if (is_exception_with_error_code(*intr_info))
			*error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
		else
			*error_code = 0;
	} else {
		*info2 = 0;
		*intr_info = 0;
		*error_code = 0;
	}
}

/* Report the VM-Entry interruption info and, when applicable, error code. */
void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
{
	*intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
	if (is_exception_with_error_code(*intr_info))
		*error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
	else
		*error_code = 0;
}

/* Free the vCPU's Page Modification Logging buffer, if it was allocated. */
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
	if (vmx->pml_pg) {
		__free_page(vmx->pml_pg);
		vmx->pml_pg = NULL;
	}
}

/*
 * Propagate the GPAs the CPU logged in the PML buffer into KVM's dirty
 * tracking, then reset the buffer to empty.
 */
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u16 pml_idx, pml_tail_index;
	u64 *pml_buf;
	int i;

	pml_idx = vmcs_read16(GUEST_PML_INDEX);

	/* Do nothing if PML buffer is empty */
	if (pml_idx == PML_HEAD_INDEX)
		return;
	/*
	 * PML index always points to the next available PML buffer entity
	 * unless PML log has just overflowed.
	 */
	pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;

	/*
	 * PML log is written backwards: the CPU first writes the entry 511
	 * then the entry 510, and so on.
	 *
	 * Read the entries in the same order they were written, to ensure that
	 * the dirty ring is filled in the same order the CPU wrote them.
 */
	pml_buf = page_address(vmx->pml_pg);

	for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
		u64 gpa;

		gpa = pml_buf[i];
		WARN_ON(gpa & (PAGE_SIZE - 1));
		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
	}

	/* reset PML index */
	vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}

static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map);
}

/*
 * Dump one guest segment register; 'sel' is the VMCS selector field, the
 * sibling AR/limit/base fields are located at fixed offsets from it.
 */
static void vmx_dump_sel(char *name, uint32_t sel)
{
	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read16(sel),
	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
}

/* Dump a descriptor-table register; 'limit' is the VMCS limit field. */
static void vmx_dump_dtsel(char *name, uint32_t limit)
{
	pr_err("%s limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read32(limit),
	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}

/* Dump an MSR autoload/autostore list. */
static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
{
	unsigned int i;
	struct vmx_msr_entry *e;

	pr_err("MSR %s:\n", name);
	for (i = 0, e = m->val; i < m->nr; ++i, ++e)
		pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
}

/*
 * Dump the guest, host and control state of the current VMCS to the kernel
 * log for debugging; gated behind the dump_invalid_vmcs module param as the
 * output can leak guest state.
 */
void dump_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmentry_ctl, vmexit_ctl;
	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
	u64 tertiary_exec_control;
	unsigned long cr4;
	int efer_slot;

	if (!dump_invalid_vmcs) {
		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
		return;
	}

	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	cr4 = vmcs_readl(GUEST_CR4);

	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	else
		secondary_exec_control = 0;

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
	else
		tertiary_exec_control = 0;

	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	if (cpu_has_vmx_ept()) {
		pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
	/* EFER may come from the VMCS, the autoload list, or be synthesized. */
	efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
	else if (efer_slot >= 0)
		pr_err("EFER= 0x%016llx (autoload)\n",
		       vmx->msr_autoload.guest.val[efer_slot].value);
	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer | (EFER_LMA | EFER_LME));
	else
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));
	if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
		vmx_dump_msrs("autostore", &vmx->msr_autostore);

	if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
		pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
		       vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
		       vmcs_readl(GUEST_INTR_SSP_TABLE));
	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
	if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
		pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
		       vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
		       vmcs_readl(HOST_INTR_SSP_TABLE));

	pr_err("*** Control State ***\n");
	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
	       cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
	pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
	       pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
	       vmcs_read32(EXCEPTION_BITMAP),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_EXIT_INTR_INFO),
	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
	pr_err("        reason=%08x qualification=%016lx\n",
	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
		pr_err("TSC Multiplier = 0x%016llx\n",
		       vmcs_read64(TSC_MULTIPLIER));
	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
			u16 status = vmcs_read16(GUEST_INTR_STATUS);
			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
		}
		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
	}
	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct vmx_ve_information *ve_info = vmx->ve_info;
		u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);

		/*
		 * If KVM is dumping the VMCS, then something has gone wrong
		 * already.  Dereferencing an address from the VMCS, which could
		 * very well be corrupted, is a terrible idea.  The virtual
		 * address is known so use it.
		 */
		pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
		       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
		       ve_info->exit_reason, ve_info->delivery,
		       ve_info->exit_qualification,
		       ve_info->guest_linear_address,
		       ve_info->guest_physical_address, ve_info->eptp_index);
	}
}

/*
 * The guest has exited. See if we can fix it or if we need userspace
 * assistance.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
	u32 vectoring_info = vmx->idt_vectoring_info;
	u16 exit_handler_index;

	/*
	 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
	 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
	 * querying dirty_bitmap, we only need to kick all vcpus out of guest
	 * mode as if vcpus is in root mode, the PML buffer must has been
	 * flushed already. Note, PML is never enabled in hardware while
	 * running L2.
 */
	if (enable_pml && !is_guest_mode(vcpu))
		vmx_flush_pml_buffer(vcpu);

	/*
	 * KVM should never reach this point with a pending nested VM-Enter.
	 * More specifically, short-circuiting VM-Entry to emulate L2 due to
	 * invalid guest state should never happen as that means KVM knowingly
	 * allowed a nested VM-Enter with an invalid vmcs12. More below.
	 */
	if (KVM_BUG_ON(vcpu->arch.nested_run_pending, vcpu->kvm))
		return -EIO;

	if (is_guest_mode(vcpu)) {
		/*
		 * PML is never enabled when running L2, bail immediately if a
		 * PML full exit occurs as something is horribly wrong.
		 */
		if (exit_reason.basic == EXIT_REASON_PML_FULL)
			goto unexpected_vmexit;

		/*
		 * The host physical addresses of some pages of guest memory
		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
		 * Page). The CPU may write to these pages via their host
		 * physical address while L2 is running, bypassing any
		 * address-translation-based dirty tracking (e.g. EPT write
		 * protection).
		 *
		 * Mark them dirty on every exit from L2 to prevent them from
		 * getting out of sync with dirty tracking.
		 */
		nested_vmx_mark_all_vmcs12_pages_dirty(vcpu);

		/*
		 * Synthesize a triple fault if L2 state is invalid. In normal
		 * operation, nested VM-Enter rejects any attempt to enter L2
		 * with invalid state. However, those checks are skipped if
		 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
		 * L2 state is invalid, it means either L1 modified SMRAM state
		 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
		 * doing so is architecturally allowed in the RSM case, and is
		 * the least awful solution for the userspace case without
		 * risking false positives.
		 */
		if (vmx->vt.emulation_required) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
			return 1;
		}

		if (nested_vmx_reflect_vmexit(vcpu))
			return 1;
	}

	/* If guest state is invalid, start emulating. L2 is handled above. */
	if (vmx->vt.emulation_required)
		return handle_invalid_guest_state(vcpu);

	if (exit_reason.failed_vmentry) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (unlikely(vmx->fail)) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * A VM-Exit that occurred while delivering an event (valid IDT
	 * vectoring info) is only expected for the exit reasons listed below;
	 * anything else is reported to userspace.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
	     exit_reason.basic != EXIT_REASON_PML_FULL &&
	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
	     exit_reason.basic != EXIT_REASON_NOTIFY &&
	     exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
		kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (!vmx_interrupt_blocked(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't help us find the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled. So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
6785 */ 6786 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6787 "state on VCPU %d after 1 s timeout\n", 6788 __func__, vcpu->vcpu_id); 6789 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6790 } 6791 } 6792 6793 if (exit_fastpath != EXIT_FASTPATH_NONE) 6794 return 1; 6795 6796 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6797 goto unexpected_vmexit; 6798 #ifdef CONFIG_MITIGATION_RETPOLINE 6799 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6800 return kvm_emulate_wrmsr(vcpu); 6801 else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6802 return handle_wrmsr_imm(vcpu); 6803 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6804 return handle_preemption_timer(vcpu); 6805 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6806 return handle_interrupt_window(vcpu); 6807 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6808 return handle_external_interrupt(vcpu); 6809 else if (exit_reason.basic == EXIT_REASON_HLT) 6810 return kvm_emulate_halt(vcpu); 6811 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6812 return handle_ept_misconfig(vcpu); 6813 #endif 6814 6815 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6816 kvm_vmx_max_exit_handlers); 6817 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6818 goto unexpected_vmexit; 6819 6820 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6821 6822 unexpected_vmexit: 6823 dump_vmcs(vcpu); 6824 kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); 6825 return 0; 6826 } 6827 6828 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6829 { 6830 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6831 6832 /* 6833 * Exit to user space when bus lock detected to inform that there is 6834 * a bus lock in guest. 
 */
	if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
		if (ret > 0)
			vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

		vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
		return 0;
	}
	return ret;
}

/*
 * Update the TPR threshold in vmcs01.  Nothing to do if L2 is active and L1
 * is using the TPR shadow, as the threshold then belongs to L1's vmcs12.
 */
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	int tpr_threshold;

	if (is_guest_mode(vcpu) &&
	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	guard(vmx_vmcs01)(vcpu);

	/* 0 if there's no pending IRR vector or the TPR doesn't mask it. */
	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
	vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}

/*
 * Reconfigure vmcs01's secondary execution controls to match the vCPU's
 * local APIC mode (disabled/xAPIC/x2APIC).
 */
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 sec_exec_control;

	if (!lapic_in_kernel(vcpu))
		return;

	/* Nothing to do if hardware supports neither form of APIC virtualization. */
	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;

	guard(vmx_vmcs01)(vcpu);

	sec_exec_control = secondary_exec_controls_get(vmx);
	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

	switch (kvm_get_apic_mode(vcpu)) {
	case LAPIC_MODE_INVALID:
		WARN_ONCE(true, "Invalid local APIC state");
		break;
	case LAPIC_MODE_DISABLED:
		break;
	case LAPIC_MODE_XAPIC:
		if (flexpriority_enabled) {
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

			/*
			 * Flush the TLB, reloading the APIC access page will
			 * only do so if its physical address has changed, but
			 * the guest may have inserted a non-APIC mapping into
			 * the TLB while the APIC access page was disabled.
			 *
			 * If L2 is active, immediately flush L1's TLB instead
			 * of requesting a flush of the current TLB, because
			 * the current TLB context is L2's.
			 */
			if (!is_guest_mode(vcpu))
				kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
			else if (!enable_ept)
				vpid_sync_context(vmx->vpid);
			else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa))
				vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa);
		}
		break;
	case LAPIC_MODE_X2APIC:
		if (cpu_has_vmx_virtualize_x2apic_mode())
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		break;
	}
	secondary_exec_controls_set(vmx, sec_exec_control);

	vmx_update_msr_bitmap_x2apic(vcpu);
}

/*
 * (Re)load the host physical address of the APIC-access page into vmcs01,
 * e.g. after the backing page has moved or the control was just enabled
 * (see KVM_REQ_APIC_PAGE_RELOAD in vmx_set_virtual_apic_mode()).
 */
void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
	const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *slot;
	struct page *refcounted_page;
	unsigned long mmu_seq;
	kvm_pfn_t pfn;
	bool writable;

	/* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */
	guard(vmx_vmcs01)(vcpu);

	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
	      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

	/*
	 * Explicitly grab the memslot using KVM's internal slot ID to ensure
	 * KVM doesn't unintentionally grab a userspace memslot. It _should_
	 * be impossible for userspace to create a memslot for the APIC when
	 * APICv is enabled, but paranoia won't hurt in this case.
	 */
	slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return;

	/*
	 * Ensure that the mmu_notifier sequence count is read before KVM
	 * retrieves the pfn from the primary MMU.  Note, the memslot is
	 * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
	 * in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	/*
	 * No need to retry if the memslot does not exist or is invalid.
	 * KVM controls the APIC-access page memslot, and only deletes the
	 * memslot if APICv is permanently inhibited, i.e. the memslot won't
	 * reappear.
	 */
	pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return;

	read_lock(&vcpu->kvm->mmu_lock);
	/* An invalidation raced with the fault; retry via a new reload request. */
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
	else
		vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));

	/*
	 * Do not pin the APIC access page in memory so that it can be freely
	 * migrated, the MMU notifier will call us again if it is migrated or
	 * swapped out. KVM backs the memslot with anonymous memory, the pfn
	 * should always point at a refcounted page (if the pfn is valid).
	 */
	if (!WARN_ON_ONCE(!refcounted_page))
		kvm_release_page_clean(refcounted_page);

	/*
	 * No need for a manual TLB flush at this point, KVM has already done a
	 * flush if there were SPTEs pointing at the previous page.
	 */
	read_unlock(&vcpu->kvm->mmu_lock);
}

/*
 * Update SVI (the high byte of the guest interrupt status) with the highest
 * in-service vector; max_isr == -1 means no interrupt is in service.
 */
void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
	u16 status;
	u8 old;

	if (max_isr == -1)
		max_isr = 0;

	/*
	 * Always update SVI in vmcs01, as SVI is only relevant for L2 if and
	 * only if Virtual Interrupt Delivery is enabled in vmcs12, and if VID
	 * is enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC.
/*
 * Program RVI, the vector of the highest-priority pending virtual interrupt,
 * into the low byte of the guest interrupt status field.  -1 (no pending
 * interrupt) is encoded as vector 0.
 */
static void vmx_set_rvi(int vector)
{
	u16 status;
	u8 old;

	if (vector == -1)
		vector = 0;

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = (u8)status & 0xff;
	if ((u8)vector != old) {
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

/*
 * Move pending posted interrupts from the PIR into the vAPIC IRR and notify
 * the rest of KVM as needed.  Returns the highest pending IRR vector, or a
 * negative errno if APICv is unexpectedly disabled.
 */
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	int max_irr;
	bool got_posted_interrupt;

	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
		return -EIO;

	if (pi_test_on(&vt->pi_desc)) {
		pi_clear_on(&vt->pi_desc);
		/*
		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
		got_posted_interrupt =
			kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
	} else {
		max_irr = kvm_lapic_find_highest_irr(vcpu);
		got_posted_interrupt = false;
	}

	/*
	 * Newly recognized interrupts are injected via either virtual interrupt
	 * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
	 * disabled in two cases:
	 *
	 * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
	 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
	 * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
	 * into L2, but KVM doesn't use virtual interrupt delivery to inject
	 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
	 *
	 * 2) If APICv is disabled for this vCPU, assigned devices may still
	 * attempt to post interrupts.  The posted interrupt vector will cause
	 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
	 */
	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
		vmx_set_rvi(max_irr);
	else if (got_posted_interrupt)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return max_irr;
}
/*
 * Write the EOI-exit bitmap (one bit per vector) into the VMCS.  Vectors with
 * their bit set cause an EOI-induced VM-Exit instead of being virtualized.
 */
void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}

/* Assembly trampolines for invoking host IRQ/NMI handlers (vmenter.S). */
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);

/*
 * Snapshot MSR_IA32_XFD_ERR for an XFD-induced #NM while IRQs are still
 * disabled, before the host can clobber it.
 */
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
	/*
	 * Save xfd_err to guest_fpu before interrupt is enabled, so the
	 * MSR value is not clobbered by the host activity before the guest
	 * has chance to consume it.
	 *
	 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
	 * interception may have been caused by L1 interception.  Per the SDM,
	 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
	 *
	 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
	 * unlike CR2 and DR6, the value is not a payload that is attached to
	 * the #NM exception.
	 */
	if (is_xfd_nm_fault(vcpu))
		rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
/*
 * Handle exception-NMI VM-Exit work that must run before IRQs are re-enabled:
 * capture async-#PF flags, stash XFD_ERR for #NM, and service machine checks.
 */
static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
	/* if exit due to PF check for async PF */
	if (is_page_fault(intr_info))
		vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
	/* if exit due to NM, handle before interrupts are enabled */
	else if (is_nm_fault(intr_info))
		handle_nm_fault_irqoff(vcpu);
	/* Handle machine checks before interrupts are enabled */
	else if (is_machine_check(intr_info))
		kvm_machine_check();
}

/*
 * Dispatch a host IRQ that caused an external-interrupt VM-Exit to the
 * kernel's handler for the exiting vector, with IRQs still disabled.
 */
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
					     u32 intr_info)
{
	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;

	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
		return;

	/*
	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
	 * when it's available even if FRED isn't fully enabled, e.g. even if
	 * FRED isn't supported in hardware, in order to avoid the indirect
	 * CALL in the non-FRED path.
	 */
	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
	if (IS_ENABLED(CONFIG_X86_FRED))
		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
	else
		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
	kvm_after_interrupt(vcpu);

	vcpu->arch.at_instruction_boundary = true;
}
7135 */ 7136 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7137 if (IS_ENABLED(CONFIG_X86_FRED)) 7138 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7139 else 7140 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7141 kvm_after_interrupt(vcpu); 7142 7143 vcpu->arch.at_instruction_boundary = true; 7144 } 7145 7146 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7147 { 7148 if (to_vt(vcpu)->emulation_required) 7149 return; 7150 7151 switch (vmx_get_exit_reason(vcpu).basic) { 7152 case EXIT_REASON_EXTERNAL_INTERRUPT: 7153 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7154 break; 7155 case EXIT_REASON_EXCEPTION_NMI: 7156 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7157 break; 7158 case EXIT_REASON_MCE_DURING_VMENTRY: 7159 kvm_machine_check(); 7160 break; 7161 default: 7162 break; 7163 } 7164 } 7165 7166 /* 7167 * The kvm parameter can be NULL (module initialization, or invocation before 7168 * VM creation). Be sure to check the kvm parameter before using it. 7169 */ 7170 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7171 { 7172 switch (index) { 7173 case MSR_IA32_SMBASE: 7174 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7175 return false; 7176 /* 7177 * We cannot do SMM unless we can run the guest in big 7178 * real mode. 7179 */ 7180 return enable_unrestricted_guest || emulate_invalid_guest_state; 7181 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7182 return nested; 7183 case MSR_AMD64_VIRT_SPEC_CTRL: 7184 case MSR_AMD64_TSC_RATIO: 7185 /* This is AMD only. 
/*
 * Re-derive virtual-NMI blocking state after a VM-Exit, per the SDM rules on
 * when the "blocking by NMI" interruptibility bit must be restored.
 */
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;

	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	if (enable_vnmi) {
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;

		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
		 */
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
}

/*
 * Decode the IDT-vectoring (or VM-entry) information for an event whose
 * delivery was interrupted by a VM-Exit and requeue the event so it is
 * re-delivered on the next VM-Entry.  The instr_len/error_code VMCS fields
 * differ between the VM-Exit and cancelled-VM-Entry callers.
 */
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	/* Start from a clean slate; the event is re-queued below if valid. */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
		 */
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_HARD_EXCEPTION: {
		u32 error_code = 0;

		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
			error_code = vmcs_read32(error_code_field);

		kvm_requeue_exception(vcpu, vector,
				      idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
				      error_code);
		break;
	}
	case INTR_TYPE_SOFT_INTR:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}
/* Requeue an event whose delivery was interrupted by a VM-Exit. */
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

/* Undo a pending event injection, e.g. when the VM-Entry is aborted. */
void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

/*
 * Program the VMCS MSR-load/store lists so that perf-owned MSRs with
 * differing host/guest values are atomically switched on VM-Entry/VM-Exit.
 */
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;
	struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);

	/* A mediated PMU doesn't use the atomic switch lists. */
	if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu))
		return;

	pmu->host_cross_mapped_mask = 0;
	if (pmu->pebs_enable & pmu->global_ctrl)
		intel_pmu_cross_mapped_check(pmu);

	/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
	msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
	if (!msrs)
		return;

	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					      msrs[i].host);
}
/*
 * Refresh the cached guest PERF_GLOBAL_CTRL after VM-Exit when the guest can
 * write the MSR without interception, either from the VM-Exit MSR-store list
 * or from the dedicated VMCS save field when the CPU supports it.
 */
static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
		return;

	if (!cpu_has_save_perf_global_ctrl()) {
		int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore,
						       MSR_CORE_PERF_GLOBAL_CTRL);

		if (WARN_ON_ONCE(slot < 0))
			return;

		pmu->global_ctrl = vmx->msr_autostore.val[slot].value;
		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl);
		return;
	}

	pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
}

/*
 * Program the VMX preemption timer: 0 to force an immediate exit, the scaled
 * TSC delta for a pending hrtimer deadline, or -1 (soft-disabled) otherwise.
 */
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (force_immediate_exit) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (vmx->hv_deadline_tsc != -1) {
		tscl = rdtsc();
		if (vmx->hv_deadline_tsc > tscl)
			/* set_hv_timer ensures the delta fits in 32-bits */
			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
					  cpu_preemption_timer_multi);
		else
			delta_tsc = 0;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
	}
}

/* Update vmcs.HOST_RSP if it changed; avoids a VMWRITE on the hot path. */
void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
		vmx->loaded_vmcs->host_state.rsp = host_rsp;
		vmcs_writel(HOST_RSP, host_rsp);
	}
}
/*
 * Restore the host's IA32_SPEC_CTRL after VM-Exit, optionally saving the
 * guest's value first.  Ordering is security-sensitive; ends with a
 * speculation barrier.
 */
void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
					unsigned int flags)
{
	u64 hostval = this_cpu_read(x86_spec_ctrl_current);

	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
		return;

	if (flags & VMX_RUN_SAVE_SPEC_CTRL)
		vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);

	/*
	 * If the guest/host SPEC_CTRL values differ, restore the host value.
	 *
	 * For legacy IBRS, the IBRS bit always needs to be written after
	 * transitioning from a less privileged predictor mode, regardless of
	 * whether the guest/host values differ.
	 */
	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
	    vmx->spec_ctrl != hostval)
		native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);

	barrier_nospec();
}

/*
 * Try to handle the VM-Exit entirely in the fastpath (without dropping out to
 * the full exit handlers).  Returns EXIT_FASTPATH_NONE for the slow path.
 */
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
					     bool force_immediate_exit)
{
	/*
	 * If L2 is active, only some VMX preemption timer exits can be handled
	 * in the fastpath; all other exits must use the slow path.
	 */
	if (is_guest_mode(vcpu) &&
	    vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
		return EXIT_FASTPATH_NONE;

	switch (vmx_get_exit_reason(vcpu).basic) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_wrmsr(vcpu);
	case EXIT_REASON_MSR_WRITE_IMM:
		return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
						 vmx_get_msr_imm_reg(vcpu));
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
	case EXIT_REASON_HLT:
		return handle_fastpath_hlt(vcpu);
	case EXIT_REASON_INVD:
		return handle_fastpath_invd(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}
/*
 * If the VM-Exit was caused by an NMI, invoke the host NMI handler while
 * still in the noinstr section with NMIs logically "in progress".
 */
noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
{
	if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
	    !is_nmi(vmx_get_intr_info(vcpu)))
		return;

	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
	if (cpu_feature_enabled(X86_FEATURE_FRED))
		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
	else
		vmx_do_nmi_irqoff();
	kvm_after_interrupt(vcpu);
}

/*
 * The innermost VM-Entry/VM-Exit sequence.  Runs in the .noinstr.text
 * section with IRQs disabled; statement order (CR2 save/restore, FB-clear
 * toggles, exit reason capture) is load-bearing.
 */
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					unsigned int flags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	guest_state_enter_irqoff();

	vmx_l1d_flush(vcpu);

	vmx_disable_fb_clear(vmx);

	/* Only write CR2 if it differs; the write is comparatively slow. */
	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				   flags);

	vcpu->arch.cr2 = native_read_cr2();
	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;

	vmx->idt_vectoring_info = 0;

	vmx_enable_fb_clear(vmx);

	if (unlikely(vmx->fail)) {
		/* Synthetic marker for a failed VMLAUNCH/VMRESUME. */
		vmx->vt.exit_reason.full = 0xdead;
		goto out;
	}

	vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
	if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
		vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	vmx_handle_nmi(vcpu);

out:
	guest_state_exit_irqoff();
}
/*
 * Outer VM-Entry path: synchronize VMCS state with KVM's software state,
 * perform the actual VM-Enter/VM-Exit via vmx_vcpu_enter_exit(), then do
 * post-exit bookkeeping.  Returns the fastpath disposition for the exit.
 */
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/*
	 * Don't enter VMX if guest state is invalid, let the exit handler
	 * start emulation until we arrive back to a valid state.  Synthesize a
	 * consistency check VM-Exit due to invalid guest state and bail.
	 */
	if (unlikely(vmx->vt.emulation_required)) {
		vmx->fail = 0;

		vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
		vmx->vt.exit_reason.failed_vmentry = 1;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
		vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
		vmx->vt.exit_intr_info = 0;
		return EXIT_FASTPATH_NONE;
	}

	trace_kvm_entry(vcpu, force_immediate_exit);

	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
	 */
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

	/* Propagate dirty RSP/RIP into the VMCS. */
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
	vcpu->arch.regs_dirty = 0;

	if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
		set_debugreg(vcpu->arch.dr6, 6);

	if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
		vmx_reload_guest_debugctl(vcpu);

	/*
	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
	 * it switches back to the current->mm, which can occur in KVM context
	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
	 * toggles a static key while handling a VM-Exit.
	 */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	pt_guest_enter(vmx);

	atomic_switch_perf_msrs(vmx);
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu, force_immediate_exit);
	else if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	kvm_wait_lapic_expire(vcpu);

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));

	/* All fields are clean at this point */
	if (kvm_is_using_evmcs()) {
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
	}

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vcpu->arch.host_debugctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restore segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	pt_guest_exit(vmx);

	if (is_guest_mode(vcpu)) {
		/*
		 * Track VMLAUNCH/VMRESUME that have made past guest state
		 * checking.
		 */
		if (vcpu->arch.nested_run_pending &&
		    !vmx_get_exit_reason(vcpu).failed_vmentry)
			++vcpu->stat.nested_run;

		vcpu->arch.nested_run_pending = 0;
	}

	if (unlikely(vmx->fail))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
		return EXIT_FASTPATH_NONE;

	vmx->loaded_vmcs->launched = 1;

	vmx_refresh_guest_perf_global_control(vcpu);

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
/* Free all per-vCPU VMX resources; mirrors vmx_vcpu_create()'s allocations. */
void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
	free_page((unsigned long)vmx->ve_info);
}

/*
 * Allocate and initialize per-vCPU VMX state: VPID, PML buffer, vmcs01,
 * uret MSR masks, #VE info page, and the IPIv PID-table entry.  Unwinds via
 * the goto-cleanup chain on failure; returns 0 or a negative errno.
 */
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vmx_uret_msr *tsx_ctrl;
	struct vcpu_vmx *vmx;
	int i, err;

	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest), etc.
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	for (i = 0; i < kvm_nr_uret_msrs; ++i)
		vmx->guest_uret_msrs[i].mask = -1ull;
	if (boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
		 * Keep the host value unchanged to avoid changing CPUID bits
		 * under the host kernel's feet.
		 */
		tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (tsx_ctrl)
			tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	/*
	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
	 * nested (L1) hypervisor and Hyper-V in L0 supports it.  Enable the
	 * feature only for vmcs01, KVM currently isn't equipped to realize any
	 * performance benefits from enabling it for vmcs02.
	 */
	if (kvm_is_using_evmcs() &&
	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		evmcs->hv_enlightenments_control.msr_bitmap = 1;
	}

	vmx->loaded_vmcs = &vmx->vmcs01;

	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = kvm_alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	err = -ENOMEM;
	if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct page *page;

		BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);

		/* ve_info must be page aligned. */
		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!page)
			goto free_vmcs;

		vmx->ve_info = page_to_virt(page);
	}

	if (vmx_can_use_ipiv(vcpu))
		WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
			   __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);

	return 0;

free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}
*/ 7739 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7740 if (!page) 7741 goto free_vmcs; 7742 7743 vmx->ve_info = page_to_virt(page); 7744 } 7745 7746 if (vmx_can_use_ipiv(vcpu)) 7747 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7748 __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); 7749 7750 return 0; 7751 7752 free_vmcs: 7753 free_loaded_vmcs(vmx->loaded_vmcs); 7754 free_pml: 7755 vmx_destroy_pml_buffer(vmx); 7756 free_vpid: 7757 free_vpid(vmx->vpid); 7758 return err; 7759 } 7760 7761 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7762 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7763 7764 int vmx_vm_init(struct kvm *kvm) 7765 { 7766 if (!ple_gap) 7767 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 7768 7769 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7770 switch (l1tf_mitigation) { 7771 case L1TF_MITIGATION_OFF: 7772 case L1TF_MITIGATION_FLUSH_NOWARN: 7773 /* 'I explicitly don't care' is set */ 7774 break; 7775 case L1TF_MITIGATION_AUTO: 7776 case L1TF_MITIGATION_FLUSH: 7777 case L1TF_MITIGATION_FLUSH_NOSMT: 7778 case L1TF_MITIGATION_FULL: 7779 /* 7780 * Warn upon starting the first VM in a potentially 7781 * insecure environment. 
7782 */ 7783 if (sched_smt_active()) 7784 pr_warn_once(L1TF_MSG_SMT); 7785 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7786 pr_warn_once(L1TF_MSG_L1D); 7787 break; 7788 case L1TF_MITIGATION_FULL_FORCE: 7789 /* Flush is enforced */ 7790 break; 7791 } 7792 } 7793 7794 if (enable_pml) 7795 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; 7796 return 0; 7797 } 7798 7799 static inline bool vmx_ignore_guest_pat(struct kvm *kvm) 7800 { 7801 /* 7802 * Non-coherent DMA devices need the guest to flush CPU properly. 7803 * In that case it is not possible to map all guest RAM as WB, so 7804 * always trust guest PAT. 7805 */ 7806 return !kvm_arch_has_noncoherent_dma(kvm) && 7807 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); 7808 } 7809 7810 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7811 { 7812 /* 7813 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7814 * with cacheable accesses will result in Machine Checks. 7815 */ 7816 if (is_mmio) 7817 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7818 7819 /* Force WB if ignoring guest PAT */ 7820 if (vmx_ignore_guest_pat(vcpu->kvm)) 7821 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7822 7823 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7824 } 7825 7826 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7827 { 7828 /* 7829 * These bits in the secondary execution controls field 7830 * are dynamic, the others are mostly based on the hypervisor 7831 * architecture and the guest's CPUID. Do not touch the 7832 * dynamic bits. 
/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID.  Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	/* All CR0 bits are allowed-1; CR4 starts with only PCE allowed. */
	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

	/* Allow a CR4 bit iff the corresponding guest CPUID bit is set. */
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);		\
} while (0)

	entry = kvm_find_cpuid_entry(vcpu, 0x1);
	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
	cr4_fixed1_update(X86_CR4_CET,        ecx, feature_bit(SHSTK));
	cr4_fixed1_update(X86_CR4_CET,        edx, feature_bit(IBT));

	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
	cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));

#undef cr4_fixed1_update
}
/*
 * Refresh the vCPU's Intel PT configuration from guest CPUID leaf 0x14:
 * cache the raw capability registers, then build the RTIT_CTL bitmask of
 * bits the guest is NOT allowed to set given its advertised capabilities.
 */
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	/* Cache CPUID.0x14 subleaves; bail if any subleaf is missing. */
	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
		if (!best)
			return;
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
								PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
			RTIT_CTL_BRANCH_EN);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
	 * will inject an #GP
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
					      RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
					      RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* unmask address range configure area */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7932 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 7933 7934 /* 7935 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7936 */ 7937 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7938 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 7939 RTIT_CTL_MTC_RANGE); 7940 7941 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7942 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7943 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7944 RTIT_CTL_PTW_EN); 7945 7946 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7947 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7948 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7949 7950 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7951 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7952 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7953 7954 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7955 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7956 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7957 7958 /* unmask address range configure area */ 7959 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7960 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7961 } 7962 7963 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7964 { 7965 struct vcpu_vmx *vmx = to_vmx(vcpu); 7966 7967 /* 7968 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7969 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7970 * set if and only if XSAVE is supported. 
	 */
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
		guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);

	vmx_setup_uret_msrs(vmx);

	if (cpu_has_secondary_exec_ctrls())
		vmcs_set_secondary_exec_control(vmx,
						vmx_secondary_exec_control(vmx));

	/*
	 * Allow the SMX/non-SMX VMXON-enabled bits in IA32_FEATURE_CONTROL if
	 * and only if VMX is exposed to the guest.
	 */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		nested_vmx_cr_fixed1_bits_update(vcpu);

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
	    guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);

	if (boot_cpu_has(X86_FEATURE_RTM)) {
		struct vmx_uret_msr *msr;
		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (msr) {
			bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
			/* Disable RTM in hardware if the guest can't see it. */
			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
		}
	}

	set_cr4_guest_host_mask(vmx);

	vmx_write_encls_bitmap(vcpu, NULL);
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_SGX_LC_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_SGX_LC_ENABLED;

	/* Refresh #PF interception to account for MAXPHYADDR changes. */
	vmx_update_exception_bitmap(vcpu);
}

/*
 * Compute the IA32_PERF_CAPABILITIES value that KVM will advertise as
 * supported, based on host capabilities and KVM-imposed restrictions.
 */
static __init u64 vmx_get_perf_capabilities(void)
{
	u64 perf_cap = PERF_CAP_FW_WRITES;
	u64 host_perf_cap = 0;

	if (!enable_pmu)
		return 0;

	if (boot_cpu_has(X86_FEATURE_PDCM))
		rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);

	if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) &&
	    !enable_mediated_pmu) {
		x86_perf_get_lbr(&vmx_lbr_caps);

		/*
		 * KVM requires LBR callstack support, as the overhead due to
		 * context switching LBRs without said support is too high.
		 * See intel_pmu_create_guest_lbr_event() for more info.
		 */
		if (!vmx_lbr_caps.has_callstack)
			memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
		else if (vmx_lbr_caps.nr)
			perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
	}

	if (vmx_pebs_supported()) {
		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;

		/*
		 * Disallow adaptive PEBS as it is functionally broken, can be
		 * used by the guest to read *host* LBRs, and can be used to
		 * bypass userspace event filters.  To correctly and safely
		 * support adaptive PEBS, KVM needs to:
		 *
		 * 1. Account for the ADAPTIVE flag when (re)programming fixed
		 *    counters.
		 *
		 * 2. Gain support from perf (or take direct control of counter
		 *    programming) to support events without adaptive PEBS
		 *    enabled for the hardware counter.
		 *
		 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
		 *    adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
		 *
		 * 4. Document which PMU events are effectively exposed to the
		 *    guest via adaptive PEBS, and make adaptive PEBS mutually
		 *    exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
		 */
		perf_cap &= ~PERF_CAP_PEBS_BASELINE;
	}

	return perf_cap;
}

/*
 * Establish which CPUID features KVM can expose to VMX guests, based on the
 * generic x86 caps, hardware VMX capabilities, and module params.
 */
static __init void vmx_set_cpu_caps(void)
{
	kvm_initialize_cpu_caps();

	/* CPUID 0x1 */
	if (nested)
		kvm_cpu_cap_set(X86_FEATURE_VMX);

	/* CPUID 0x7 */
	if (kvm_mpx_supported())
		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
	if (!cpu_has_vmx_invpcid())
		kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
	if (vmx_pt_mode_is_host_guest())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
	if (vmx_pebs_supported()) {
		kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
		kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
	}

	if (!enable_pmu)
		kvm_cpu_cap_clear(X86_FEATURE_PDCM);
	kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();

	if (!enable_sgx) {
		kvm_cpu_cap_clear(X86_FEATURE_SGX);
		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
		kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
	}

	/* UMIP can be advertised even without hardware support (emulated). */
	if (vmx_umip_emulated())
		kvm_cpu_cap_set(X86_FEATURE_UMIP);

	/* CPUID 0xD.1 */
	if (!cpu_has_vmx_xsaves())
		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

	/* CPUID 0x80000001 and 0x7 (RDPID) */
	if (!cpu_has_vmx_rdtscp()) {
		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
	}

	if (cpu_has_vmx_waitpkg())
		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);

	/*
	 * Disable CET if unrestricted_guest is unsupported as KVM doesn't
	 * enforce CET HW behaviors in emulator. On platforms with
	 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
	 * fails, so disable CET in this case too.
	 */
	if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
	    !cpu_has_vmx_basic_no_hw_errcode_cc()) {
		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
		kvm_cpu_cap_clear(X86_FEATURE_IBT);
	}

	kvm_setup_xss_caps();
	kvm_finalize_cpu_caps();
}

/*
 * Determine whether an IN/OUT/INS/OUTS executed by L2 should be intercepted
 * by L1, per vmcs12's IO bitmap / unconditional-IO-exiting controls.  On an
 * intercept, *exit_qualification is filled in per the SDM's IO-instruction
 * exit qualification layout (port, size, string, REP, immediate).
 */
static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
				  struct x86_instruction_info *info,
				  unsigned long *exit_qualification)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned short port;
	int size;
	bool imm;

	/*
	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
	 * control.
	 *
	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
	 */
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	/* For IN/INS the port is the source operand; for OUT/OUTS the dest. */
	if (info->intercept == x86_intercept_in ||
	    info->intercept == x86_intercept_ins) {
		port = info->src_val;
		size = info->dst_bytes;
		imm  = info->src_type == OP_IMM;
	} else {
		port = info->dst_val;
		size = info->src_bytes;
		imm  = info->dst_type == OP_IMM;
	}

	*exit_qualification = ((unsigned long)port << 16) | (size - 1);

	/* Bit 4: string instruction (INS/OUTS). */
	if (info->intercept == x86_intercept_ins ||
	    info->intercept == x86_intercept_outs)
		*exit_qualification |= BIT(4);

	/* Bit 5: REP prefixed. */
	if (info->rep_prefix)
		*exit_qualification |= BIT(5);

	/* Bit 6: operand encoding is an immediate. */
	if (imm)
		*exit_qualification |= BIT(6);

	return nested_vmx_check_io_bitmaps(vcpu, port, size);
}

/*
 * Check whether an instruction being emulated on behalf of L2 is intercepted
 * by L1, and if so synthesize the corresponding nested VM-exit.
 */
int vmx_check_intercept(struct kvm_vcpu *vcpu,
			struct x86_instruction_info *info,
			enum x86_intercept_stage stage,
			struct x86_exception *exception)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned long exit_qualification = 0;
	u32 vm_exit_reason;
	u64 exit_insn_len;

	switch (info->intercept) {
	case x86_intercept_rdpid:
		/*
		 * RDPID causes #UD if not enabled through secondary execution
		 * controls (ENABLE_RDTSCP).  Note, the implicit MSR access to
		 * TSC_AUX is NOT subject to interception, i.e. checking only
		 * the dedicated execution control is architecturally correct.
		 */
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
			exception->vector = UD_VECTOR;
			exception->error_code_valid = false;
			return X86EMUL_PROPAGATE_FAULT;
		}
		return X86EMUL_CONTINUE;

	case x86_intercept_in:
	case x86_intercept_ins:
	case x86_intercept_out:
	case x86_intercept_outs:
		if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
			return X86EMUL_CONTINUE;

		vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
		break;

	case x86_intercept_lgdt:
	case x86_intercept_lidt:
	case x86_intercept_lldt:
	case x86_intercept_ltr:
	case x86_intercept_sgdt:
	case x86_intercept_sidt:
	case x86_intercept_sldt:
	case x86_intercept_str:
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
			return X86EMUL_CONTINUE;

		if (info->intercept == x86_intercept_lldt ||
		    info->intercept == x86_intercept_ltr ||
		    info->intercept == x86_intercept_sldt ||
		    info->intercept == x86_intercept_str)
			vm_exit_reason = EXIT_REASON_LDTR_TR;
		else
			vm_exit_reason = EXIT_REASON_GDTR_IDTR;
		/*
		 * FIXME: Decode the ModR/M to generate the correct exit
		 *        qualification for memory operands.
		 */
		break;

	case x86_intercept_hlt:
		if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
			return X86EMUL_CONTINUE;

		vm_exit_reason = EXIT_REASON_HLT;
		break;

	case x86_intercept_pause:
		/*
		 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
		 * with vanilla NOPs in the emulator.  Apply the interception
		 * check only to actual PAUSE instructions.  Don't check
		 * PAUSE-loop-exiting, software can't expect a given PAUSE to
		 * exit, i.e. KVM is within its rights to allow L2 to execute
		 * the PAUSE.
		 */
		if ((info->rep_prefix != REPE_PREFIX) ||
		    !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
			return X86EMUL_CONTINUE;

		vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
		break;

	/* TODO: check more intercepts... */
	default:
		return X86EMUL_UNHANDLEABLE;
	}

	/* Instruction length is reported in the VM-exit; sanity check it. */
	exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
	if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
		return X86EMUL_UNHANDLEABLE;

	__nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
			    exit_insn_len);
	return X86EMUL_INTERCEPTED;
}

#ifdef CONFIG_X86_64
/*
 * (a << shift) / divisor, return 1 if overflow otherwise 0.
 * NOTE(review): assumes 0 < shift < 64 ('a >> (64 - shift)' is UB for
 * shift == 0); the only caller passes kvm_caps.tsc_scaling_ratio_frac_bits.
 */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	u64 low = a << shift, high = a >> (64 - shift);

	/* To avoid the overflow on divq */
	if (high >= divisor)
		return 1;

	/* Low hold the result, high hold rem which is discarded */
	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
	    "rm" (divisor), "0" (low), "1" (high));
	*result = low;

	return 0;
}

/*
 * Arm the VMX preemption timer to emulate the guest's APIC timer deadline.
 * Returns -ERANGE if the deadline can't be programmed; *expired is set if the
 * deadline has already passed (after lapic timer-advance adjustment).
 */
int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
		     bool *expired)
{
	struct vcpu_vmx *vmx;
	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;

	vmx = to_vmx(vcpu);
	tscl = rdtsc();
	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
						    ktimer->timer_advance_ns);

	/* Fire early by the timer-advance amount; clamp at "already expired". */
	if (delta_tsc > lapic_timer_advance_cycles)
		delta_tsc -= lapic_timer_advance_cycles;
	else
		delta_tsc = 0;

	/*
	 * Convert to host delta tsc if tsc scaling is enabled.
	 */
	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
	    delta_tsc && u64_shl_div_u64(delta_tsc,
				kvm_caps.tsc_scaling_ratio_frac_bits,
				vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
		return -ERANGE;

	/*
	 * If the delta tsc can't fit in the 32 bit after the multi shift,
	 * we can't use the preemption timer.
	 * It's possible that it fits on later vmentries, but checking
	 * on every vmentry is costly so we just use an hrtimer.
	 */
	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
		return -ERANGE;

	vmx->hv_deadline_tsc = tscl + delta_tsc;
	*expired = !delta_tsc;
	return 0;
}

void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
	/* -1 is the "disarmed" sentinel for the preemption timer deadline. */
	to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif

/* Toggle the PML (Page-Modification Logging) exec control per dirty logging. */
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(!enable_pml))
		return;

	/* Always operate on vmcs01, i.e. L1's configuration. */
	guard(vmx_vmcs01)(vcpu);

	/*
	 * Note, nr_memslots_dirty_logging can be changed concurrent with this
	 * code, but in that case another update request will be made and so
	 * the guest will never run with a stale PML value.
	 */
	if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
	else
		secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}

/* Allow the LMCE enable bit in IA32_FEATURE_CONTROL iff LMCE is exposed. */
void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_LMCE_ENABLED;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_LMCE_ENABLED;
}

#ifdef CONFIG_KVM_SMM
int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	/* we need a nested vmexit to enter SMM, postpone if run is pending */
	if (vcpu->arch.nested_run_pending)
		return -EBUSY;
	return !is_smm(vcpu);
}

int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
	 * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong
	 * as SMI and RSM only modify state that is saved and restored via
	 * SMRAM.  E.g. most MSRs are left untouched, but many are modified by
	 * VM-Exit and VM-Enter, and thus L2's values may be corrupted on
	 * SMI+RSM.
	 */
	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
	if (vmx->nested.smm.guest_mode)
		nested_vmx_vmexit(vcpu, -1, 0, 0);

	/* VMXON is architecturally cleared on SMI; stash it for RSM. */
	vmx->nested.smm.vmxon = vmx->nested.vmxon;
	vmx->nested.vmxon = false;
	vmx_clear_hlt(vcpu);
	return 0;
}

int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;

	/* Restore the VMXON state stashed by vmx_enter_smm(). */
	if (vmx->nested.smm.vmxon) {
		vmx->nested.vmxon = true;
		vmx->nested.smm.vmxon = false;
	}

	if (vmx->nested.smm.guest_mode) {
		/* Triple fault if the state is invalid.
		 */
		if (nested_vmx_check_restored_vmcs12(vcpu) < 0)
			return 1;

		ret = nested_vmx_enter_non_root_mode(vcpu, false);
		if (ret != NVMX_VMENTRY_SUCCESS)
			return 1;

		vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
		vmx->nested.smm.guest_mode = false;
	}
	return 0;
}

void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
	/* RSM will cause a vmexit anyway.  */
}
#endif

/* INIT is blocked while in VMX root operation (post-VMXON, outside L2). */
bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}

void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;

		/* Cancel+restart so the pinned hrtimer moves to the new pCPU. */
		if (hrtimer_try_to_cancel(timer) == 1)
			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
}

void vmx_hardware_unsetup(void)
{
	/* Unregister the posted-interrupt wakeup handler before teardown. */
	kvm_set_posted_intr_wakeup_handler(NULL);

	if (nested)
		nested_vmx_hardware_unsetup();
}

void vmx_vm_destroy(struct kvm *kvm)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

	/* Free the per-VM PID table (used for IPI virtualization). */
	free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
}

/*
 * Note, the SDM states that the linear address is masked *after* the modified
 * canonicality check, whereas KVM masks (untags) the address and then performs
 * a "normal" canonicality check.  Functionally, the two methods are identical,
 * and when the masking occurs relative to the canonicality check isn't visible
 * to software, i.e. KVM's behavior doesn't violate the SDM.
 */
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
{
	int lam_bit;
	unsigned long cr3_bits;

	/* LAM applies only to data accesses; fetches et al. are never tagged. */
	if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
		return gva;

	if (!is_64_bit_mode(vcpu))
		return gva;

	/*
	 * Bit 63 determines if the address should be treated as user address
	 * or a supervisor address.
	 */
	if (!(gva & BIT_ULL(63))) {
		cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
		if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
			return gva;

		/* LAM_U48 is ignored if LAM_U57 is set. */
		lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
	} else {
		if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
			return gva;

		lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
	}

	/*
	 * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
	 * Bit 63 is retained from the raw virtual address so that untagging
	 * doesn't change a user access to a supervisor access, and vice versa.
	 */
	return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}

/* PMI handler for Intel PT ToPA interrupts that arrive while in the guest. */
static unsigned int vmx_handle_intel_pt_intr(void)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	/* '0' on failure so that the !PT case can use a RET0 static call. */
	if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
		return 0;

	kvm_make_request(KVM_REQ_PMI, vcpu);
	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
		  (unsigned long *)&vcpu->arch.pmu.global_status);
	return 1;
}

static __init void vmx_setup_user_return_msrs(void)
{

	/*
	 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
	 * will emulate SYSCALL in legacy mode if the vendor string in guest
	 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!"
	 * To support this emulation, MSR_STAR is included in the list for
	 * i386, but is never loaded into hardware.  MSR_CSTAR is also never
	 * loaded into hardware and is here purely for emulation purposes.
	 */
	const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
		MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
		MSR_EFER, MSR_TSC_AUX, MSR_STAR,
		MSR_IA32_TSX_CTRL,
	};
	int i;

	/* The list must exactly fill the user-return MSR slots. */
	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);

	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
		kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}

/* Derive the MKTME KeyID bits that must be treated as reserved in SPTEs. */
static void __init vmx_setup_me_spte_mask(void)
{
	u64 me_mask = 0;

	/*
	 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
	 * kvm_host.maxphyaddr.  On MKTME and/or TDX capable systems,
	 * boot_cpu_data.x86_phys_bits holds the actual physical address
	 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
	 * MAXPHYADDR reported by CPUID.  Those bits between are KeyID bits.
	 */
	if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
		me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
				    kvm_host.maxphyaddr - 1);

	/*
	 * Unlike SME, host kernel doesn't support setting up any
	 * MKTME KeyID on Intel platforms.  No memory encryption
	 * bits should be included into the SPTE.
	 */
	kvm_mmu_set_me_spte_mask(0, me_mask);
}

/*
 * One-time hardware/module setup: probe VMX capabilities, sanitize module
 * params that hardware can't honor, and configure the MMU, caps and ops.
 */
__init int vmx_hardware_setup(void)
{
	unsigned long host_bndcfgs;
	struct desc_ptr dt;
	int r;

	store_idt(&dt);
	host_idt_base = dt.address;

	vmx_setup_user_return_msrs();

	if (boot_cpu_has(X86_FEATURE_MPX)) {
		rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
		WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
	}

	if (!cpu_has_vmx_mpx())
		kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
					     XFEATURE_MASK_BNDCSR);

	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
		enable_vpid = 0;

	if (!cpu_has_vmx_ept() ||
	    !cpu_has_vmx_ept_4levels() ||
	    !cpu_has_vmx_ept_mt_wb() ||
	    !cpu_has_vmx_invept_global())
		enable_ept = 0;

	/* NX support is required for shadow paging. */
	if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Shadow paging doesn't have a (further) performance penalty
	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
	 * by default
	 */
	if (!enable_ept)
		allow_smaller_maxphyaddr = true;

	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
		enable_ept_ad_bits = 0;

	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
		enable_unrestricted_guest = 0;

	if (!cpu_has_vmx_flexpriority())
		flexpriority_enabled = 0;

	if (!cpu_has_virtual_nmis())
		enable_vnmi = 0;

#ifdef CONFIG_X86_SGX_KVM
	if (!cpu_has_vmx_encls_vmexit())
		enable_sgx = false;
#endif

	/*
	 * set_apic_access_page_addr() is used to reload apic access
	 * page upon invalidation.  No need to do anything if not
	 * using the APIC_ACCESS_ADDR VMCS field.
	 */
	if (!flexpriority_enabled)
		vt_x86_ops.set_apic_access_page_addr = NULL;

	if (!cpu_has_vmx_tpr_shadow())
		vt_x86_ops.update_cr8_intercept = NULL;

#if IS_ENABLED(CONFIG_HYPERV)
	/* Use Hyper-V's enlightened remote TLB flush when available. */
	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
	    && enable_ept) {
		vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
		vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
	}
#endif

	/* Zero all PLE (pause-loop exiting) knobs if hardware lacks PLE. */
	if (!cpu_has_vmx_ple()) {
		ple_gap = 0;
		ple_window = 0;
		ple_window_grow = 0;
		ple_window_max = 0;
		ple_window_shrink = 0;
	}

	if (!cpu_has_vmx_apicv())
		enable_apicv = 0;
	if (!enable_apicv)
		vt_x86_ops.sync_pir_to_irr = NULL;

	/* IPI virtualization depends on APICv. */
	if (!enable_apicv || !cpu_has_vmx_ipiv())
		enable_ipiv = false;

	if (cpu_has_vmx_tsc_scaling())
		kvm_caps.has_tsc_control = true;

	kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
	kvm_caps.tsc_scaling_ratio_frac_bits = 48;
	kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
	kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	if (enable_ept)
		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
				      cpu_has_vmx_ept_execute_only());
	else
		vt_x86_ops.get_mt_mask = NULL;

	/*
	 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
	 * bits to shadow_zero_check.
	 */
	vmx_setup_me_spte_mask();

	kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
			  ept_caps_to_lpage_level(vmx_capability.ept));

	/*
	 * Only enable PML when hardware supports PML feature, and both EPT
	 * and EPT A/D bit features are enabled -- PML depends on them to work.
	 */
	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
		enable_pml = 0;

	if (!cpu_has_vmx_preemption_timer())
		enable_preemption_timer = false;

	if (enable_preemption_timer) {
		/* Assume a 5 GHz TSC if the actual rate is unknown. */
		u64 use_timer_freq = 5000ULL * 1000 * 1000;

		cpu_preemption_timer_multi =
			vmx_misc_preemption_timer_rate(vmcs_config.misc);

		if (tsc_khz)
			use_timer_freq = (u64)tsc_khz * 1000;
		use_timer_freq >>= cpu_preemption_timer_multi;

		/*
		 * KVM "disables" the preemption timer by setting it to its max
		 * value.  Don't use the timer if it might cause spurious exits
		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
		 */
		if (use_timer_freq > 0xffffffffu / 10)
			enable_preemption_timer = false;
	}

	if (!enable_preemption_timer) {
		vt_x86_ops.set_hv_timer = NULL;
		vt_x86_ops.cancel_hv_timer = NULL;
	}

	kvm_caps.supported_mce_cap |= MCG_LMCE_P;
	kvm_caps.supported_mce_cap |= MCG_CMCI_P;

	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
		return -EINVAL;
	if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
		pt_mode = PT_MODE_SYSTEM;
	if (pt_mode == PT_MODE_HOST_GUEST)
		vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
	else
		vt_init_ops.handle_intel_pt_intr = NULL;

	setup_default_sgx_lepubkeyhash();

	vmx_set_cpu_caps();

	/*
	 * Configure nested capabilities after core CPU capabilities so that
	 * nested support can be conditional on base support, e.g. so that KVM
	 * can hide/show features based on kvm_cpu_cap_has().
	 */
	if (nested) {
		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
		if (r)
			return r;
	}

	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);

	/*
	 * On Intel CPUs that lack self-snoop feature, letting the guest control
	 * memory types may result in unexpected behavior.
	 * So always ignore guest
	 * PAT on those CPUs and map VM as writeback, not allowing userspace to
	 * disable the quirk.
	 *
	 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
	 * supported, UC is slow enough to cause issues with some older guests (e.g.
	 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
	 * map the video RAM, causing wayland desktop to fail to get started
	 * correctly). To avoid breaking those older guests that rely on KVM to force
	 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
	 * safer (for performance) default behavior.
	 *
	 * On top of this, non-coherent DMA devices need the guest to flush CPU
	 * caches properly.  This also requires honoring guest PAT, and is forced
	 * independent of the quirk in vmx_ignore_guest_pat().
	 */
	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	return 0;
}

/* Module teardown; undoes vmx_init() in reverse order. */
void vmx_exit(void)
{
	allow_smaller_maxphyaddr = false;

	vmx_cleanup_l1d_flush();

	kvm_x86_vendor_exit();
}

int __init vmx_init(void)
{
	int r, cpu;

	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);

	if (!kvm_is_vmx_supported())
		return -EOPNOTSUPP;

	/*
	 * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
	 * i.e. there's nothing to unwind if a later step fails.
	 */
	hv_init_evmcs();

	/*
	 * Parse the VMCS config and VMX capabilities before anything else, so
	 * that the information is available to all setup flows.
	 */
	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
		return -EIO;

	r = kvm_x86_vendor_init(&vt_init_ops);
	if (r)
		return r;

	/* Must be called after common x86 init so enable_ept is setup. */
	r = vmx_setup_l1d_flush();
	if (r)
		goto err_l1d_flush;

	/* Per-CPU state: loaded-VMCS tracking and posted-interrupt lists. */
	for_each_possible_cpu(cpu) {
		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));

		pi_init_cpu(cpu);
	}

	/* Compile-time-ish sanity check of the vmcs12 field layout. */
	vmx_check_vmcs12_offsets();

	return 0;

err_l1d_flush:
	kvm_x86_vendor_exit();
	return r;
}