1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
14 */
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
24 #include <linux/mm.h>
25 #include <linux/objtool.h>
26 #include <linux/sched.h>
27 #include <linux/sched/smt.h>
28 #include <linux/slab.h>
29 #include <linux/tboot.h>
30 #include <linux/trace_events.h>
31
32 #include <asm/apic.h>
33 #include <asm/asm.h>
34 #include <asm/cpu.h>
35 #include <asm/cpu_device_id.h>
36 #include <asm/debugreg.h>
37 #include <asm/desc.h>
38 #include <asm/fpu/api.h>
39 #include <asm/fpu/xstate.h>
40 #include <asm/fred.h>
41 #include <asm/idtentry.h>
42 #include <asm/io.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/reboot.h>
45 #include <asm/perf_event.h>
46 #include <asm/mmu_context.h>
47 #include <asm/mshyperv.h>
48 #include <asm/msr.h>
49 #include <asm/mwait.h>
50 #include <asm/spec-ctrl.h>
51 #include <asm/vmx.h>
52
53 #include <trace/events/ipi.h>
54
55 #include "capabilities.h"
56 #include "common.h"
57 #include "cpuid.h"
58 #include "hyperv.h"
59 #include "kvm_onhyperv.h"
60 #include "irq.h"
61 #include "kvm_cache_regs.h"
62 #include "lapic.h"
63 #include "mmu.h"
64 #include "nested.h"
65 #include "pmu.h"
66 #include "sgx.h"
67 #include "trace.h"
68 #include "vmcs.h"
69 #include "vmcs12.h"
70 #include "vmx.h"
71 #include "x86.h"
72 #include "x86_ops.h"
73 #include "smm.h"
74 #include "vmx_onhyperv.h"
75 #include "posted_intr.h"
76
77 #include "mmu/spte.h"
78
79 MODULE_AUTHOR("Qumranet");
80 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
81 MODULE_LICENSE("GPL");
82
83 #ifdef MODULE
84 static const struct x86_cpu_id vmx_cpu_id[] = {
85 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
86 {}
87 };
88 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
89 #endif
90
91 bool __read_mostly enable_vpid = 1;
92 module_param_named(vpid, enable_vpid, bool, 0444);
93
94 static bool __read_mostly enable_vnmi = 1;
95 module_param_named(vnmi, enable_vnmi, bool, 0444);
96
97 bool __read_mostly flexpriority_enabled = 1;
98 module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
99
100 bool __read_mostly enable_ept = 1;
101 module_param_named(ept, enable_ept, bool, 0444);
102
103 bool __read_mostly enable_unrestricted_guest = 1;
104 module_param_named(unrestricted_guest,
105 enable_unrestricted_guest, bool, 0444);
106
107 bool __read_mostly enable_ept_ad_bits = 1;
108 module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
109
110 static bool __read_mostly emulate_invalid_guest_state = true;
111 module_param(emulate_invalid_guest_state, bool, 0444);
112
113 static bool __read_mostly fasteoi = 1;
114 module_param(fasteoi, bool, 0444);
115
116 module_param(enable_apicv, bool, 0444);
117 module_param(enable_ipiv, bool, 0444);
118
119 module_param(enable_device_posted_irqs, bool, 0444);
120
121 /*
122 * If nested=1, nested virtualization is supported, i.e., guests may use
123 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
124 * use VMX instructions.
125 */
126 static bool __read_mostly nested = 1;
127 module_param(nested, bool, 0444);
128
129 bool __read_mostly enable_pml = 1;
130 module_param_named(pml, enable_pml, bool, 0444);
131
132 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
133 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
134
135 static bool __read_mostly dump_invalid_vmcs = 0;
136 module_param(dump_invalid_vmcs, bool, 0644);
137
138 #define MSR_BITMAP_MODE_X2APIC 1
139 #define MSR_BITMAP_MODE_X2APIC_APICV 2
140
141 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
142
143 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
144 static int __read_mostly cpu_preemption_timer_multi;
145 static bool __read_mostly enable_preemption_timer = 1;
146 #ifdef CONFIG_X86_64
147 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
148 #endif
149
150 extern bool __read_mostly allow_smaller_maxphyaddr;
151 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
152
153 module_param(enable_mediated_pmu, bool, 0444);
154
155 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
156 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
157 #define KVM_VM_CR0_ALWAYS_ON \
158 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
159
160 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
161 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
162 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
163
164 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
165
166 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
167 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
168 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
169 RTIT_STATUS_BYTECNT))
170
171 /*
172 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
173 * ple_gap: upper bound on the amount of time between two successive
174 * executions of PAUSE in a loop. Also indicate if ple enabled.
175 * According to test, this time is usually smaller than 128 cycles.
176 * ple_window: upper bound on the amount of time a guest is allowed to execute
177 * in a PAUSE loop. Tests indicate that most spinlocks are held for
178 * less than 2^12 cycles
179 * Time is measured based on a counter that runs at the same rate as the TSC,
180 * refer SDM volume 3b section 21.6.13 & 22.1.3.
181 */
182 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
183 module_param(ple_gap, uint, 0444);
184
185 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
186 module_param(ple_window, uint, 0444);
187
188 /* Default doubles per-vcpu window every exit. */
189 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
190 module_param(ple_window_grow, uint, 0444);
191
192 /* Default resets per-vcpu window every exit to ple_window. */
193 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
194 module_param(ple_window_shrink, uint, 0444);
195
196 /* Default is to compute the maximum so we can never overflow. */
197 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
198 module_param(ple_window_max, uint, 0444);
199
200 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
201 int __read_mostly pt_mode = PT_MODE_SYSTEM;
202 #ifdef CONFIG_BROKEN
203 module_param(pt_mode, int, S_IRUGO);
204 #endif
205
206 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
207
208 #ifdef CONFIG_CPU_MITIGATIONS
209 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
210 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
211 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
212
213 /* Storage for pre module init parameter parsing */
214 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
215
/*
 * Human-readable names for each vmx_l1d_flush_state.  for_parse marks the
 * states a user may request via the vmentry_l1d_flush module parameter;
 * the remaining states are only ever set internally by KVM.
 */
static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};
227
228 #define L1D_CACHE_ORDER 4
229 static void *vmx_l1d_flush_pages;
230
/*
 * Resolve and apply the L1TF VM-entry L1D flush mitigation.
 *
 * Translates the requested state (possibly "auto", which defers to the
 * kernel-wide l1tf_mitigation policy) into a concrete flush mode, allocates
 * the software flush pages when no MSR-based flush is available, and flips
 * the static keys consumed by vmx_l1d_flush().
 *
 * Returns 0 on success, -ENOMEM if the flush pages can't be allocated.
 */
static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	/* Nothing to do if the CPU isn't affected by L1TF. */
	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	/* L1TF is exploited through EPT; with EPT disabled no flush is needed. */
	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	/* Hardware/firmware reports VM-entry L1D flushes can be skipped. */
	if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_AUTO:
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	/*
	 * Backing pages for the software flush are only needed when the CPU
	 * lacks the MSR_IA32_FLUSH_CMD mechanism (X86_FEATURE_FLUSH_L1D).
	 */
	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}
307
/* Apply the flush mode captured during early module-parameter parsing. */
static int vmx_setup_l1d_flush(void)
{
	/*
	 * Hand the parameter mitigation value in which was stored in the pre
	 * module init parser. If no parameter was given, it will contain
	 * 'auto' which will be turned into the default 'cond' mitigation mode.
	 */
	return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
}
317
/* Free the software flush pages and reset the reported mitigation state. */
static void vmx_cleanup_l1d_flush(void)
{
	if (vmx_l1d_flush_pages) {
		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
		vmx_l1d_flush_pages = NULL;
	}
	/* Restore state so sysfs ignores VMX */
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
327
vmentry_l1d_flush_parse(const char * s)328 static int vmentry_l1d_flush_parse(const char *s)
329 {
330 unsigned int i;
331
332 if (s) {
333 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
334 if (vmentry_l1d_param[i].for_parse &&
335 sysfs_streq(s, vmentry_l1d_param[i].option))
336 return i;
337 }
338 }
339 return -EINVAL;
340 }
341
/*
 * Module-parameter setter for vmentry_l1d_flush.  May run both before and
 * after module init; post-init changes are applied immediately under the
 * flush mutex.
 */
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	/* Accept but ignore the request on CPUs not affected by L1TF. */
	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = __vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}
369
/* Module-parameter getter: report the current mitigation state by name. */
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sysfs_emit(s, "???\n");

	return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
377
/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	if (!static_branch_unlikely(&vmx_l1d_should_flush))
		return;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		/*
		 * Clear the per-cpu flush bit, it gets set again if the vCPU
		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
		 * exits to userspace, or if KVM reaches one of the unsafe
		 * VMEXIT handlers, e.g. if KVM calls into the emulator,
		 * or from the interrupt handlers.
		 */
		if (!kvm_get_cpu_l1tf_flush_l1d())
			return;
		kvm_clear_cpu_l1tf_flush_l1d();
	}

	vcpu->stat.l1d_flush++;

	/* Prefer the microcode-provided flush via MSR_IA32_FLUSH_CMD. */
	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lpopulate_tlb\n\t"
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}
441
#else /* CONFIG_CPU_MITIGATIONS */
/* Mitigations compiled out: no VM-entry L1D flush is ever performed. */
static int vmx_setup_l1d_flush(void)
{
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
	return 0;
}
/* Mitigations compiled out: just reset the reported state for sysfs. */
static void vmx_cleanup_l1d_flush(void)
{
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
/* Mitigations compiled out: the VM-entry L1D flush is a no-op. */
static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{

}
/* Mitigations compiled out: warn once and ignore attempts to set the param. */
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
	return 0;
}
/* Mitigations compiled out: the flush mode is unconditionally "never". */
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	return sysfs_emit(s, "never\n");
}
465 #endif
466
/* Custom get/set handlers for the writable "vmentry_l1d_flush" parameter. */
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
472
/*
 * Set FB_CLEAR_DIS in MSR_IA32_MCU_OPT_CTRL before VM-entry so that VERW
 * executed by the guest doesn't clear CPU fill buffers.  No-op unless
 * vmx_update_fb_clear_dis() decided the optimization is safe for this vCPU.
 */
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 msr;

	if (!vmx->disable_fb_clear)
		return;

	msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
	msr |= FB_CLEAR_DIS;
	native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
	/* Cache the MSR value to avoid reading it later */
	vmx->msr_ia32_mcu_opt_ctrl = msr;
}
486
/*
 * Counterpart of vmx_disable_fb_clear(): restore FB clearing after VM-exit,
 * reusing the MSR value cached at entry to avoid an extra RDMSR.
 */
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (!vmx->disable_fb_clear)
		return;

	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
	native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}
495
/*
 * Decide whether FB_CLEAR_DIS may be toggled around VM-entry for this vCPU,
 * based on host capabilities/bugs and the guest-visible ARCH_CAPABILITIES.
 */
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	/*
	 * Disable VERW's behavior of clearing CPU buffers for the guest if the
	 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
	 * the mitigation. Disabling the clearing behavior provides a
	 * performance boost for guests that aren't aware that manually clearing
	 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
	 * and VM-Exit.
	 */
	vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
		(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
		!boot_cpu_has_bug(X86_BUG_MDS) &&
		!boot_cpu_has_bug(X86_BUG_TAA);

	/*
	 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
	 * at VMEntry. Skip the MSR read/write when a guest has no use case to
	 * execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}
524
525 static u32 vmx_segment_access_rights(struct kvm_segment *var);
526
527 void vmx_vmexit(void);
528
529 #define vmx_insn_failed(fmt...) \
530 do { \
531 WARN_ONCE(1, fmt); \
532 pr_warn_ratelimited(fmt); \
533 } while (0)
534
/* Report a failed VMREAD of @field (one-shot WARN + ratelimited warning). */
noinline void vmread_error(unsigned long field)
{
	vmx_insn_failed("vmread failed: field=%lx\n", field);
}
539
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * VMREAD failure handler used when the compiler lacks asm-goto-with-outputs
 * support.  A faulting VMREAD is treated as fatal (spurious fault), while a
 * non-faulting failure is merely logged.
 */
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
{
	if (fault) {
		kvm_spurious_fault();
	} else {
		instrumentation_begin();
		vmread_error(field);
		instrumentation_end();
	}
}
#endif
552
/* Report a failed VMWRITE, including the VMCS instruction error code. */
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
558
/* Report a failed VMCLEAR, including the VMCS instruction error code. */
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
564
/* Report a failed VMPTRLD, including the VMCS instruction error code. */
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
570
/* Report a failed INVVPID with the extent type, VPID and address used. */
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}
576
/* Report a failed INVEPT with the extent type and EPTP used. */
noinline void invept_error(unsigned long ext, u64 eptp)
{
	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}
581
582 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
583 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
584 /*
585 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
586 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
587 */
588 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
589
590 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
591 static DEFINE_SPINLOCK(vmx_vpid_lock);
592
593 struct vmcs_config vmcs_config __ro_after_init;
594 struct vmx_capability vmx_capability __ro_after_init;
595
/* Expand to the four VMCS field encodings describing one guest segment. */
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

/* VMCS field encodings for each guest segment register, indexed by VCPU_SREG_*. */
static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};
619
620
621 static unsigned long host_idt_base;
622
623 #if IS_ENABLED(CONFIG_HYPERV)
624 static bool __read_mostly enlightened_vmcs = true;
625 module_param(enlightened_vmcs, bool, 0444);
626
/*
 * Opt the vCPU's enlightened VMCS into Hyper-V's direct L2 TLB flush
 * enlightenment: wire up the partition assist page and enable the nested
 * flush hypercall.  Returns -ENOMEM if no partition assist page exists.
 */
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);

	if (partition_assist_page == INVALID_PAGE)
		return -ENOMEM;

	/* The loaded VMCS *is* the eVMCS when running on Hyper-V. */
	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page = partition_assist_page;
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}
643
/*
 * Detect and enable Hyper-V Enlightened VMCS support at module init.
 * Requires the hypervisor to recommend eVMCS, implement a compatible eVMCS
 * version, and provide VP assist pages on all online CPUs.
 */
static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above.
	 */
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	     KVM_EVMCS_VERSION) {

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("Using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&__kvm_is_using_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vt_x86_ops.enable_l2_tlb_flush
				= hv_enable_l2_tlb_flush;
	} else {
		enlightened_vmcs = false;
	}
}
679
/* Undo eVMCS state on this CPU so plain VMCS accesses work again. */
static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *vp_ap;

	if (!kvm_is_using_evmcs())
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled the CPU
	 * doesn't have a VP assist page allocated.
	 */
	vp_ap = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!vp_ap))
		return;

	/*
	 * Reset everything to support using non-enlightened VMCS access later
	 * (e.g. when we reload the module with enlightened_vmcs=0)
	 */
	vp_ap->nested_control.features.directhypercall = 0;
	vp_ap->current_nested_vmcs = 0;
	vp_ap->enlighten_vmentry = 0;
}
704
705 #else /* IS_ENABLED(CONFIG_HYPERV) */
hv_init_evmcs(void)706 static void hv_init_evmcs(void) {}
hv_reset_evmcs(void)707 static void hv_reset_evmcs(void) {}
708 #endif /* IS_ENABLED(CONFIG_HYPERV) */
709
710 /*
711 * Comment's format: document - errata name - stepping - processor name.
712 * Refer from
713 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
714 */
715 static u32 vmx_preemption_cpu_tfms[] = {
716 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
717 0x000206E6,
718 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */
719 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
720 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
721 0x00020652,
722 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
723 0x00020655,
724 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
725 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
726 /*
727 * 320767.pdf - AAP86 - B1 -
728 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
729 */
730 0x000106E5,
731 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
732 0x000106A0,
733 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
734 0x000106A1,
735 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
736 0x000106A4,
737 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
738 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
739 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
740 0x000106A5,
741 /* Xeon E3-1220 V2 */
742 0x000306A8,
743 };
744
cpu_has_broken_vmx_preemption_timer(void)745 static inline bool cpu_has_broken_vmx_preemption_timer(void)
746 {
747 u32 eax = cpuid_eax(0x00000001), i;
748
749 /* Clear the reserved bits */
750 eax &= ~(0x3U << 14 | 0xfU << 28);
751 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
752 if (eax == vmx_preemption_cpu_tfms[i])
753 return true;
754
755 return false;
756 }
757
/* True if APIC accesses need virtualizing: flexpriority + in-kernel LAPIC. */
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}
762
/*
 * Look up the guest's user-return MSR entry for @msr, or NULL if the MSR is
 * not part of the user-return set.
 */
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int slot = kvm_find_user_return_msr(msr);

	return slot < 0 ? NULL : &vmx->guest_uret_msrs[slot];
}
772
/*
 * Set the guest value of a user-return MSR.  If the MSR is loaded into
 * hardware, propagate the value via the per-CPU user-return machinery
 * (with preemption disabled, as that state is per-CPU); the shadow copy is
 * only updated if the hardware write succeeds.
 */
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}
788
/*
 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
 *
 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 * atomically track post-VMXON state, e.g. this may be called in NMI context.
 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
 * magically in RM, VM86, compat mode, or at CPL>0.
 *
 * Returns 0 on success, -EIO if VMXOFF faulted; CR4.VMXE is cleared on both
 * paths.
 */
static int kvm_cpu_vmxoff(void)
{
	asm goto("1: vmxoff\n\t"
		 _ASM_EXTABLE(1b, %l[fault])
		 ::: "cc", "memory" : fault);

	cr4_clear_bits(X86_CR4_VMXE);
	return 0;

fault:
	cr4_clear_bits(X86_CR4_VMXE);
	return -EIO;
}
811
/*
 * Emergency (reboot/crash) path: VMCLEAR every VMCS loaded on this CPU and
 * execute VMXOFF, eating any faults along the way (kvm_rebooting).
 */
void vmx_emergency_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	kvm_rebooting = true;

	/*
	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
	 * set in task context.  If this races with VMX is disabled by an NMI,
	 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
	 * kvm_rebooting set.
	 */
	if (!(__read_cr4() & X86_CR4_VMXE))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link) {
		vmcs_clear(v->vmcs);
		if (v->shadow_vmcs)
			vmcs_clear(v->shadow_vmcs);
	}

	kvm_cpu_vmxoff();
}
837
/*
 * IPI callback: VMCLEAR @arg (a struct loaded_vmcs) on this CPU and remove
 * it from the per-CPU list.  Bails if the VMCS has migrated to another CPU.
 */
static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	/* A shadow VMCS is only dirty (needs VMCLEAR) if it was launched. */
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
	 * and add loaded_vmcs to its percpu list before it's deleted from this
	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}
866
loaded_vmcs_clear(struct loaded_vmcs * loaded_vmcs)867 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
868 {
869 int cpu = loaded_vmcs->cpu;
870
871 if (cpu != -1)
872 smp_call_function_single(cpu,
873 __loaded_vmcs_clear, loaded_vmcs, 1);
874 }
875
/*
 * Mark the segment-cache entry for (@seg, @field) valid and return whether
 * it was already valid, i.e. whether the cached value may be used instead of
 * a VMREAD.  The whole cache is invalidated lazily via VCPU_EXREG_SEGMENTS.
 */
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}
890
/* Return the guest segment selector for @seg, via the cache when valid. */
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}
899
/* Return the guest segment base for @seg, via the cache when valid. */
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}
908
/* Return the guest segment limit for @seg, via the cache when valid. */
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}
917
/* Return the guest segment access rights for @seg, via the cache when valid. */
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}
926
/*
 * Recompute the exception bitmap (and, outside of nested guest mode, the
 * #PF error-code mask/match) for @vcpu and write it to the current VMCS.
 */
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * #VE isn't used for VMX.  To test against unexpected changes
	 * related to #VE for VMX, intercept unexpected #VE and warn on it.
	 */
	if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
		eb |= 1u << VE_VECTOR;
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	/* Real-mode emulation via vm86 needs every exception intercepted. */
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;
	else {
		int mask = 0, match = 0;

		if (enable_ept && (eb & (1u << PF_VECTOR))) {
			/*
			 * If EPT is enabled, #PF is currently only intercepted
			 * if MAXPHYADDR is smaller on the guest than on the
			 * host.  In that case we only care about present,
			 * non-reserved faults.  For vmcs02, however, PFEC_MASK
			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
			 */
			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
			match = PFERR_PRESENT_MASK;
		}
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
	}

	/*
	 * Disabling xfd interception indicates that dynamic xfeatures
	 * might be used in the guest. Always trap #NM in this case
	 * to save guest xfd_err timely.
	 */
	if (vcpu->arch.xfd_no_write_intercept)
		eb |= (1u << NM_VECTOR);

	vmcs_write32(EXCEPTION_BITMAP, eb);
}
991
992 /*
993 * Check if MSR is intercepted for currently loaded MSR bitmap.
994 */
msr_write_intercepted(struct vcpu_vmx * vmx,u32 msr)995 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
996 {
997 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
998 return true;
999
1000 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
1001 }
1002
__vmx_vcpu_run_flags(struct vcpu_vmx * vmx)1003 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
1004 {
1005 unsigned int flags = 0;
1006
1007 if (vmx->loaded_vmcs->launched)
1008 flags |= VMX_RUN_VMRESUME;
1009
1010 /*
1011 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
1012 * to change it directly without causing a vmexit. In that case read
1013 * it after vmexit and store it in vmx->spec_ctrl.
1014 */
1015 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
1016 flags |= VMX_RUN_SAVE_SPEC_CTRL;
1017
1018 if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
1019 kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
1020 flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
1021
1022 return flags;
1023 }
1024
/*
 * Stop switching an MSR via its dedicated VM-Entry/VM-Exit "load MSR"
 * controls, i.e. clear the paired @entry and @exit control bits.
 */
static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}
1031
/* Find @msr in the load/store list @m; returns its slot or -ENOENT. */
static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int slot;

	for (slot = 0; slot < m->nr; slot++)
		if (m->val[slot].index == msr)
			return slot;

	return -ENOENT;
}
1042
/*
 * Remove @msr from the autoload/autostore list @m (no-op if absent) and
 * update the associated VMCS count field.
 */
static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr,
				unsigned long vmcs_count_field)
{
	int slot = vmx_find_loadstore_msr_slot(m, msr);

	if (slot < 0)
		return;

	/* Swap the last entry into the vacated slot and shrink the list. */
	m->val[slot] = m->val[--m->nr];
	vmcs_write32(vmcs_count_field, m->nr);
}
1056
clear_atomic_switch_msr(struct vcpu_vmx * vmx,unsigned msr)1057 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1058 {
1059 struct msr_autoload *m = &vmx->msr_autoload;
1060
1061 switch (msr) {
1062 case MSR_EFER:
1063 if (cpu_has_load_ia32_efer()) {
1064 clear_atomic_switch_msr_special(vmx,
1065 VM_ENTRY_LOAD_IA32_EFER,
1066 VM_EXIT_LOAD_IA32_EFER);
1067 return;
1068 }
1069 break;
1070 case MSR_CORE_PERF_GLOBAL_CTRL:
1071 if (cpu_has_load_perf_global_ctrl()) {
1072 clear_atomic_switch_msr_special(vmx,
1073 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1074 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1075 return;
1076 }
1077 break;
1078 }
1079
1080 vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT);
1081 vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT);
1082 }
1083
/*
 * Switch an MSR via its dedicated VMCS fields and VM-Entry/VM-Exit "load
 * MSR" controls: write the guest/host values and set the paired control bits.
 */
static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	/*
	 * NOTE(review): HOST_IA32_EFER is deliberately skipped here —
	 * presumably it is written once elsewhere since the host's EFER
	 * doesn't change at runtime; confirm against the constant host
	 * state setup.
	 */
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}
1095
/*
 * Add (or update) @msr = @value in the autoload/autostore list @m, bumping
 * the associated VMCS count field when a new slot is consumed.
 */
static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value,
			     unsigned long vmcs_count_field, struct kvm *kvm)
{
	int slot = vmx_find_loadstore_msr_slot(m, msr);

	if (slot < 0) {
		/* Not tracked yet; append a new entry if there is room. */
		if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm))
			return;

		slot = m->nr++;
		m->val[slot].index = msr;
		vmcs_write32(vmcs_count_field, m->nr);
	}

	m->val[slot].value = value;
}
1112
/*
 * Arrange for @msr to be atomically loaded with @guest_val on VM-Entry and
 * restored to @host_val on VM-Exit.  MSRs with dedicated VM-Entry/VM-Exit
 * controls (EFER, PERF_GLOBAL_CTRL) use those when supported; everything
 * else goes through the generic autoload lists.
 */
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val)
{
	struct msr_autoload *m = &vmx->msr_autoload;
	struct kvm *kvm = vmx->vcpu.kvm;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
	}

	vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm);
	/*
	 * Fix: the host value belongs on the VM-Exit MSR-load list (m->host),
	 * not the guest list.  Adding it to m->guest both failed to restore
	 * the host MSR on VM-Exit and clobbered the guest entry for @msr
	 * with @host_val.
	 */
	vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm);
}
1154
/*
 * Decide how guest EFER is switched across VM-Entry/VM-Exit.  Returns true
 * if EFER should go through the user-return MSR mechanism (the slot's
 * data/mask are updated), false if it is handled via the atomic switch
 * lists / dedicated VMCS controls, or if no user-return slot exists.
 */
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		/* No switching needed at all if guest and host values match. */
		if (guest_efer != kvm_host.efer)
			add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	/* Bits in ignore_bits are forced to the host's values. */
	guest_efer &= ~ignore_bits;
	guest_efer |= kvm_host.efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}
1206
vmx_add_autostore_msr(struct vcpu_vmx * vmx,u32 msr)1207 static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
1208 {
1209 vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT,
1210 vmx->vcpu.kvm);
1211 }
1212
vmx_remove_autostore_msr(struct vcpu_vmx * vmx,u32 msr)1213 static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
1214 {
1215 vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT);
1216 }
1217
1218 #ifdef CONFIG_X86_32
1219 /*
1220 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1221 * VMCS rather than the segment table. KVM uses this helper to figure
1222 * out the current bases to poke them into the VMCS before entry.
1223 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	/* A null selector (descriptor index 0, TI clear) has base 0. */
	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		/* Null LDT selector: no LDT, so any LDT-relative base is 0. */
		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		/* The LDT is itself described by a GDT entry; recurse once. */
		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	/* selector >> 3 == descriptor index within the chosen table. */
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
1245 #endif
1246
pt_can_write_msr(struct vcpu_vmx * vmx)1247 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1248 {
1249 return vmx_pt_mode_is_host_guest() &&
1250 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1251 }
1252
/* Validate a guest-supplied RTIT output base address. */
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
1258
/* Load the saved PT context @ctx (everything except RTIT_CTL) into the CPU. */
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 range;

	wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);

	/* Each address filter is an A/B MSR pair, hence the stride of 2. */
	for (range = 0; range < addr_range; range++) {
		wrmsrq(MSR_IA32_RTIT_ADDR0_A + 2 * range, ctx->addr_a[range]);
		wrmsrq(MSR_IA32_RTIT_ADDR0_B + 2 * range, ctx->addr_b[range]);
	}
}
1272
/* Snapshot the CPU's PT state (everything except RTIT_CTL) into @ctx. */
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 range;

	rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);

	/* Each address filter is an A/B MSR pair, hence the stride of 2. */
	for (range = 0; range < addr_range; range++) {
		rdmsrq(MSR_IA32_RTIT_ADDR0_A + 2 * range, ctx->addr_a[range]);
		rdmsrq(MSR_IA32_RTIT_ADDR0_B + 2 * range, ctx->addr_b[range]);
	}
}
1286
/*
 * Context-switch Processor Trace state from host to guest before VM-Entry
 * (host/guest PT mode only; in system mode the host owns PT exclusively).
 */
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		/* Disable host tracing before swapping the remaining RTIT MSRs. */
		wrmsrq(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}
1303
/*
 * Context-switch Processor Trace state back from guest to host after
 * VM-Exit (host/guest PT mode only).
 */
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/* Only swap the MSRs if the guest was actually tracing. */
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
1321
/*
 * Update the cached host FS/GS selectors and bases, writing the VMCS only
 * for values that actually changed.  Selectors with RPL != 0 or TI == 1
 * may not be placed in the VMCS host-state area and are written as 0.
 */
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		vmcs_write16(HOST_FS_SELECTOR, (fs_sel & 7) ? 0 : fs_sel);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		vmcs_write16(HOST_GS_SELECTOR, (gs_sel & 7) ? 0 : gs_sel);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}
1348
/*
 * Load guest MSR/segment state into hardware ahead of VM-Entry and record
 * the host state that VM-Exit does not restore automatically.  Idempotent
 * while guest state stays loaded (guarded by vt->guest_state_loaded).
 */
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_uret_msrs_loaded) {
		vmx->guest_uret_msrs_loaded = true;
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (!vmx->guest_uret_msrs[i].load_into_hardware)
				continue;

			kvm_set_user_return_msr(i,
						vmx->guest_uret_msrs[i].data,
						vmx->guest_uret_msrs[i].mask);
		}
	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vt->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		/* Fast path: FS/GS state is tracked in thread_struct. */
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vt->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	/* Switch KERNEL_GS_BASE to the guest's value for the upcoming entry. */
	wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vt->guest_state_loaded = true;
}
1421
/*
 * Undo vmx_prepare_switch_to_guest(): save the guest's KERNEL_GS_BASE and
 * restore the host segment registers and MSRs that VM-Exit left untouched.
 */
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->vt.guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	/* Snapshot the guest's KERNEL_GS_BASE before restoring the host's. */
	rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->vt.guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}
1460
1461 #ifdef CONFIG_X86_64
/*
 * Read a guest MSR whose value may currently live in hardware.  If guest
 * state is loaded, refresh *@cache from the CPU; otherwise *@cache is
 * already current.  Preemption is disabled so guest_state_loaded cannot
 * flip underneath the check.
 */
static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
{
	preempt_disable();
	if (vmx->vt.guest_state_loaded)
		*cache = read_msr(msr);
	preempt_enable();
	return *cache;
}
1470
/*
 * Write a guest MSR whose value may currently live in hardware.  The CPU is
 * updated only while guest state is loaded; the software cache is always
 * updated so the value takes effect on the next state load.
 */
static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
				     u64 *cache)
{
	preempt_disable();
	if (vmx->vt.guest_state_loaded)
		wrmsrns(msr, data);
	preempt_enable();
	*cache = data;
}
1480
/* Read the guest's MSR_KERNEL_GS_BASE (hardware or cached copy). */
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
				       &vmx->msr_guest_kernel_gs_base);
}
1486
/* Write the guest's MSR_KERNEL_GS_BASE (hardware and cached copy). */
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
				 &vmx->msr_guest_kernel_gs_base);
}
1492 #endif
1493
grow_ple_window(struct kvm_vcpu * vcpu)1494 static void grow_ple_window(struct kvm_vcpu *vcpu)
1495 {
1496 struct vcpu_vmx *vmx = to_vmx(vcpu);
1497 unsigned int old = vmx->ple_window;
1498
1499 vmx->ple_window = __grow_ple_window(old, ple_window,
1500 ple_window_grow,
1501 ple_window_max);
1502
1503 if (vmx->ple_window != old) {
1504 vmx->ple_window_dirty = true;
1505 trace_kvm_ple_window_update(vcpu->vcpu_id,
1506 vmx->ple_window, old);
1507 }
1508 }
1509
shrink_ple_window(struct kvm_vcpu * vcpu)1510 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1511 {
1512 struct vcpu_vmx *vmx = to_vmx(vcpu);
1513 unsigned int old = vmx->ple_window;
1514
1515 vmx->ple_window = __shrink_ple_window(old, ple_window,
1516 ple_window_shrink,
1517 ple_window);
1518
1519 if (vmx->ple_window != old) {
1520 vmx->ple_window_dirty = true;
1521 trace_kvm_ple_window_update(vcpu->vcpu_id,
1522 vmx->ple_window, old);
1523 }
1524 }
1525
/*
 * Make the vCPU's current loaded_vmcs active on @cpu: migrate it onto the
 * pCPU's per-cpu list if the vCPU changed pCPUs, VMPTRLD it if it isn't
 * already current, and refresh per-pCPU host state (TSS/GDT bases) on a
 * pCPU change.
 */
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		/* VMCLEAR on the previous pCPU and unlink from its list. */
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smb_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	/* Skip the VMPTRLD if this VMCS is already current on @cpu. */
	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}
1581
1582 /*
1583 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1584 * vcpu mutex is already taken.
1585 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * Shrink the PLE window when re-loading a vCPU that was scheduled
	 * out, unless PLE is disabled for this VM.
	 */
	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu);

	/* Update posted-interrupt bookkeeping for the new pCPU. */
	vmx_vcpu_pi_load(vcpu, cpu);
}
1595
/* Counterpart of vmx_vcpu_load(): release per-pCPU state at vcpu_put(). */
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}
1602
/*
 * Point the vCPU at a different loaded_vmcs (e.g. vmcs01 <-> vmcs02) and
 * make it current.  get_cpu() pins the task so the VMCS is loaded on the
 * same pCPU that gets recorded in loaded_vmcs->cpu.
 */
static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu,
				   struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	cpu = get_cpu();
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	put_cpu();
}
1614
vmx_load_vmcs01(struct kvm_vcpu * vcpu)1615 static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
1616 {
1617 struct vcpu_vmx *vmx = to_vmx(vcpu);
1618
1619 if (!is_guest_mode(vcpu)) {
1620 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
1621 return;
1622 }
1623
1624 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02);
1625 vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01);
1626 }
1627
vmx_put_vmcs01(struct kvm_vcpu * vcpu)1628 static void vmx_put_vmcs01(struct kvm_vcpu *vcpu)
1629 {
1630 if (!is_guest_mode(vcpu))
1631 return;
1632
1633 vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02);
1634 }
/*
 * Scoped guard that temporarily runs a vCPU on vmcs01, restoring vmcs02
 * (if L2 is active) when the guard leaves scope.
 */
DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *,
	     vmx_load_vmcs01(_T), vmx_put_vmcs01(_T))
1637
/*
 * True if the guest must be run via emulation because its segment state is
 * invalid for VMX and emulate_invalid_guest_state is enabled.
 */
bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
1642
/*
 * Return the guest's RFLAGS, reading the VMCS at most once per exit (cached
 * via VCPU_EXREG_RFLAGS).  During vm86 real-mode emulation the hardware
 * value carries KVM-owned bits, so the guest-visible non-owned bits are
 * reconstituted from rmode.save_rflags.
 */
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}
1660
/*
 * Set the guest's RFLAGS, forcing IOPL/VM into the hardware value while
 * vm86-based real-mode emulation is active and re-evaluating whether
 * emulation is required when the VM bit toggles.
 */
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	/*
	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
	 * is an unrestricted guest in order to mark L2 as needing emulation
	 * if L1 runs L2 as a restricted guest.
	 */
	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		/* Stash the guest's value; hardware runs with IOPL/VM forced on. */
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
1689
vmx_get_if_flag(struct kvm_vcpu * vcpu)1690 bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1691 {
1692 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1693 }
1694
vmx_get_interrupt_shadow(struct kvm_vcpu * vcpu)1695 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1696 {
1697 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1698 int ret = 0;
1699
1700 if (interruptibility & GUEST_INTR_STATE_STI)
1701 ret |= KVM_X86_SHADOW_INT_STI;
1702 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1703 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1704
1705 return ret;
1706 }
1707
vmx_set_interrupt_shadow(struct kvm_vcpu * vcpu,int mask)1708 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1709 {
1710 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1711 u32 interruptibility = interruptibility_old;
1712
1713 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1714
1715 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1716 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1717 else if (mask & KVM_X86_SHADOW_INT_STI)
1718 interruptibility |= GUEST_INTR_STATE_STI;
1719
1720 if ((interruptibility != interruptibility_old))
1721 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1722 }
1723
/*
 * Validate a guest WRMSR to IA32_RTIT_CTL against the virtualized PT
 * capabilities.  Returns 0 if the value is legal, 1 if the write should #GP.
 */
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
	    (data & RTIT_CTL_TRACEEN) &&
	    data != vmx->pt_desc.guest.ctl)
		return 1;

	/*
	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
	 * and FabricEn would cause #GP, if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
	    !(data & RTIT_CTL_FABRIC_EN) &&
	    !intel_pt_validate_cap(vmx->pt_desc.caps,
				   PT_CAP_single_range_output))
		return 1;

	/*
	 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
	 * utilize encodings marked reserved will cause a #GP fault.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
	    !test_bit((data & RTIT_CTL_MTC_RANGE) >>
		      RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
				      PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_CYC_THRESH) >>
		      RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_PSB_FREQ) >>
		      RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * If ADDRx_CFG is reserved or the encoding is > 2 it will
	 * cause a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
		return 1;

	return 0;
}
1796
/*
 * Check whether KVM can emulate in the current exit context.  Returns an
 * X86EMUL_* code; injects #UD for SGX enclave exits and rejects emulation
 * during event vectoring when it cannot be handled.
 */
int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
				  void *insn, int insn_len)
{
	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
	 * not point at the failing instruction, and even if it did, the code
	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
	 * so that guest userspace can't DoS the guest simply by triggering
	 * emulation (enclaves are CPL3 only).
	 */
	if (vmx_get_exit_reason(vcpu).enclave_mode) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* Check that emulation is possible during event vectoring */
	if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    !kvm_can_emulate_event_vectoring(emul_type))
		return X86EMUL_UNHANDLEABLE_VECTORING;

	return X86EMUL_CONTINUE;
}
1819
/*
 * Advance guest RIP past the instruction that caused the current VM-Exit,
 * using VM_EXIT_INSTRUCTION_LEN when it can be trusted and falling back to
 * emulation (EMULTYPE_SKIP) otherwise.  Returns 1 on success, 0 if the
 * fallback emulation failed.  Always drops any STI/MOV-SS shadow.
 */
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
	unsigned long rip, orig_rip;
	u32 instr_len;

	/*
	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
	 * set when EPT misconfig occurs.  In practice, real hardware updates
	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
	 * (namely Hyper-V) don't set it due to it being undefined behavior,
	 * i.e. we end up advancing IP with some random value.
	 */
	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

		/*
		 * Emulating an enclave's instructions isn't supported as KVM
		 * cannot access the enclave's memory or its true RIP, e.g. the
		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
		 * the RIP that actually triggered the VM-Exit.  But, because
		 * most instructions that cause VM-Exit will #UD in an enclave,
		 * most instruction-based VM-Exits simply do not occur.
		 *
		 * There are a few exceptions, notably the debug instructions
		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
		 * and generate #DB/#BP as expected, which KVM might intercept.
		 * But again, the CPU does the dirty work and saves an instr
		 * length of zero so VMMs don't shoot themselves in the foot.
		 * WARN if KVM tries to skip a non-zero length instruction on
		 * a VM-Exit from an enclave.
		 */
		if (!instr_len)
			goto rip_updated;

		WARN_ONCE(exit_reason.enclave_mode,
			  "skipping instruction after SGX enclave VM-Exit");

		orig_rip = kvm_rip_read(vcpu);
		rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
		/*
		 * We need to mask out the high 32 bits of RIP if not in 64-bit
		 * mode, but just finding out that we are in 64-bit mode is
		 * quite expensive.  Only do it if there was a carry.
		 */
		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
			rip = (u32)rip;
#endif
		kvm_rip_write(vcpu, rip);
	} else {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	}

rip_updated:
	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);

	return 1;
}
1883
1884 /*
1885 * Recognizes a pending MTF VM-exit and records the nested state for later
1886 * delivery.
1887 */
void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* MTF is a vmcs12 concept; nothing to record outside guest mode. */
	if (!is_guest_mode(vcpu))
		return;

	/*
	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
	 * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
	 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
	 * intercepted #DB deliberately avoids single-step #DB and MTF updates
	 * as ICEBP is higher priority than both.  As instruction emulation is
	 * completed at this point (i.e. KVM is at the instruction boundary),
	 * any #DB exception pending delivery must be a debug-trap of lower
	 * priority than MTF.  Record the pending MTF state to be delivered in
	 * vmx_check_nested_events().
	 */
	if (nested_cpu_has_mtf(vmcs12) &&
	    (!vcpu->arch.exception.pending ||
	     vcpu->arch.exception.vector == DB_VECTOR) &&
	    (!vcpu->arch.exception_vmexit.pending ||
	     vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
		vmx->nested.mtf_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	} else {
		vmx->nested.mtf_pending = false;
	}
}
1918
/*
 * Record any pending nested MTF state for the just-completed instruction,
 * then advance guest RIP past it. Returns the result of
 * skip_emulated_instruction() (1 on success, 0 if emulation was needed and
 * failed).
 */
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	vmx_update_emulated_instruction(vcpu);
	return skip_emulated_instruction(vcpu);
}
1924
vmx_clear_hlt(struct kvm_vcpu * vcpu)1925 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1926 {
1927 /*
1928 * Ensure that we clear the HLT state in the VMCS. We don't need to
1929 * explicitly skip the instruction because if the HLT state is set,
1930 * then the instruction is already executing and RIP has already been
1931 * advanced.
1932 */
1933 if (kvm_hlt_in_guest(vcpu->kvm) &&
1934 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1935 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1936 }
1937
/*
 * Inject the exception queued in vcpu->arch.exception into the guest via
 * VM-Entry event injection (or via emulated injection when the vCPU is in
 * emulated real mode, which lacks hardware event injection).
 */
void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Merge the exception payload (e.g. #PF CR2, #DB DR6) before injection. */
	kvm_deliver_exception_payload(vcpu, ex);

	if (ex->has_error_code) {
		/*
		 * Despite the error code being architecturally defined as 32
		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
		 * VMX don't actually supporting setting bits 31:16. Hardware
		 * will (should) never provide a bogus error code, but AMD CPUs
		 * do generate error codes with bits 31:16 set, and so KVM's
		 * ABI lets userspace shove in arbitrary 32-bit values. Drop
		 * the upper bits to avoid VM-Fail, losing information that
		 * doesn't really exist is preferable to killing the VM.
		 */
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		/* Software exceptions resume after the instruction; hardware ones don't. */
		if (kvm_exception_is_soft(ex->vector))
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->vt.emulation_required);

	if (kvm_exception_is_soft(ex->vector)) {
		/* Soft exceptions require the instruction length for VM-Entry. */
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	/* Injecting an event must take the vCPU out of the HLT activity state. */
	vmx_clear_hlt(vcpu);
}
1982
/*
 * Flag whether the given user-return MSR should be loaded into hardware when
 * running the guest. MSRs that aren't in the vCPU's uret list are silently
 * ignored.
 */
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
			       bool load_into_hardware)
{
	struct vmx_uret_msr *slot = vmx_find_uret_msr(vmx, msr);

	if (slot)
		slot->load_into_hardware = load_into_hardware;
}
1994
/*
 * Configuring user return MSRs to automatically save, load, and restore MSRs
 * that need to be shoved into hardware when running the guest. Note, omitting
 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
 * loaded into hardware when running the guest.
 */
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
	bool load_syscall_msrs;

	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
			    (vmx->vcpu.arch.efer & EFER_SCE);

	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
	/* EFER is loaded via uret only when the VMCS entry/exit controls can't be used. */
	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

	/* TSC_AUX is consumed by both RDTSCP and RDPID. */
	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
			   guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
			   guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));

	/*
	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
	 * kernel and old userspace. If those guests run on a tsx=off host, do
	 * allow guests to use TSX_CTRL, but don't change the value in hardware
	 * so that TSX remains always disabled.
	 */
	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));

	/*
	 * The set of MSRs to load may have changed, reload MSRs before the
	 * next VM-Enter.
	 */
	vmx->guest_uret_msrs_loaded = false;
}
2037
vmx_get_l2_tsc_offset(struct kvm_vcpu * vcpu)2038 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
2039 {
2040 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2041
2042 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
2043 return vmcs12->tsc_offset;
2044
2045 return 0;
2046 }
2047
vmx_get_l2_tsc_multiplier(struct kvm_vcpu * vcpu)2048 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
2049 {
2050 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2051
2052 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
2053 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
2054 return vmcs12->tsc_multiplier;
2055
2056 return kvm_caps.default_tsc_scaling_ratio;
2057 }
2058
/* Propagate the vCPU's current TSC offset into the active VMCS. */
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
2063
/* Propagate the vCPU's current TSC scaling ratio into the active VMCS. */
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
2068
/*
 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
 * backwards compatibility even though KVM doesn't support emulating SMX. And
 * because userspace set "VMX in SMX", the guest must also be allowed to set it,
 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
 *
 * NOTE: is_vmx_feature_control_msr_valid() WARNs if a vCPU's valid_bits stray
 * outside this set; keep the two in sync when adding bits here.
 */
#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED			 | \
					FEAT_CTL_VMX_ENABLED_INSIDE_SMX	 | \
					FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
					FEAT_CTL_SGX_LC_ENABLED		 | \
					FEAT_CTL_SGX_ENABLED		 | \
					FEAT_CTL_LMCE_ENABLED)
2082
is_vmx_feature_control_msr_valid(struct vcpu_vmx * vmx,struct msr_data * msr)2083 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
2084 struct msr_data *msr)
2085 {
2086 uint64_t valid_bits;
2087
2088 /*
2089 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
2090 * exposed to the guest.
2091 */
2092 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
2093 ~KVM_SUPPORTED_FEATURE_CONTROL);
2094
2095 if (!msr->host_initiated &&
2096 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
2097 return false;
2098
2099 if (msr->host_initiated)
2100 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
2101 else
2102 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
2103
2104 return !(msr->data & ~valid_bits);
2105 }
2106
/*
 * Read a VMX feature MSR (KVM's emulated VMX capability MSRs). Only the
 * emulated VMX MSR range is handled here; everything else is unsupported.
 */
int vmx_get_feature_msr(u32 msr, u64 *data)
{
	if (msr < KVM_FIRST_EMULATED_VMX_MSR || msr > KVM_LAST_EMULATED_VMX_MSR)
		return KVM_MSR_RET_UNSUPPORTED;

	/* VMX MSRs exist only when nested virtualization is enabled. */
	if (!nested)
		return 1;

	return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
}
2118
/*
 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	u32 index;

	switch (msr_info->index) {
#ifdef CONFIG_X86_64
	/* Segment bases live in the VMCS while the vCPU is loaded. */
	case MSR_FS_BASE:
		msr_info->data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		msr_info->data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_info);
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		/* TSX_CTRL is tracked in the user-return MSR list. */
		goto find_uret_msr;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		msr_info->data = vmx->msr_ia32_umwait_control;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		msr_info->data = to_vmx(vcpu)->spec_ctrl;
		break;
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		/* Readable only if LMCE was enabled via IA32_FEATURE_CONTROL. */
		if (!msr_info->host_initiated &&
		    !(vmx->msr_ia32_feature_control &
		      FEAT_CTL_LMCE_ENABLED))
			return 1;
		msr_info->data = vcpu->arch.mcg_ext_ctl;
		break;
	case MSR_IA32_FEAT_CTL:
		msr_info->data = vmx->msr_ia32_feature_control;
		break;
	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
		if (!msr_info->host_initiated &&
		    !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
			return 1;
		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
		break;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return 1;
		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
				    &msr_info->data))
			return 1;
#ifdef CONFIG_KVM_HYPERV
		/*
		 * Enlightened VMCS v1 doesn't have certain VMCS fields but
		 * instead of just ignoring the features, different Hyper-V
		 * versions are either trying to use them and fail or do some
		 * sanity checking and refuse to boot. Filter all unsupported
		 * features out.
		 */
		if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
			nested_evmcs_filter_control_msr(vcpu, msr_info->index,
							&msr_info->data);
#endif
		break;
	/* Intel PT MSRs are readable only in host+guest tracing mode. */
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.ctl;
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.status;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!vmx_pt_mode_is_host_guest() ||
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_cr3_filtering))
			return 1;
		msr_info->data = vmx->pt_desc.guest.cr3_match;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!vmx_pt_mode_is_host_guest() ||
		    (!intel_pt_validate_cap(vmx->pt_desc.caps,
					    PT_CAP_topa_output) &&
		     !intel_pt_validate_cap(vmx->pt_desc.caps,
					    PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_base;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!vmx_pt_mode_is_host_guest() ||
		    (!intel_pt_validate_cap(vmx->pt_desc.caps,
					    PT_CAP_topa_output) &&
		     !intel_pt_validate_cap(vmx->pt_desc.caps,
					    PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_mask;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		/* Even indices are ADDRn_A, odd indices are ADDRn_B. */
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (!vmx_pt_mode_is_host_guest() ||
		    (index >= 2 * vmx->pt_desc.num_address_ranges))
			return 1;
		if (index % 2)
			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
		else
			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
		break;
	/* CET/shadow-stack state is kept in the VMCS. */
	case MSR_IA32_S_CET:
		msr_info->data = vmcs_readl(GUEST_S_CET);
		break;
	case MSR_KVM_INTERNAL_GUEST_SSP:
		msr_info->data = vmcs_readl(GUEST_SSP);
		break;
	case MSR_IA32_INT_SSP_TAB:
		msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
		break;
	case MSR_IA32_DEBUGCTLMSR:
		msr_info->data = vmx_guest_debugctl_read();
		break;
	default:
	find_uret_msr:
		/* Fall back to the user-return MSR list, then common x86 handling. */
		msr = vmx_find_uret_msr(vmx, msr_info->index);
		if (msr) {
			msr_info->data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_info);
	}

	return 0;
}
2283
/*
 * Truncate a SYSENTER address MSR value to the guest's natural width: keep
 * the full 64 bits only when the guest supports long mode.
 */
static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
					     u64 data)
{
#ifdef CONFIG_X86_64
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return (unsigned long)data;

	return (u32)data;
#else
	return (unsigned long)data;
#endif
}
2293
vmx_get_supported_debugctl(struct kvm_vcpu * vcpu,bool host_initiated)2294 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2295 {
2296 u64 debugctl = 0;
2297
2298 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2299 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2300 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2301
2302 if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
2303 (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2304 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2305
2306 if (boot_cpu_has(X86_FEATURE_RTM) &&
2307 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
2308 debugctl |= DEBUGCTLMSR_RTM_DEBUG;
2309
2310 return debugctl;
2311 }
2312
/*
 * Check a DEBUGCTL write against the supported mask. Unsupported BTF/LBR
 * bits are warned about once and then tolerated (they are dropped by the
 * caller via the supported mask); any other unsupported bit fails the write.
 */
bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
{
	u64 unsupported = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);

	if (unsupported & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
		kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
		unsupported &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
	}

	return !unsupported;
}
2324
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	int ret = 0;
	u32 msr_index = msr_info->index;
	u64 data = msr_info->data;
	u32 index;

	switch (msr_index) {
	case MSR_EFER:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		/* Writing a segment base invalidates the cached segment state. */
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_write_guest_kernel_gs_base(vmx, data);
		break;
	case MSR_IA32_XFD:
		ret = kvm_set_msr_common(vcpu, msr_info);
		/*
		 * Always intercepting WRMSR could incur non-negligible
		 * overhead given xfd might be changed frequently in
		 * guest context switch. Disable write interception
		 * upon the first write with a non-zero value (indicating
		 * potential usage on dynamic xfeatures). Also update
		 * exception bitmap to trap #NM for proper virtualization
		 * of guest xfd_err.
		 */
		if (!ret && data) {
			vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
						      MSR_TYPE_RW);
			vcpu->arch.xfd_no_write_intercept = true;
			vmx_update_exception_bitmap(vcpu);
		}
		break;
#endif
	/*
	 * SYSENTER MSRs: while L2 is active, mirror the write into vmcs12 so
	 * the nested state stays in sync with the hardware value.
	 */
	case MSR_IA32_SYSENTER_CS:
		if (is_guest_mode(vcpu))
			get_vmcs12(vcpu)->guest_sysenter_cs = data;
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_eip = data;
		}
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_esp = data;
		}
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_DEBUGCTLMSR:
		if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
			return 1;

		/* Drop tolerated-but-unsupported bits (see vmx_is_valid_debugctl()). */
		data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);

		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
		    VM_EXIT_SAVE_DEBUG_CONTROLS)
			get_vmcs12(vcpu)->guest_ia32_debugctl = data;

		vmx_guest_debugctl_write(vcpu, data);

		/* Lazily create the LBR perf event on first LBR enable. */
		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
		    (data & DEBUGCTLMSR_LBR))
			intel_pmu_create_guest_lbr_event(vcpu);
		return 0;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
		    (data & MSR_IA32_BNDCFGS_RSVD))
			return 1;

		if (is_guest_mode(vcpu) &&
		    ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
		     (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
			get_vmcs12(vcpu)->guest_bndcfgs = data;

		vmcs_write64(GUEST_BNDCFGS, data);
		break;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
			return 1;

		vmx->msr_ia32_umwait_control = data;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		if (kvm_spec_ctrl_test_value(data))
			return 1;

		vmx->spec_ctrl = data;
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging. We update the vmcs01 here for L1 as well
		 * since it will end up touching the MSR anyway now.
		 */
		vmx_disable_intercept_for_msr(vcpu,
					      MSR_IA32_SPEC_CTRL,
					      MSR_TYPE_RW);
		break;
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
			return 1;
		goto find_uret_msr;
	case MSR_IA32_CR_PAT:
		ret = kvm_set_msr_common(vcpu, msr_info);
		if (ret)
			break;

		if (is_guest_mode(vcpu) &&
		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
			get_vmcs12(vcpu)->guest_ia32_pat = data;

		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
			vmcs_write64(GUEST_IA32_PAT, data);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if ((!msr_info->host_initiated &&
		     !(to_vmx(vcpu)->msr_ia32_feature_control &
		       FEAT_CTL_LMCE_ENABLED)) ||
		    (data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = data;
		break;
	case MSR_IA32_FEAT_CTL:
		if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
			return 1;

		vmx->msr_ia32_feature_control = data;
		if (msr_info->host_initiated && data == 0)
			vmx_leave_nested(vcpu);

		/* SGX may be enabled/disabled by guest's firmware */
		vmx_write_encls_bitmap(vcpu, NULL);
		break;
	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
		/*
		 * On real hardware, the LE hash MSRs are writable before
		 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
		 * at which point SGX related bits in IA32_FEATURE_CONTROL
		 * become writable.
		 *
		 * KVM does not emulate SGX activation for simplicity, so
		 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
		 * is unlocked. This is technically not architectural
		 * behavior, but it's close enough.
		 */
		if (!msr_info->host_initiated &&
		    (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
		     ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
		      !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
			return 1;
		vmx->msr_ia32_sgxlepubkeyhash
			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
		break;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!msr_info->host_initiated)
			return 1; /* they are read-only */
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return 1;
		return vmx_set_vmx_msr(vcpu, msr_index, data);
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest() ||
		    vmx_rtit_ctl_check(vcpu, data) ||
		    vmx->nested.vmxon)
			return 1;
		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
		vmx->pt_desc.guest.ctl = data;
		pt_update_intercept_for_msr(vcpu);
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (data & MSR_IA32_RTIT_STATUS_MASK)
			return 1;
		vmx->pt_desc.guest.status = data;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_cr3_filtering))
			return 1;
		vmx->pt_desc.guest.cr3_match = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		if (!pt_output_base_valid(vcpu, data))
			return 1;
		vmx->pt_desc.guest.output_base = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		vmx->pt_desc.guest.output_mask = data;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		if (!pt_can_write_msr(vmx))
			return 1;
		/* Even indices are ADDRn_A, odd indices are ADDRn_B. */
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (index >= 2 * vmx->pt_desc.num_address_ranges)
			return 1;
		if (is_noncanonical_msr_address(data, vcpu))
			return 1;
		if (index % 2)
			vmx->pt_desc.guest.addr_b[index / 2] = data;
		else
			vmx->pt_desc.guest.addr_a[index / 2] = data;
		break;
	/* CET/shadow-stack state is kept in the VMCS. */
	case MSR_IA32_S_CET:
		vmcs_writel(GUEST_S_CET, data);
		break;
	case MSR_KVM_INTERNAL_GUEST_SSP:
		vmcs_writel(GUEST_SSP, data);
		break;
	case MSR_IA32_INT_SSP_TAB:
		vmcs_writel(GUEST_INTR_SSP_TABLE, data);
		break;
	case MSR_IA32_PERF_CAPABILITIES:
		if (data & PERF_CAP_LBR_FMT) {
			if ((data & PERF_CAP_LBR_FMT) !=
			    (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
				return 1;
			if (!cpuid_model_is_consistent(vcpu))
				return 1;
		}
		if (data & PERF_CAP_PEBS_FORMAT) {
			if ((data & PERF_CAP_PEBS_MASK) !=
			    (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
				return 1;
			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
				return 1;
			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
				return 1;
			if (!cpuid_model_is_consistent(vcpu))
				return 1;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;

	default:
	find_uret_msr:
		/* Fall back to the user-return MSR list, then common x86 handling. */
		msr = vmx_find_uret_msr(vmx, msr_index);
		if (msr)
			ret = vmx_set_guest_uret_msr(vmx, msr, data);
		else
			ret = kvm_set_msr_common(vcpu, msr_info);
	}

	/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
	if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
		vmx_update_fb_clear_dis(vcpu, vmx);

	return ret;
}
2632
/*
 * Refresh a register's cached value from the VMCS (or derived state) and mark
 * it available in KVM's register cache.
 */
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	unsigned long guest_owned_bits;

	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REGS_RIP:
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		break;
	case VCPU_EXREG_PDPTR:
		if (enable_ept)
			ept_save_pdptrs(vcpu);
		break;
	case VCPU_EXREG_CR0:
		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

		/* Guest-owned bits come from hardware; KVM owns the rest. */
		vcpu->arch.cr0 &= ~guest_owned_bits;
		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
		break;
	case VCPU_EXREG_CR3:
		/*
		 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
		 * CR3 is loaded into hardware, not the guest's CR3.
		 */
		if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
		break;
	case VCPU_EXREG_CR4:
		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

		vcpu->arch.cr4 &= ~guest_owned_bits;
		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
		break;
	default:
		/* No other register is ever cached through this path. */
		KVM_BUG_ON(1, vcpu->kvm);
		break;
	}
}
2675
2676 /*
2677 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2678 * directly instead of going through cpu_has(), to ensure KVM is trapping
2679 * ENCLS whenever it's supported in hardware. It does not matter whether
2680 * the host OS supports or has enabled SGX.
2681 */
cpu_has_sgx(void)2682 static bool cpu_has_sgx(void)
2683 {
2684 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2685 }
2686
/*
 * Adjust a desired set of VMX controls against the capability MSR: force on
 * the must-be-one bits and clear unsupported optional bits. Fails with -EIO
 * if any required (ctl_min) bit is not supported by hardware.
 */
static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
	u32 allowed0, allowed1;
	u32 ctrl = ctl_min | ctl_opt;

	rdmsr(msr, allowed0, allowed1);

	ctrl &= allowed1;	/* bit == 0 in high word ==> must be zero */
	ctrl |= allowed0;	/* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctrl)
		return -EIO;

	*result = ctrl;
	return 0;
}
2704
/*
 * 64-bit variant for control fields whose capability MSR is a plain
 * allowed-1 mask (no must-be-one bits): keep only supported optional bits.
 */
static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
	u64 allowed1;

	rdmsrq(msr, allowed1);

	return ctl_opt & allowed1;
}
2713
/*
 * Sanity check paired VM-Entry/VM-Exit controls: if exactly one control of a
 * pair is enabled, warn once, strip BOTH controls so KVM never runs with a
 * mismatched pair, and evaluate to -EIO when error_on_inconsistent_vmcs_config
 * is set (0 otherwise). Implemented as a statement-expression macro so it can
 * modify the caller's control variables in place.
 */
#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls)	\
({										\
	int i, r = 0;								\
										\
	BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls));	\
	BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls));	\
										\
	for (i = 0; i < ARRAY_SIZE(pairs); i++) {				\
		typeof(entry_controls) n_ctrl = pairs[i].entry_control;		\
		typeof(exit_controls) x_ctrl = pairs[i].exit_control;		\
										\
		if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl))	\
			continue;						\
										\
		pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, "		\
			     "entry = %llx (%llx), exit = %llx (%llx)\n",	\
			     (u64)(entry_controls & n_ctrl), (u64)n_ctrl,	\
			     (u64)(exit_controls & x_ctrl), (u64)x_ctrl);	\
										\
		if (error_on_inconsistent_vmcs_config)				\
			r = -EIO;						\
										\
		entry_controls &= ~n_ctrl;					\
		exit_controls &= ~x_ctrl;					\
	}									\
	r;									\
})
2741
setup_vmcs_config(struct vmcs_config * vmcs_conf,struct vmx_capability * vmx_cap)2742 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2743 struct vmx_capability *vmx_cap)
2744 {
2745 u32 _pin_based_exec_control = 0;
2746 u32 _cpu_based_exec_control = 0;
2747 u32 _cpu_based_2nd_exec_control = 0;
2748 u64 _cpu_based_3rd_exec_control = 0;
2749 u32 _vmexit_control = 0;
2750 u32 _vmentry_control = 0;
2751 u64 basic_msr;
2752 u64 misc_msr;
2753
2754 /*
2755 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2756 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2757 * intercepts writes to PAT and EFER, i.e. never enables those controls.
2758 */
2759 struct {
2760 u32 entry_control;
2761 u32 exit_control;
2762 } const vmcs_entry_exit_pairs[] = {
2763 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2764 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2765 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2766 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2767 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2768 { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
2769 };
2770
2771 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2772
2773 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2774 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2775 MSR_IA32_VMX_PROCBASED_CTLS,
2776 &_cpu_based_exec_control))
2777 return -EIO;
2778 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2779 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2780 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2781 MSR_IA32_VMX_PROCBASED_CTLS2,
2782 &_cpu_based_2nd_exec_control))
2783 return -EIO;
2784 }
2785 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
2786 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2787
2788 #ifndef CONFIG_X86_64
2789 if (!(_cpu_based_2nd_exec_control &
2790 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2791 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2792 #endif
2793
2794 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2795 _cpu_based_2nd_exec_control &= ~(
2796 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2797 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2798 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2799
2800 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2801 &vmx_cap->ept, &vmx_cap->vpid);
2802
2803 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2804 vmx_cap->ept) {
2805 pr_warn_once("EPT CAP should not exist if not support "
2806 "1-setting enable EPT VM-execution control\n");
2807
2808 if (error_on_inconsistent_vmcs_config)
2809 return -EIO;
2810
2811 vmx_cap->ept = 0;
2812 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2813 }
2814 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2815 vmx_cap->vpid) {
2816 pr_warn_once("VPID CAP should not exist if not support "
2817 "1-setting enable VPID VM-execution control\n");
2818
2819 if (error_on_inconsistent_vmcs_config)
2820 return -EIO;
2821
2822 vmx_cap->vpid = 0;
2823 }
2824
2825 if (!cpu_has_sgx())
2826 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2827
2828 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2829 _cpu_based_3rd_exec_control =
2830 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2831 MSR_IA32_VMX_PROCBASED_CTLS3);
2832
2833 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2834 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2835 MSR_IA32_VMX_EXIT_CTLS,
2836 &_vmexit_control))
2837 return -EIO;
2838
2839 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2840 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2841 MSR_IA32_VMX_PINBASED_CTLS,
2842 &_pin_based_exec_control))
2843 return -EIO;
2844
2845 if (cpu_has_broken_vmx_preemption_timer())
2846 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2847 if (!(_cpu_based_2nd_exec_control &
2848 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2849 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2850
2851 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2852 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2853 MSR_IA32_VMX_ENTRY_CTLS,
2854 &_vmentry_control))
2855 return -EIO;
2856
2857 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
2858 _vmentry_control, _vmexit_control))
2859 return -EIO;
2860
2861 /*
2862 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2863 * can't be used due to an errata where VM Exit may incorrectly clear
2864 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
2865 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2866 */
2867 switch (boot_cpu_data.x86_vfm) {
2868 case INTEL_NEHALEM_EP: /* AAK155 */
2869 case INTEL_NEHALEM: /* AAP115 */
2870 case INTEL_WESTMERE: /* AAT100 */
2871 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
2872 case INTEL_NEHALEM_EX: /* BA97 */
2873 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2874 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2875 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2876 "does not work properly. Using workaround\n");
2877 break;
2878 default:
2879 break;
2880 }
2881
2882 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
2883
2884 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2885 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
2886 return -EIO;
2887
2888 #ifdef CONFIG_X86_64
2889 /*
2890 * KVM expects to be able to shove all legal physical addresses into
2891 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
2892 * 0 for processors that support Intel 64 architecture".
2893 */
2894 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
2895 return -EIO;
2896 #endif
2897
2898 /* Require Write-Back (WB) memory type for VMCS accesses. */
2899 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
2900 return -EIO;
2901
2902 rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
2903
2904 vmcs_conf->basic = basic_msr;
2905 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2906 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2907 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2908 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2909 vmcs_conf->vmexit_ctrl = _vmexit_control;
2910 vmcs_conf->vmentry_ctrl = _vmentry_control;
2911 vmcs_conf->misc = misc_msr;
2912
2913 #if IS_ENABLED(CONFIG_HYPERV)
2914 if (enlightened_vmcs)
2915 evmcs_sanitize_exec_ctrls(vmcs_conf);
2916 #endif
2917
2918 return 0;
2919 }
2920
__kvm_is_vmx_supported(void)2921 static bool __kvm_is_vmx_supported(void)
2922 {
2923 int cpu = smp_processor_id();
2924
2925 if (!(cpuid_ecx(1) & feature_bit(VMX))) {
2926 pr_err("VMX not supported by CPU %d\n", cpu);
2927 return false;
2928 }
2929
2930 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2931 !this_cpu_has(X86_FEATURE_VMX)) {
2932 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2933 return false;
2934 }
2935
2936 return true;
2937 }
2938
kvm_is_vmx_supported(void)2939 static bool kvm_is_vmx_supported(void)
2940 {
2941 bool supported;
2942
2943 migrate_disable();
2944 supported = __kvm_is_vmx_supported();
2945 migrate_enable();
2946
2947 return supported;
2948 }
2949
/*
 * Verify this CPU's VMX configuration matches the reference config captured
 * during module load, dumping any mismatching 32-bit words.  Returns 0 on
 * success, -EIO if VMX is unsupported or the configs diverge.
 */
int vmx_check_processor_compat(void)
{
	int cpu = raw_smp_processor_id();
	struct vmcs_config vmcs_conf;
	struct vmx_capability vmx_cap;

	if (!__kvm_is_vmx_supported())
		return -EIO;

	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
		pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
		return -EIO;
	}
	if (nested)
		nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);

	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
		/* Compare word-by-word to pinpoint exactly what differs. */
		u32 *gold = (void *)&vmcs_config;
		u32 *mine = (void *)&vmcs_conf;
		int i;

		/* The word-wise walk below assumes a u32-multiple size. */
		BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32));

		pr_err("VMCS config on CPU %d doesn't match reference config:", cpu);
		for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) {
			if (gold[i] == mine[i])
				continue;

			pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x",
				i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]);
		}
		pr_cont("\n");
		return -EIO;
	}
	return 0;
}
2986
/*
 * Set CR4.VMXE and execute VMXON with @vmxon_pointer.  A faulting VMXON is
 * caught via the exception table; CR4.VMXE is rolled back and -EFAULT is
 * returned.  Returns 0 on success.
 */
static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
	u64 msr;

	cr4_set_bits(X86_CR4_VMXE);

	asm goto("1: vmxon %[vmxon_pointer]\n\t"
		 _ASM_EXTABLE(1b, %l[fault])
		 : : [vmxon_pointer] "m"(vmxon_pointer)
		 : : fault);
	return 0;

fault:
	/* Dump MSR_IA32_FEAT_CTL to aid diagnosing why VMXON faulted. */
	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
		  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
	cr4_clear_bits(X86_CR4_VMXE);

	return -EFAULT;
}
3006
/*
 * Enable VMX on the current CPU by executing VMXON on its per-CPU VMXON
 * region.  Returns -EBUSY if CR4.VMXE is already set (VMX already in use),
 * -EFAULT if the eVMCS VP assist page is missing or VMXON faults.
 */
int vmx_enable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	int r;

	if (cr4_read_shadow() & X86_CR4_VMXE)
		return -EBUSY;

	/*
	 * This can happen if we hot-added a CPU but failed to allocate
	 * VP assist page for it.
	 */
	if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
		return -EFAULT;

	/* Notify perf before VMXON; undone below if VMXON fails. */
	intel_pt_handle_vmx(1);

	r = kvm_cpu_vmxon(phys_addr);
	if (r) {
		intel_pt_handle_vmx(0);
		return r;
	}

	return 0;
}
3033
vmclear_local_loaded_vmcss(void)3034 static void vmclear_local_loaded_vmcss(void)
3035 {
3036 int cpu = raw_smp_processor_id();
3037 struct loaded_vmcs *v, *n;
3038
3039 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3040 loaded_vmcss_on_cpu_link)
3041 __loaded_vmcs_clear(v);
3042 }
3043
/*
 * Disable VMX on the current CPU.  Ordering matters: all loaded VMCSes must
 * be VMCLEARed before VMXOFF, and perf is notified only after VMX is off.
 */
void vmx_disable_virtualization_cpu(void)
{
	vmclear_local_loaded_vmcss();

	/* A faulting VMXOFF means VMX was already off, i.e. state is bogus. */
	if (kvm_cpu_vmxoff())
		kvm_spurious_fault();

	hv_reset_evmcs();

	intel_pt_handle_vmx(0);
}
3055
/*
 * Allocate and zero a VMCS region on @cpu's NUMA node.  The header revision
 * is the eVMCS version when enlightened VMCS is in use, otherwise the
 * revision reported by MSR_IA32_VMX_BASIC.  Returns NULL on OOM.
 */
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
	struct page *page = __alloc_pages_node(cpu_to_node(cpu), flags, 0);
	struct vmcs *vmcs;

	if (!page)
		return NULL;

	vmcs = page_address(page);
	memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));

	/* KVM supports Enlightened VMCS v1 only */
	vmcs->hdr.revision_id = kvm_is_using_evmcs() ?
				KVM_EVMCS_VERSION :
				vmx_basic_vmcs_revision_id(vmcs_config.basic);

	if (shadow)
		vmcs->hdr.shadow_vmcs = 1;

	return vmcs;
}
3078
/* Release the single page backing @vmcs. */
void free_vmcs(struct vmcs *vmcs)
{
	unsigned long addr = (unsigned long)vmcs;

	free_page(addr);
}
3083
/*
 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
 */
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	/* Nothing to do if the VMCS was never allocated (or already freed). */
	if (!loaded_vmcs->vmcs)
		return;
	loaded_vmcs_clear(loaded_vmcs);
	free_vmcs(loaded_vmcs->vmcs);
	loaded_vmcs->vmcs = NULL;
	if (loaded_vmcs->msr_bitmap)
		free_page((unsigned long)loaded_vmcs->msr_bitmap);
	/* Callers are responsible for freeing any shadow VMCS first. */
	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}
3098
/*
 * Allocate the VMCS and (if supported) MSR bitmap for @loaded_vmcs, and
 * reset its software tracking state.  Returns 0 on success, -ENOMEM on
 * failure, in which case everything already allocated has been freed.
 */
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	loaded_vmcs->vmcs = alloc_vmcs(false);
	if (!loaded_vmcs->vmcs)
		return -ENOMEM;

	vmcs_clear(loaded_vmcs->vmcs);

	/* cpu == -1 means "not loaded on any CPU". */
	loaded_vmcs->shadow_vmcs = NULL;
	loaded_vmcs->hv_timer_soft_disabled = false;
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;

	if (cpu_has_vmx_msr_bitmap()) {
		loaded_vmcs->msr_bitmap = (unsigned long *)
				__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!loaded_vmcs->msr_bitmap)
			goto out_vmcs;
		/* Start with all bits set, i.e. all MSR accesses intercepted. */
		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
	}

	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
	memset(&loaded_vmcs->controls_shadow, 0,
	       sizeof(struct vmcs_controls_shadow));

	return 0;

out_vmcs:
	free_loaded_vmcs(loaded_vmcs);
	return -ENOMEM;
}
3130
free_kvm_area(void)3131 static void free_kvm_area(void)
3132 {
3133 int cpu;
3134
3135 for_each_possible_cpu(cpu) {
3136 free_vmcs(per_cpu(vmxarea, cpu));
3137 per_cpu(vmxarea, cpu) = NULL;
3138 }
3139 }
3140
/*
 * Allocate one VMXON region per possible CPU.  Returns 0 on success; on
 * allocation failure, frees everything allocated so far and returns -ENOMEM.
 */
static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		/*
		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
		 * revision_id reported by MSR_IA32_VMX_BASIC.
		 *
		 * However, even though not explicitly documented by
		 * TLFS, VMXArea passed as VMXON argument should
		 * still be marked with revision_id reported by
		 * physical CPU.
		 */
		if (kvm_is_using_evmcs())
			vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}
3171
/*
 * Restore segment @seg from its cached real-mode value when returning to
 * protected mode.  If invalid guest state is not being emulated, forge
 * RPL/DPL/S so the restored state passes VM-Entry checks.
 */
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
			  struct kvm_segment *save)
{
	if (!emulate_invalid_guest_state) {
		/*
		 * CS and SS RPL should be equal during guest entry according
		 * to VMX spec, but in reality it is not always so. Since vcpu
		 * is in the middle of the transition from real mode to
		 * protected mode it is safe to assume that RPL 0 is a good
		 * default value.
		 */
		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
			save->selector &= ~SEGMENT_RPL_MASK;
		save->dpl = save->selector & SEGMENT_RPL_MASK;
		save->s = 1;
	}
	__vmx_set_segment(vcpu, save, seg);
}
3190
/*
 * Leave VM86-based real-mode emulation: restore protected-mode segment
 * state, the stashed RFLAGS bits, and CR4.VME from the read shadow.
 */
static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Update real mode segment cache. It may be not up-to-date if segment
	 * register was written while vcpu was in a guest mode.
	 */
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 0;

	__vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

	/* Restore the RFLAGS bits that were stashed when entering real mode. */
	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
	vmcs_writel(GUEST_RFLAGS, flags);

	/* Drop the forced CR4.VME; keep whatever VME the guest itself set. */
	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	vmx_update_exception_bitmap(vcpu);

	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}
3228
/*
 * Program segment @seg for VM86-based real-mode emulation.  When invalid
 * guest state is not emulated, synthesize classic real-mode segment state
 * (selector = base >> 4, 64KiB limit, DPL 3 data-ish type).
 */
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	struct kvm_segment var = *save;

	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;

	if (!emulate_invalid_guest_state) {
		var.selector = var.base >> 4;
		var.base = var.base & 0xffff0;
		var.limit = 0xffff;
		var.g = 0;
		var.db = 0;
		var.present = 1;
		var.s = 1;
		var.l = 0;
		var.unusable = 0;
		var.type = 0x3;
		var.avl = 0;
		/* Real-mode selectors can only express paragraph-aligned bases. */
		if (save->base & 0xf)
			pr_warn_once("segment base is not paragraph aligned "
				     "when entering protected mode (seg=%d)", seg);
	}

	vmcs_write16(sf->selector, var.selector);
	vmcs_writel(sf->base, var.base);
	vmcs_write32(sf->limit, var.limit);
	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
3260
/*
 * Enter VM86-based real-mode emulation: snapshot the current segments and
 * RFLAGS, point TR at the VM's real-mode TSS, force RFLAGS.VM/IOPL and
 * CR4.VME, then rewrite all segments into real-mode-compatible form.
 */
static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);

	/*
	 * KVM should never use VM86 to virtualize Real Mode when L2 is active,
	 * as using VM86 is unnecessary if unrestricted guest is enabled, and
	 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
	 * should VM-Fail and KVM should reject userspace attempts to stuff
	 * CR0.PG=0 when L2 is active.
	 */
	WARN_ON_ONCE(is_guest_mode(vcpu));

	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 1;

	vmx_segment_cache_clear(vmx);

	/* TR must reference a valid TSS while running in VM86 mode. */
	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	/* Stash RFLAGS so enter_pmode() can undo the forced VM/IOPL bits. */
	flags = vmcs_readl(GUEST_RFLAGS);
	vmx->rmode.save_rflags = flags;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
	vmx_update_exception_bitmap(vcpu);

	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
}
3308
/*
 * Update the guest's EFER: keep VM_ENTRY_IA32E_MODE in sync with EFER.LMA
 * and refresh the user-return MSR set.  Returns 0 on success; returns 1
 * (after flagging a KVM bug) if LMA is set on a 32-bit host.
 */
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Nothing to do if hardware doesn't support EFER. */
	if (!vmx_find_uret_msr(vmx, MSR_EFER))
		return 0;

	vcpu->arch.efer = efer;
#ifdef CONFIG_X86_64
	if (efer & EFER_LMA)
		vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
	else
		vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
#else
	if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
		return 1;
#endif

	vmx_setup_uret_msrs(vmx);
	return 0;
}
3331
3332 #ifdef CONFIG_X86_64
3333
/*
 * Transition the guest into long mode: fix up TR to the 64-bit busy TSS
 * type if needed (VM-Entry requires it) and set EFER.LMA.
 */
static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	vmx_segment_cache_clear(to_vmx(vcpu));

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
				     __func__);
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
			     | VMX_AR_TYPE_BUSY_64_TSS);
	}
	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}
3350
/* Leave long mode: clear EFER.LMA (called when the guest disables paging). */
static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
3355
3356 #endif
3357
/* Flush all TLB entries for this vCPU: global EPT sync, else VPID sync. */
void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
	 * the CPU is not required to invalidate guest-physical mappings on
	 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
	 * associated with the root EPT structure and not any particular VPID
	 * (INVVPID also isn't required to invalidate guest-physical mappings).
	 */
	if (enable_ept) {
		ept_sync_global();
	} else if (enable_vpid) {
		if (cpu_has_vmx_invvpid_global()) {
			vpid_sync_vcpu_global();
		} else {
			/* No global INVVPID: flush both L1's and L2's VPIDs. */
			vpid_sync_vcpu_single(vmx->vpid);
			vpid_sync_vcpu_single(vmx->nested.vpid02);
		}
	}
}
3380
vmx_get_current_vpid(struct kvm_vcpu * vcpu)3381 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3382 {
3383 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
3384 return nested_get_vpid02(vcpu);
3385 return to_vmx(vcpu)->vpid;
3386 }
3387
/*
 * Build an EPTP for @root_hpa: WB memtype, page-walk level taken from the
 * root's MMU page, and A/D-bit enable when configured.  Returns
 * INVALID_PAGE if a non-dummy root unexpectedly has no MMU page.
 */
static u64 construct_eptp(hpa_t root_hpa)
{
	u64 eptp = root_hpa | VMX_EPTP_MT_WB;
	struct kvm_mmu_page *root;

	/* The dummy root has no shadow page; a 4-level walk is fine for it. */
	if (kvm_mmu_is_dummy_root(root_hpa))
		return eptp | VMX_EPTP_PWL_4;

	/*
	 * EPT roots should always have an associated MMU page. Return a "bad"
	 * EPTP to induce VM-Fail instead of continuing on in a unknown state.
	 */
	root = root_to_sp(root_hpa);
	if (WARN_ON_ONCE(!root))
		return INVALID_PAGE;

	eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (enable_ept_ad_bits && !root->role.ad_disabled)
		eptp |= VMX_EPTP_AD_ENABLE_BIT;

	return eptp;
}
3411
/*
 * Flush guest-physical mappings for @root_hpa's EPTP; fall back to a
 * global EPT flush if a valid EPTP cannot be constructed.
 */
static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
{
	u64 eptp = construct_eptp(root_hpa);

	if (!VALID_PAGE(eptp)) {
		ept_sync_global();
		return;
	}

	ept_sync_context(eptp);
}
3421
/* Flush TLB entries for the current MMU context (EPTP or active VPID). */
void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	u64 root_hpa = mmu->root.hpa;

	/* No flush required if the current context is invalid. */
	if (!VALID_PAGE(root_hpa))
		return;

	if (enable_ept)
		vmx_flush_tlb_ept_root(root_hpa);
	else
		vpid_sync_context(vmx_get_current_vpid(vcpu));
}
3436
/* Flush TLB entries for a single guest virtual address in the active VPID. */
void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
	/*
	 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
	 */
	vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
3445
/* Flush all guest linear mappings for the currently active VPID. */
void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
	/*
	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
	 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
	 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
	 * i.e. no explicit INVVPID is necessary.
	 */
	vpid_sync_context(vmx_get_current_vpid(vcpu));
}
3457
/*
 * Write the cached PDPTRs into the VMCS for a PAE guest; no-op unless the
 * PDPTR register cache is dirty.
 */
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
		return;

	if (is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
	}
}
3472
/*
 * Read the guest's PDPTRs out of the VMCS into the walk-MMU cache and mark
 * the cached copy as available.  Only valid for PAE paging.
 */
void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
		return;

	mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
	mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
	mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
	mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);

	kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
3487
3488 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3489 CPU_BASED_CR3_STORE_EXITING)
3490
vmx_is_valid_cr0(struct kvm_vcpu * vcpu,unsigned long cr0)3491 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3492 {
3493 if (is_guest_mode(vcpu))
3494 return nested_guest_cr0_valid(vcpu, cr0);
3495
3496 if (to_vmx(vcpu)->nested.vmxon)
3497 return nested_host_cr0_valid(vcpu, cr0);
3498
3499 return true;
3500 }
3501
/*
 * Set the guest's CR0.  Derives the hardware CR0 value, handles real <->
 * protected mode transitions via VM86 when unrestricted guest is disabled,
 * long mode entry/exit, and CR3 load/store intercepts for EPT + !URG.
 */
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr0, old_cr0_pg;
	u32 tmp;

	old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);

	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
	if (enable_unrestricted_guest)
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
	else {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
		/* Without EPT, CR0.WP is forced on so KVM can write-protect. */
		if (!enable_ept)
			hw_cr0 |= X86_CR0_WP;

		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
			enter_pmode(vcpu);

		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
			enter_rmode(vcpu);
	}

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, hw_cr0);
	vcpu->arch.cr0 = cr0;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME) {
		if (!old_cr0_pg && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	if (enable_ept && !enable_unrestricted_guest) {
		/*
		 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
		 * KVM's CR3 is installed.
		 */
		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);

		/*
		 * When running with EPT but not unrestricted guest, KVM must
		 * intercept CR3 accesses when paging is _disabled_. This is
		 * necessary because restricted guests can't actually run with
		 * paging disabled, and so KVM stuffs its own CR3 in order to
		 * run the guest when identity mapped page tables.
		 *
		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
		 * update, it may be stale with respect to CR3 interception,
		 * e.g. after nested VM-Enter.
		 *
		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
		 * stores to forward them to L1, even if KVM does not need to
		 * intercept them to preserve its identity mapped page tables.
		 */
		if (!(cr0 & X86_CR0_PG)) {
			exec_controls_setbit(vmx, CR3_EXITING_BITS);
		} else if (!is_guest_mode(vcpu)) {
			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
		} else {
			tmp = exec_controls_get(vmx);
			tmp &= ~CR3_EXITING_BITS;
			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
			exec_controls_set(vmx, tmp);
		}

		/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
		if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

		/*
		 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
		 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
		 */
		if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
			kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
	}

	/* depends on vcpu->arch.cr0 to be set to a new value */
	vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
3590
/* Maximum EPT page-walk level supported by hardware (5 or 4). */
static int vmx_get_max_ept_level(void)
{
	return cpu_has_vmx_ept_5levels() ? 5 : 4;
}
3597
/*
 * Install a new MMU root: program EPT_POINTER when EPT is enabled and
 * refresh vmcs.GUEST_CR3 unless it is already known to be up-to-date.
 */
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;

	if (enable_ept) {
		KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
				root_level != root_to_sp(root_hpa)->role.level);
		vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));

		hv_track_root_tdp(vcpu, root_hpa);

		/* Non-paging restricted guests run on the identity map. */
		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs.GUEST_CR3 is already up-to-date. */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);
	} else {
		/* Shadow paging: GUEST_CR3 holds the root plus PCID/LAM bits. */
		guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
			    kvm_get_active_cr3_lam_bits(vcpu);
	}

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}
3626
vmx_is_valid_cr4(struct kvm_vcpu * vcpu,unsigned long cr4)3627 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3628 {
3629 /*
3630 * We operate under the default treatment of SMM, so VMX cannot be
3631 * enabled under SMM. Note, whether or not VMXE is allowed at all,
3632 * i.e. is a reserved bit, is handled by common x86 code.
3633 */
3634 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3635 return false;
3636
3637 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3638 return false;
3639
3640 return true;
3641 }
3642
/*
 * Set the guest's CR4.  Derives the hardware CR4: host MCE is always kept,
 * UMIP may be emulated via descriptor-table exiting, and PAE/PSE and
 * SMEP/SMAP/PKE are adjusted for non-paging guests when unrestricted guest
 * is disabled.
 */
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr4;

	/*
	 * Pass through host's Machine Check Enable value to hw_cr4, which
	 * is in force while we are in guest mode. Do not let guests control
	 * this bit, even if host CR4.MCE == 0.
	 */
	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
	if (enable_unrestricted_guest)
		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
	else if (vmx->rmode.vm86_active)
		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
	else
		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

	if (vmx_umip_emulated()) {
		/* Emulate UMIP by intercepting descriptor-table instructions. */
		if (cr4 & X86_CR4_UMIP) {
			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
			hw_cr4 &= ~X86_CR4_UMIP;
		} else if (!is_guest_mode(vcpu) ||
			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
		}
	}

	vcpu->arch.cr4 = cr4;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

	if (!enable_unrestricted_guest) {
		if (enable_ept) {
			/* Non-paging guests run with PSE identity tables. */
			if (!is_paging(vcpu)) {
				hw_cr4 &= ~X86_CR4_PAE;
				hw_cr4 |= X86_CR4_PSE;
			} else if (!(cr4 & X86_CR4_PAE)) {
				hw_cr4 &= ~X86_CR4_PAE;
			}
		}

		/*
		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
		 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
		 * to be manually disabled when guest switches to non-paging
		 * mode.
		 *
		 * If !enable_unrestricted_guest, the CPU is always running
		 * with CR0.PG=1 and CR4 needs to be modified.
		 * If enable_unrestricted_guest, the CPU automatically
		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
		 */
		if (!is_paging(vcpu))
			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}

	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);

	/* OSXSAVE/PKE changes affect CPUID bits KVM reports to the guest. */
	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
3706
/*
 * Read segment @seg into @var.  While VM86 emulation is active, segments
 * other than LDTR come from the real-mode cache (with base/selector pulled
 * from the VMCS when the selector was changed behind the cache's back);
 * otherwise all fields are decoded from the VMCS access-rights word.
 */
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 ar;

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		*var = vmx->rmode.segs[seg];
		if (seg == VCPU_SREG_TR
		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
			return;
		var->base = vmx_read_guest_seg_base(vmx, seg);
		var->selector = vmx_read_guest_seg_selector(vmx, seg);
		return;
	}
	var->base = vmx_read_guest_seg_base(vmx, seg);
	var->limit = vmx_read_guest_seg_limit(vmx, seg);
	var->selector = vmx_read_guest_seg_selector(vmx, seg);
	ar = vmx_read_guest_seg_ar(vmx, seg);
	var->unusable = (ar >> 16) & 1;
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
	/*
	 * Some userspaces do not preserve unusable property. Since usable
	 * segment has to be present according to VMX spec we can use present
	 * property to amend userspace bug by making unusable segment always
	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
	 * segment as unusable.
	 */
	var->present = !var->unusable;
	var->avl = (ar >> 12) & 1;
	var->l = (ar >> 13) & 1;
	var->db = (ar >> 14) & 1;
	var->g = (ar >> 15) & 1;
}
3742
vmx_get_segment_base(struct kvm_vcpu * vcpu,int seg)3743 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3744 {
3745 struct kvm_segment s;
3746
3747 if (to_vmx(vcpu)->rmode.vm86_active) {
3748 vmx_get_segment(vcpu, &s, seg);
3749 return s.base;
3750 }
3751 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3752 }
3753
/*
 * Compute the guest's CPL from SS.DPL.  @no_cache forces a raw VMCS read
 * instead of going through the segment cache.  CPL is 0 while emulating
 * Real Mode via VM86.
 */
static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ar;

	if (unlikely(vmx->rmode.vm86_active))
		return 0;

	ar = no_cache ? vmcs_read32(GUEST_SS_AR_BYTES) :
			vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
	return VMX_AR_DPL(ar);
}
3768
/* Current privilege level, using the (possibly cached) SS access rights. */
int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
	return __vmx_get_cpl(vcpu, false);
}
3773
/* Current privilege level, read straight from the VMCS (cache bypassed). */
int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
{
	return __vmx_get_cpl(vcpu, true);
}
3778
vmx_segment_access_rights(struct kvm_segment * var)3779 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3780 {
3781 u32 ar;
3782
3783 ar = var->type & 15;
3784 ar |= (var->s & 1) << 4;
3785 ar |= (var->dpl & 3) << 5;
3786 ar |= (var->present & 1) << 7;
3787 ar |= (var->avl & 1) << 12;
3788 ar |= (var->l & 1) << 13;
3789 ar |= (var->db & 1) << 14;
3790 ar |= (var->g & 1) << 15;
3791 ar |= (var->unusable || !var->present) << 16;
3792
3793 return ar;
3794 }
3795
/*
 * Write segment @seg from @var into the VMCS, or into the real-mode segment
 * cache while VM86 emulation is active (LDTR excepted).  Note: may set the
 * Accessed bit in @var->type for unrestricted guests (see comment below).
 */
void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	vmx_segment_cache_clear(vmx);

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		vmx->rmode.segs[seg] = *var;
		if (seg == VCPU_SREG_TR)
			vmcs_write16(sf->selector, var->selector);
		else if (var->s)
			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
		return;
	}

	vmcs_writel(sf->base, var->base);
	vmcs_write32(sf->limit, var->limit);
	vmcs_write16(sf->selector, var->selector);

	/*
	 * Fix the "Accessed" bit in AR field of segment registers for older
	 * qemu binaries.
	 * IA32 arch specifies that at the time of processor reset the
	 * "Accessed" bit in the AR field of segment registers is 1. And qemu
	 * is setting it to 0 in the userland code. This causes invalid guest
	 * state vmexit when "unrestricted guest" mode is turned on.
	 * Fix for this setup issue in cpu_reset is being pushed in the qemu
	 * tree. Newer qemu binaries with that qemu fix would not need this
	 * kvm hack.
	 */
	if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
		var->type |= 0x1; /* Accessed */

	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}
3832
/* Set a guest segment and recompute whether the guest requires emulation. */
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	__vmx_set_segment(vcpu, var, seg);

	/* Segment state feeds into the "guest state valid" calculation. */
	to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}
3839
vmx_get_cs_db_l_bits(struct kvm_vcpu * vcpu,int * db,int * l)3840 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3841 {
3842 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3843
3844 *db = (ar >> 14) & 1;
3845 *l = (ar >> 13) & 1;
3846 }
3847
vmx_get_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)3848 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3849 {
3850 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3851 dt->address = vmcs_readl(GUEST_IDTR_BASE);
3852 }
3853
vmx_set_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)3854 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3855 {
3856 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3857 vmcs_writel(GUEST_IDTR_BASE, dt->address);
3858 }
3859
vmx_get_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)3860 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3861 {
3862 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3863 dt->address = vmcs_readl(GUEST_GDTR_BASE);
3864 }
3865
vmx_set_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)3866 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3867 {
3868 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3869 vmcs_writel(GUEST_GDTR_BASE, dt->address);
3870 }
3871
rmode_segment_valid(struct kvm_vcpu * vcpu,int seg)3872 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3873 {
3874 struct kvm_segment var;
3875 u32 ar;
3876
3877 vmx_get_segment(vcpu, &var, seg);
3878 var.dpl = 0x3;
3879 if (seg == VCPU_SREG_CS)
3880 var.type = 0x3;
3881 ar = vmx_segment_access_rights(&var);
3882
3883 if (var.base != (var.selector << 4))
3884 return false;
3885 if (var.limit != 0xffff)
3886 return false;
3887 if (ar != 0xf3)
3888 return false;
3889
3890 return true;
3891 }
3892
code_segment_valid(struct kvm_vcpu * vcpu)3893 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3894 {
3895 struct kvm_segment cs;
3896 unsigned int cs_rpl;
3897
3898 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3899 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3900
3901 if (cs.unusable)
3902 return false;
3903 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3904 return false;
3905 if (!cs.s)
3906 return false;
3907 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3908 if (cs.dpl > cs_rpl)
3909 return false;
3910 } else {
3911 if (cs.dpl != cs_rpl)
3912 return false;
3913 }
3914 if (!cs.present)
3915 return false;
3916
3917 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3918 return true;
3919 }
3920
stack_segment_valid(struct kvm_vcpu * vcpu)3921 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3922 {
3923 struct kvm_segment ss;
3924 unsigned int ss_rpl;
3925
3926 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3927 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3928
3929 if (ss.unusable)
3930 return true;
3931 if (ss.type != 3 && ss.type != 7)
3932 return false;
3933 if (!ss.s)
3934 return false;
3935 if (ss.dpl != ss_rpl) /* DPL != RPL */
3936 return false;
3937 if (!ss.present)
3938 return false;
3939
3940 return true;
3941 }
3942
data_segment_valid(struct kvm_vcpu * vcpu,int seg)3943 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3944 {
3945 struct kvm_segment var;
3946 unsigned int rpl;
3947
3948 vmx_get_segment(vcpu, &var, seg);
3949 rpl = var.selector & SEGMENT_RPL_MASK;
3950
3951 if (var.unusable)
3952 return true;
3953 if (!var.s)
3954 return false;
3955 if (!var.present)
3956 return false;
3957 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3958 if (var.dpl < rpl) /* DPL < RPL */
3959 return false;
3960 }
3961
3962 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3963 * rights flags
3964 */
3965 return true;
3966 }
3967
tr_valid(struct kvm_vcpu * vcpu)3968 static bool tr_valid(struct kvm_vcpu *vcpu)
3969 {
3970 struct kvm_segment tr;
3971
3972 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3973
3974 if (tr.unusable)
3975 return false;
3976 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3977 return false;
3978 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3979 return false;
3980 if (!tr.present)
3981 return false;
3982
3983 return true;
3984 }
3985
ldtr_valid(struct kvm_vcpu * vcpu)3986 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3987 {
3988 struct kvm_segment ldtr;
3989
3990 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3991
3992 if (ldtr.unusable)
3993 return true;
3994 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3995 return false;
3996 if (ldtr.type != 2)
3997 return false;
3998 if (!ldtr.present)
3999 return false;
4000
4001 return true;
4002 }
4003
cs_ss_rpl_check(struct kvm_vcpu * vcpu)4004 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
4005 {
4006 struct kvm_segment cs, ss;
4007
4008 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4009 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4010
4011 return ((cs.selector & SEGMENT_RPL_MASK) ==
4012 (ss.selector & SEGMENT_RPL_MASK));
4013 }
4014
4015 /*
4016 * Check if guest state is valid. Returns true if valid, false if
4017 * not.
4018 * We assume that registers are always usable
4019 */
__vmx_guest_state_valid(struct kvm_vcpu * vcpu)4020 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
4021 {
4022 /* real mode guest state checks */
4023 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4024 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
4025 return false;
4026 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
4027 return false;
4028 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
4029 return false;
4030 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
4031 return false;
4032 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
4033 return false;
4034 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
4035 return false;
4036 } else {
4037 /* protected mode guest state checks */
4038 if (!cs_ss_rpl_check(vcpu))
4039 return false;
4040 if (!code_segment_valid(vcpu))
4041 return false;
4042 if (!stack_segment_valid(vcpu))
4043 return false;
4044 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
4045 return false;
4046 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
4047 return false;
4048 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
4049 return false;
4050 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4051 return false;
4052 if (!tr_valid(vcpu))
4053 return false;
4054 if (!ldtr_valid(vcpu))
4055 return false;
4056 }
4057 /* TODO:
4058 * - Add checks on RIP
4059 * - Add checks on RFLAGS
4060 */
4061
4062 return true;
4063 }
4064
/*
 * Initialize the three-page real-mode TSS at userspace address @ua:
 * zero-fill, point the I/O bitmap base past the TSS + interrupt
 * redirection map, and terminate the I/O bitmap with an all-ones byte.
 * Returns 0 on success, -EFAULT if any userspace copy fails.
 */
static int init_rmode_tss(struct kvm *kvm, void __user *ua)
{
	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
	u16 data;
	int i = 0;

	while (i < 3) {
		if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
			return -EFAULT;
		i++;
	}

	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
		return -EFAULT;

	data = ~0;
	if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
		return -EFAULT;

	return 0;
}
4086
/*
 * Lazily build the identity-mapped page table (4MiB PSE entries) used to
 * back EPT while the guest runs in real mode.  Returns 0 on success or if
 * the table was already initialized, a negative errno on failure.
 */
static int init_rmode_identity_map(struct kvm *kvm)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
	int i, r = 0;
	void __user *uaddr;
	u32 tmp;

	/* Protect kvm_vmx->ept_identity_pagetable_done. */
	mutex_lock(&kvm->slots_lock);

	if (likely(kvm_vmx->ept_identity_pagetable_done))
		goto out;

	if (!kvm_vmx->ept_identity_map_addr)
		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;

	/* Back the table with a KVM-internal (private) memslot. */
	uaddr = __x86_set_memory_region(kvm,
					IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
					kvm_vmx->ept_identity_map_addr,
					PAGE_SIZE);
	if (IS_ERR(uaddr)) {
		r = PTR_ERR(uaddr);
		goto out;
	}

	/* Set up identity-mapping pagetable for EPT in real mode */
	for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
		/* One 4MiB PSE entry per PDE, mapping vaddr == paddr. */
		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
		if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
			r = -EFAULT;
			goto out;
		}
	}
	kvm_vmx->ept_identity_pagetable_done = true;

out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}
4127
seg_setup(int seg)4128 static void seg_setup(int seg)
4129 {
4130 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4131 unsigned int ar;
4132
4133 vmcs_write16(sf->selector, 0);
4134 vmcs_writel(sf->base, 0);
4135 vmcs_write32(sf->limit, 0xffff);
4136 ar = 0x93;
4137 if (seg == VCPU_SREG_CS)
4138 ar |= 0x08; /* code segment */
4139
4140 vmcs_write32(sf->ar_bytes, ar);
4141 }
4142
allocate_vpid(void)4143 int allocate_vpid(void)
4144 {
4145 int vpid;
4146
4147 if (!enable_vpid)
4148 return 0;
4149 spin_lock(&vmx_vpid_lock);
4150 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4151 if (vpid < VMX_NR_VPIDS)
4152 __set_bit(vpid, vmx_vpid_bitmap);
4153 else
4154 vpid = 0;
4155 spin_unlock(&vmx_vpid_lock);
4156 return vpid;
4157 }
4158
free_vpid(int vpid)4159 void free_vpid(int vpid)
4160 {
4161 if (!enable_vpid || vpid == 0)
4162 return;
4163 spin_lock(&vmx_vpid_lock);
4164 __clear_bit(vpid, vmx_vpid_bitmap);
4165 spin_unlock(&vmx_vpid_lock);
4166 }
4167
/* Note that vmcs01's MSR bitmap changed so derived state gets refreshed. */
static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
	/*
	 * When KVM is a nested hypervisor on top of Hyper-V and uses
	 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
	 * bitmap has changed.
	 */
	if (kvm_is_using_evmcs()) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		if (evmcs->hv_enlightenments_control.msr_bitmap)
			evmcs->hv_clean_fields &=
				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
	}

	/* Force the nested MSR bitmap to be rebuilt from the new vmcs01 state. */
	vmx->nested.force_msr_bitmap_recalc = true;
}
4185
/*
 * Set or clear interception of an MSR in vmcs01's bitmap for the given
 * access @type (read and/or write).  Pass-through additionally requires
 * that userspace's MSR filter permits the access; interception is forced
 * otherwise.
 */
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *bitmap = vmx->vmcs01.msr_bitmap;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	vmx_msr_bitmap_l01_changed(vmx);

	if (type & MSR_TYPE_R) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			vmx_set_msr_bitmap_read(bitmap, msr);
		else
			vmx_clear_msr_bitmap_read(bitmap, msr);
	}

	if (type & MSR_TYPE_W) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			vmx_set_msr_bitmap_write(bitmap, msr);
		else
			vmx_clear_msr_bitmap_write(bitmap, msr);
	}
}
4210
/*
 * Recompute the x2APIC MSR (0x800+) intercepts in vmcs01's bitmap to match
 * the current x2APIC virtualization and APICv state.
 */
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
	/*
	 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
	 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
	 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
	 */
	const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
	const int write_idx = read_idx + (0x800 / sizeof(u64));
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
	u8 mode;

	if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
		return;

	/* Derive the target mode from the x2APIC virtualization controls. */
	if (cpu_has_secondary_exec_ctrls() &&
	    (secondary_exec_controls_get(vmx) &
	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		mode = MSR_BITMAP_MODE_X2APIC;
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	} else {
		mode = 0;
	}

	/* Nothing to do if the effective mode is unchanged. */
	if (mode == vmx->x2apic_msr_bitmap_mode)
		return;

	vmx->x2apic_msr_bitmap_mode = mode;

	/*
	 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
	 * registers (0x840 and above) intercepted, KVM doesn't support them.
	 * Intercept all writes by default and poke holes as needed. Pass
	 * through reads for all valid registers by default in x2APIC+APICv
	 * mode, only the current timer count needs on-demand emulation by KVM.
	 */
	if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
		msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
	else
		msr_bitmap[read_idx] = ~0ull;
	msr_bitmap[write_idx] = ~0ull;

	/*
	 * TPR reads and writes can be virtualized even if virtual interrupt
	 * delivery is not in use.
	 */
	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
				  !(mode & MSR_BITMAP_MODE_X2APIC));

	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
		vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
		if (enable_ipiv)
			vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
	}
}
4270
pt_update_intercept_for_msr(struct kvm_vcpu * vcpu)4271 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4272 {
4273 struct vcpu_vmx *vmx = to_vmx(vcpu);
4274 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4275 u32 i;
4276
4277 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4278 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4279 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4280 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4281 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4282 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4283 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4284 }
4285 }
4286
/*
 * Recompute PMU MSR intercepts and the VM-Entry/VM-Exit PERF_GLOBAL_CTRL
 * load/save controls.  With a mediated PMU, the MSRs backing the guest's
 * counters are passed through; everything else stays intercepted.  No-op
 * unless mediated PMU support is enabled.
 */
static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
{
	u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
				    VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
	bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intercept = !has_mediated_pmu;
	int i;

	if (!enable_mediated_pmu)
		return;

	/*
	 * If the CPU can't save PERF_GLOBAL_CTRL on VM-Exit, emulate the
	 * save via the MSR auto-store list instead.
	 */
	if (!cpu_has_save_perf_global_ctrl()) {
		vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;

		if (has_mediated_pmu)
			vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
		else
			vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
	}

	vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
				    has_mediated_pmu);

	vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu);

	/* GP counters that the guest PMU provides. */
	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i,
					  MSR_TYPE_RW, intercept);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW,
					  intercept || !fw_writes_is_enabled(vcpu));
	}
	/* Counters that exist in hardware but not the guest PMU stay intercepted. */
	for ( ; i < kvm_pmu_cap.num_counters_gp; i++) {
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i,
					  MSR_TYPE_RW, true);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i,
					  MSR_TYPE_RW, true);
	}

	/* Same split for fixed counters: guest-visible vs. hardware-only. */
	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
		vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i,
					  MSR_TYPE_RW, intercept);
	for ( ; i < kvm_pmu_cap.num_counters_fixed; i++)
		vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i,
					  MSR_TYPE_RW, true);

	/* The global control/status MSRs have their own intercept criteria. */
	intercept = kvm_need_perf_global_ctrl_intercept(vcpu);
	vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS,
				  MSR_TYPE_RW, intercept);
	vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
				  MSR_TYPE_RW, intercept);
	vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
				  MSR_TYPE_RW, intercept);
}
4342
/*
 * (Re)compute the dynamic MSR intercepts for the vCPU, e.g. after a change
 * in guest CPUID or the userspace MSR filter.
 */
static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
	bool intercept;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
#ifdef CONFIG_X86_64
	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
	if (kvm_cstate_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
	}
	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
	}

	/* PT MSRs can be passed through iff PT is exposed to the guest. */
	if (vmx_pt_mode_is_host_guest())
		pt_update_intercept_for_msr(vcpu);

	if (vcpu->arch.xfd_no_write_intercept)
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);

	/* Pass through SPEC_CTRL only once the cached value is non-zero. */
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
				  !to_vmx(vcpu)->spec_ctrl);

	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));

	if (cpu_feature_enabled(X86_FEATURE_IBPB))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
					  !guest_has_pred_cmd_msr(vcpu));

	if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

	/* CET shadow-stack MSRs follow guest SHSTK support. */
	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
		intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);

		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
	}

	/* U_CET/S_CET are shared by SHSTK and IBT; intercept only if neither is exposed. */
	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
		intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
			    !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);

		vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
	}

	vmx_recalc_pmu_msr_intercepts(vcpu);

	/*
	 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
	 * filtered by userspace.
	 */
}
4416
/* Toggle RDPMC interception to match whether KVM must intercept RDPMC. */
static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
	exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING,
				kvm_need_rdpmc_intercept(vcpu));
}
4422
/* Recompute all dynamic instruction and MSR intercepts for the vCPU. */
void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
{
	vmx_recalc_instruction_intercepts(vcpu);
	vmx_recalc_msr_intercepts(vcpu);
}
4428
/*
 * Try to deliver @vector as a nested (L1->L2) posted interrupt.  Returns 0
 * if it was handled as such, -1 if the vector doesn't match the vCPU's
 * cached L2 posted-interrupt notification vector.
 */
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
	 * and freed, and must not be accessed outside of vcpu->mutex. The
	 * vCPU's cached PI NV is valid if and only if posted interrupts
	 * enabled in its vmcs12, i.e. checking the vector also checks that
	 * L1 has enabled posted interrupts for L2.
	 */
	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/*
		 * If a posted intr is not recognized by hardware,
		 * we will accomplish it in the next vmentry.
		 */
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);

		/*
		 * This pairs with the smp_mb_*() after setting vcpu->mode in
		 * vcpu_enter_guest() to guarantee the vCPU sees the event
		 * request if triggering a posted interrupt "fails" because
		 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
		 * the smb_wmb() in kvm_make_request() only ensures everything
		 * done before making the request is visible when the request
		 * is visible, it doesn't ensure ordering between the store to
		 * vcpu->requests and the load from vcpu->mode.
		 */
		smp_mb__after_atomic();

		/* the PIR and ON have been set by L1. */
		kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
		return 0;
	}
	return -1;
}
/*
 * Send an interrupt to a vCPU via the posted-interrupt mechanism.
 * 1. If the target vCPU is running (non-root mode), send a posted-interrupt
 *    notification and hardware will sync the PIR to the vIRR atomically.
 * 2. If the target vCPU isn't running (root mode), kick it so that it picks
 *    up the interrupt from the PIR on the next VM-Entry.
 */
vmx_deliver_posted_interrupt(struct kvm_vcpu * vcpu,int vector)4475 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4476 {
4477 struct vcpu_vt *vt = to_vt(vcpu);
4478 int r;
4479
4480 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4481 if (!r)
4482 return 0;
4483
4484 /* Note, this is called iff the local APIC is in-kernel. */
4485 if (!vcpu->arch.apic->apicv_active)
4486 return -1;
4487
4488 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
4489 return 0;
4490 }
4491
vmx_deliver_interrupt(struct kvm_lapic * apic,int delivery_mode,int trig_mode,int vector)4492 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4493 int trig_mode, int vector)
4494 {
4495 struct kvm_vcpu *vcpu = apic->vcpu;
4496
4497 if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4498 kvm_lapic_set_irr(vector, apic);
4499 kvm_make_request(KVM_REQ_EVENT, vcpu);
4500 kvm_vcpu_kick(vcpu);
4501 } else {
4502 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4503 trig_mode, vector);
4504 }
4505 }
4506
4507 /*
4508 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4509 * will not change in the lifetime of the guest.
4510 * Note that host-state that does change is set elsewhere. E.g., host-state
4511 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4512 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
	u32 low32, high32;
	unsigned long tmpl;
	unsigned long cr0, cr3, cr4;

	cr0 = read_cr0();
	WARN_ON(cr0 & X86_CR0_TS);
	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

	/*
	 * Save the most likely value for this task's CR3 in the VMCS.
	 * We can't use __get_current_cr3_fast() because we're not atomic.
	 */
	cr3 = __read_cr3();
	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
	vmx->loaded_vmcs->host_state.cr3 = cr3;

	/* Save the most likely value for this task's CR4 in the VMCS. */
	cr4 = cr4_read_shadow();
	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
	vmx->loaded_vmcs->host_state.cr4 = cr4;

	/* Host segment selectors restored by hardware on VM-Exit. */
	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	/*
	 * Load null selectors, so we can avoid reloading them in
	 * vmx_prepare_switch_to_host(), in case userspace uses
	 * the null selectors too (the expected case).
	 */
	vmcs_write16(HOST_DS_SELECTOR, 0);
	vmcs_write16(HOST_ES_SELECTOR, 0);
#else
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */

	/* VM-Exit resumes the host at the common VMX exit trampoline. */
	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);

	/*
	 * SYSENTER is used for 32-bit system calls on either 32-bit or
	 * 64-bit kernels.  It is always zero If neither is allowed, otherwise
	 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
	 * have already done so!).
	 */
	if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
		vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);

	rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

	/* Host PAT is loaded on exit only if the VMCS supports doing so. */
	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
		rdmsr(MSR_IA32_CR_PAT, low32, high32);
		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
	}

	if (cpu_has_load_ia32_efer())
		vmcs_write64(HOST_IA32_EFER, kvm_host.efer);

	/*
	 * Supervisor shadow stack is not enabled on host side, i.e.,
	 * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM
	 * description(RDSSP instruction), SSP is not readable in CPL0,
	 * so resetting the two registers to 0s at VM-Exit does no harm
	 * to kernel execution. When execution flow exits to userspace,
	 * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
	 * 3 and 4 for details.
	 */
	if (cpu_has_load_cet_ctrl()) {
		vmcs_writel(HOST_S_CET, kvm_host.s_cet);
		vmcs_writel(HOST_SSP, 0);
		vmcs_writel(HOST_INTR_SSP_TABLE, 0);
	}

	/*
	 * When running a guest with a mediated PMU, guest state is resident in
	 * hardware after VM-Exit.  Zero PERF_GLOBAL_CTRL on exit so that host
	 * activity doesn't bleed into the guest counters.  When running with
	 * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every
	 * entry/exit to merge guest and host PMU usage.
	 */
	if (enable_mediated_pmu)
		vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0);
}
4604
set_cr4_guest_host_mask(struct vcpu_vmx * vmx)4605 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4606 {
4607 struct kvm_vcpu *vcpu = &vmx->vcpu;
4608
4609 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4610 ~vcpu->arch.cr4_guest_rsvd_bits;
4611 if (!enable_ept) {
4612 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4613 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4614 }
4615 if (is_guest_mode(&vmx->vcpu))
4616 vcpu->arch.cr4_guest_owned_bits &=
4617 ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4618 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4619 }
4620
vmx_pin_based_exec_ctrl(struct vcpu_vmx * vmx)4621 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4622 {
4623 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4624
4625 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4626 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4627
4628 if (!enable_vnmi)
4629 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4630
4631 if (!enable_preemption_timer)
4632 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4633
4634 return pin_based_exec_ctrl;
4635 }
4636
vmx_get_initial_vmentry_ctrl(void)4637 static u32 vmx_get_initial_vmentry_ctrl(void)
4638 {
4639 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4640
4641 if (vmx_pt_mode_is_system())
4642 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4643 VM_ENTRY_LOAD_IA32_RTIT_CTL);
4644 /*
4645 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4646 */
4647 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4648 VM_ENTRY_LOAD_IA32_EFER |
4649 VM_ENTRY_IA32E_MODE);
4650
4651 return vmentry_ctrl;
4652 }
4653
vmx_get_initial_vmexit_ctrl(void)4654 static u32 vmx_get_initial_vmexit_ctrl(void)
4655 {
4656 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4657
4658 /*
4659 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4660 * nested virtualization and thus allowed to be set in vmcs12.
4661 */
4662 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4663 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4664
4665 if (vmx_pt_mode_is_system())
4666 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4667 VM_EXIT_CLEAR_IA32_RTIT_CTL);
4668 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4669 return vmexit_ctrl &
4670 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER |
4671 VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL);
4672 }
4673
/*
 * Refresh the APICv-related execution controls and the x2APIC MSR
 * intercepts after an APICv (de)activation.
 */
void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* All updates below target vmcs01. */
	guard(vmx_vmcs01)(vcpu);

	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

	secondary_exec_controls_changebit(vmx,
					  SECONDARY_EXEC_APIC_REGISTER_VIRT |
					  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
					  kvm_vcpu_apicv_active(vcpu));
	/* IPI virtualization is only usable while APICv is active. */
	if (enable_ipiv)
		tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
						 kvm_vcpu_apicv_active(vcpu));

	vmx_update_msr_bitmap_x2apic(vcpu);
}
4692
/* Compute the initial primary processor-based VM-execution controls. */
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

	/*
	 * Not used by KVM, but fully supported for nesting, i.e. are allowed in
	 * vmcs12 and propagated to vmcs02 when set in vmcs12.
	 */
	exec_control &= ~(CPU_BASED_RDTSC_EXITING |
			  CPU_BASED_USE_IO_BITMAPS |
			  CPU_BASED_MONITOR_TRAP_FLAG |
			  CPU_BASED_PAUSE_EXITING);

	/* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
	exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
			  CPU_BASED_NMI_WINDOW_EXITING);

	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
		exec_control &= ~CPU_BASED_MOV_DR_EXITING;

	if (!cpu_need_tpr_shadow(&vmx->vcpu))
		exec_control &= ~CPU_BASED_TPR_SHADOW;

#ifdef CONFIG_X86_64
	/* Without a TPR shadow, all CR8 accesses must be intercepted. */
	if (exec_control & CPU_BASED_TPR_SHADOW)
		exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
				  CPU_BASED_CR8_STORE_EXITING);
	else
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	/* No need to intercept CR3 access or INVPLG when using EPT. */
	if (enable_ept)
		exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
				  CPU_BASED_CR3_STORE_EXITING |
				  CPU_BASED_INVLPG_EXITING);
	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
				  CPU_BASED_MONITOR_EXITING);
	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
		exec_control &= ~CPU_BASED_HLT_EXITING;
	return exec_control;
}
4736
vmx_tertiary_exec_control(struct vcpu_vmx * vmx)4737 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4738 {
4739 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4740
4741 /*
4742 * IPI virtualization relies on APICv. Disable IPI virtualization if
4743 * APICv is inhibited.
4744 */
4745 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4746 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4747
4748 return exec_control;
4749 }
4750
/*
 * Adjust a single secondary execution control bit to intercept/allow an
 * instruction in the guest.  This is usually done based on whether or not a
 * feature has been exposed to the guest in order to correctly emulate faults.
 */
static inline void
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
				  u32 control, bool enabled, bool exiting)
{
	/*
	 * If the control is for an opt-in feature, clear the control if the
	 * feature is not exposed to the guest, i.e. not enabled.  If the
	 * control is opt-out, i.e. an exiting control, clear the control if
	 * the feature _is_ exposed to the guest, i.e. exiting/interception is
	 * disabled for the associated instruction.  Note, the caller is
	 * responsible presetting exec_control to set all supported bits.
	 */
	if (enabled == exiting)
		*exec_control &= ~control;

	/*
	 * Update the nested MSR settings so that a nested VMM can/can't set
	 * controls for features that are/aren't exposed to the guest.
	 */
	if (nested &&
	    kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
		/*
		 * All features that can be added or removed to VMX MSRs must
		 * be supported in the first place for nested virtualization.
		 */
		if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
			enabled = false;

		/* Mirror the host-side decision into the nested VMX MSRs. */
		if (enabled)
			vmx->nested.msrs.secondary_ctls_high |= control;
		else
			vmx->nested.msrs.secondary_ctls_high &= ~control;
	}
}
4790
/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 * Silently does nothing when the hardware capability (cpu_has_vmx_<name>())
 * is absent.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({													\
	struct kvm_vcpu *__vcpu = &(vmx)->vcpu;								\
	bool __enabled;											\
													\
	if (cpu_has_vmx_##name()) {									\
		__enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name);			\
		vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
						  __enabled, exiting);				\
	}												\
})

/*
 * More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls:
 * "feature" controls are opt-in (cleared when the feature is hidden from the
 * guest), "exiting" controls are opt-out (cleared when the feature is exposed).
 */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4814
/*
 * Compute the secondary processor-based VM-execution controls for vmcs01,
 * starting from KVM's base configuration and removing controls whose
 * underlying features are disabled, unsupported, hidden from the guest, or
 * managed dynamically elsewhere.
 */
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

	if (vmx_pt_mode_is_system())
		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
		/* Unrestricted guest mode requires EPT. */
		enable_unrestricted_guest = 0;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (kvm_pause_in_guest(vmx->vcpu.kvm))
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	/* x2APIC virtualization is enabled later, on APIC mode changes. */
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

	/*
	 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
	 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
	 */
	exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;

	/*
	 * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.
	 */
	exec_control &= ~SECONDARY_EXEC_DESC;

	/*
	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	 * (handle_vmptrld).  Shadow VMCS can NOT be enabled here because
	 * there is no current vmcs12 yet.
	 */
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	/*
	 * PML is enabled/disabled when dirty logging of memslots changes, but
	 * it needs to be set here when dirty logging is already active, e.g.
	 * if this vCPU was created after dirty logging was enabled.
	 */
	if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);

	/*
	 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
	 * feature is exposed to the guest.  This creates a virtualization hole
	 * if both are supported in hardware but only one is exposed to the
	 * guest, but letting the guest execute RDTSCP or RDPID when either one
	 * is advertised is preferable to emulating the advertised instruction
	 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
	 */
	if (cpu_has_vmx_rdtscp()) {
		bool rdpid_or_rdtscp_enabled =
			guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
			guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);

		vmx_adjust_secondary_exec_control(vmx, &exec_control,
						  SECONDARY_EXEC_ENABLE_RDTSCP,
						  rdpid_or_rdtscp_enabled, false);
	}

	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);

	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
				    ENABLE_USR_WAIT_PAUSE, false);

	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;

	if (!kvm_notify_vmexit_enabled(vcpu->kvm))
		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;

	return exec_control;
}
4902
vmx_get_pid_table_order(struct kvm * kvm)4903 static inline int vmx_get_pid_table_order(struct kvm *kvm)
4904 {
4905 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4906 }
4907
vmx_alloc_ipiv_pid_table(struct kvm * kvm)4908 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4909 {
4910 struct page *pages;
4911 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4912
4913 if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4914 return 0;
4915
4916 if (kvm_vmx->pid_table)
4917 return 0;
4918
4919 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
4920 vmx_get_pid_table_order(kvm));
4921 if (!pages)
4922 return -ENOMEM;
4923
4924 kvm_vmx->pid_table = (void *)page_address(pages);
4925 return 0;
4926 }
4927
/* The IPIv PID-pointer table must exist before any vCPU is created. */
int vmx_vcpu_precreate(struct kvm *kvm)
{
	int r = vmx_alloc_ipiv_pid_table(kvm);

	return r;
}
4932
/* KVM doesn't intercept XSAVES/XRSTORS for any XSS component. */
#define VMX_XSS_EXIT_BITMAP 0
4934
/*
 * One-time initialization of vmcs01's constant and initial state: execution
 * controls, host state, MSR load/store areas, and the guest fields that are
 * not (re)written by vmx_vcpu_reset().  Called once per vCPU from
 * __vmx_vcpu_reset(); the vmcs01 must be loaded on this CPU.
 */
static void init_vmcs(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

	if (nested)
		nested_vmx_set_vmcs_shadowing_bitmap();

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */

	/* Control */
	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

	exec_controls_set(vmx, vmx_exec_control(vmx));

	if (cpu_has_secondary_exec_ctrls()) {
		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
		/* Point the CPU at the #VE info page, if one was allocated. */
		if (vmx->ve_info)
			vmcs_write64(VE_INFORMATION_ADDRESS,
				     __pa(vmx->ve_info));
	}

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));

	if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);

		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
	}

	if (vmx_can_use_ipiv(&vmx->vcpu)) {
		vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
		vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
	}

	if (!kvm_pause_in_guest(kvm)) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
	}

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
	vmx_set_constant_host_state(vmx);
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	/* MSR auto load/store areas; counts start at zero and grow on demand. */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val));
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

	vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());

	/* 22.2.1, 20.8.1 */
	vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());

	vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

	set_cr4_guest_host_mask(vmx);

	if (vmx->vpid != 0)
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

	if (cpu_has_vmx_xsaves())
		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
	}

	vmx_write_encls_bitmap(&vmx->vcpu, NULL);

	if (vmx_pt_mode_is_host_guest()) {
		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
		/* Bit[6~0] are forced to 1, writes are ignored. */
		vmx->pt_desc.guest.output_mask = 0x7F;
		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
	}

	vmcs_write32(GUEST_SYSENTER_CS, 0);
	vmcs_writel(GUEST_SYSENTER_ESP, 0);
	vmcs_writel(GUEST_SYSENTER_EIP, 0);

	vmx_guest_debugctl_write(&vmx->vcpu, 0);

	if (cpu_has_vmx_tpr_shadow()) {
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
		if (cpu_need_tpr_shadow(&vmx->vcpu))
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				     __pa(vmx->vcpu.arch.apic->regs));
		vmcs_write32(TPR_THRESHOLD, 0);
	}

	vmx_setup_uret_msrs(vmx);
}
5058
/*
 * One-time (non-INIT) portion of vCPU RESET: initialize vmcs01 and the
 * VMX/nested software state that is established exactly once per vCPU.
 */
static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	init_vmcs(vmx);

	/* Seed the nested VMX feature MSRs from KVM's computed config. */
	if (nested &&
	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
		memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));

	vcpu_setup_sgx_lepubkeyhash(vcpu);

	/* No nested state yet: no notification vector, no VMXON/current VMCS. */
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	vmx->nested.current_vmptr = INVALID_GPA;

#ifdef CONFIG_KVM_HYPERV
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
#endif

	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
		vcpu->arch.microcode_version = 0x100000000ULL;
	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

	/*
	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
	 * or POSTED_INTR_WAKEUP_VECTOR.
	 */
	vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&vmx->vt.pi_desc);
}
5090
/*
 * Emulate vCPU RESET (!init_event) or INIT (init_event) by loading the
 * architectural power-on values into the VMCS guest-state fields and the
 * associated VMX software state.
 */
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Full RESET also performs the one-time VMCS/nested initialization. */
	if (!init_event)
		__vmx_vcpu_reset(vcpu);

	vmx->rmode.vm86_active = 0;
	vmx->spec_ctrl = 0;

	vmx->msr_ia32_umwait_control = 0;

	vmx->hv_deadline_tsc = -1;
	kvm_set_cr8(vcpu, 0);

	/* CS starts as a 64KiB segment at the reset vector's base. */
	seg_setup(VCPU_SREG_CS);
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	/* Segment registers were rewritten behind the cache's back. */
	vmx_segment_cache_clear(vmx);
	kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);

	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
	if (kvm_mpx_supported())
		vmcs_write64(GUEST_BNDCFGS, 0);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

	/* CET state resets to zero when supported. */
	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
		vmcs_writel(GUEST_SSP, 0);
		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
	}
	if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
	    kvm_cpu_cap_has(X86_FEATURE_SHSTK))
		vmcs_writel(GUEST_S_CET, 0);

	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	vpid_sync_context(vmx->vpid);

	vmx_update_fb_clear_dis(vcpu, vmx);
}
5157
vmx_enable_irq_window(struct kvm_vcpu * vcpu)5158 void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
5159 {
5160 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5161 }
5162
vmx_enable_nmi_window(struct kvm_vcpu * vcpu)5163 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
5164 {
5165 if (!enable_vnmi ||
5166 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5167 vmx_enable_irq_window(vcpu);
5168 return;
5169 }
5170
5171 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5172 }
5173
/*
 * Inject the pending interrupt (vcpu->arch.interrupt) into the guest, either
 * via real-mode event emulation or by programming VM-entry event injection.
 */
void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;

	trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);

	++vcpu->stat.irq_injections;
	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		/* Soft interrupts resume after the INTn; skip its length. */
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
		return;
	}
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		/* Soft-INT injection requires the instruction length. */
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

	/* An injected event brings the vCPU out of the HLT activity state. */
	vmx_clear_hlt(vcpu);
}
5201
/*
 * Inject an NMI into the guest, tracking NMI-blocked state in software when
 * virtual NMIs are unsupported.
 */
void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		/*
		 * Tracking the NMI-blocked state in software is built upon
		 * finding the next open IRQ window. This, in turn, depends on
		 * well-behaving guests: They have to keep IRQs disabled at
		 * least as long as the NMI handler runs. Otherwise we may
		 * cause NMI nesting, maybe breaking the guest. But as this is
		 * highly unlikely, we can live with the residual risk.
		 */
		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
		vmx->loaded_vmcs->vnmi_blocked_time = 0;
	}

	++vcpu->stat.nmi_injections;
	/* Injecting the NMI invalidates any cached "unmasked" state. */
	vmx->loaded_vmcs->nmi_known_unmasked = false;

	if (vmx->rmode.vm86_active) {
		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
		return;
	}

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

	/* An injected event brings the vCPU out of the HLT activity state. */
	vmx_clear_hlt(vcpu);
}
5232
/*
 * Report whether NMIs are currently masked for the guest, caching a negative
 * (unmasked) result to avoid a VMREAD on the next query.
 */
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool masked;

	/* Without virtual NMIs, the mask is tracked purely in software. */
	if (!enable_vnmi)
		return vmx->loaded_vmcs->soft_vnmi_blocked;
	if (vmx->loaded_vmcs->nmi_known_unmasked)
		return false;
	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	return masked;
}
5246
/*
 * Set or clear the guest's NMI mask, via software tracking when virtual NMIs
 * are unsupported, else via the VMCS interruptibility-state field.
 */
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		/* Restart the blocked-time accounting on any state change. */
		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
			vmx->loaded_vmcs->vnmi_blocked_time = 0;
		}
		return;
	}

	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	if (masked)
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
			      GUEST_INTR_STATE_NMI);
	else
		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
				GUEST_INTR_STATE_NMI);
}
5266
vmx_nmi_blocked(struct kvm_vcpu * vcpu)5267 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5268 {
5269 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5270 return false;
5271
5272 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5273 return true;
5274
5275 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5276 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5277 GUEST_INTR_STATE_NMI));
5278 }
5279
/*
 * Can an NMI be injected right now?  Returns a boolean, or -EBUSY when the
 * decision must wait for a pending nested VM-Enter/VM-Exit to complete.
 */
int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->nested.nested_run_pending)
		return -EBUSY;

	/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
		return -EBUSY;

	return !vmx_nmi_blocked(vcpu);
}
5291
__vmx_interrupt_blocked(struct kvm_vcpu * vcpu)5292 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5293 {
5294 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5295 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5296 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5297 }
5298
vmx_interrupt_blocked(struct kvm_vcpu * vcpu)5299 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5300 {
5301 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5302 return false;
5303
5304 return __vmx_interrupt_blocked(vcpu);
5305 }
5306
/*
 * Can an IRQ be injected right now?  Returns a boolean, or -EBUSY when the
 * decision must wait for a pending nested VM-Enter/VM-Exit to complete.
 */
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->nested.nested_run_pending)
		return -EBUSY;

	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
		return -EBUSY;

	return !vmx_interrupt_blocked(vcpu);
}
5321
/*
 * Set the guest-physical address of the real-mode TSS used to emulate
 * real mode when unrestricted guest is unavailable; installs a 3-page
 * private memslot at @addr and initializes the TSS contents.
 */
int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	void __user *ret;

	/* With unrestricted guest there's no vm86 emulation, no TSS needed. */
	if (enable_unrestricted_guest)
		return 0;

	mutex_lock(&kvm->slots_lock);
	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	mutex_unlock(&kvm->slots_lock);

	/* __x86_set_memory_region() returns an ERR_PTR-encoded failure. */
	if (IS_ERR(ret))
		return PTR_ERR(ret);

	to_kvm_vmx(kvm)->tss_addr = addr;

	return init_rmode_tss(kvm, ret);
}
5341
/* Record the guest-physical address of the EPT identity-map page table. */
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

	kvm_vmx->ept_identity_map_addr = ident_addr;
	return 0;
}
5347
/*
 * Return true if exception vector @vec should be forwarded to the guest's
 * real-mode handler rather than reported to (or owned by) userspace debug.
 */
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
	switch (vec) {
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject the exception
		 * from user space while in guest debugging mode.
		 */
		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		/* Userspace software breakpoints take priority over the guest. */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		fallthrough;
	case DB_VECTOR:
		/* #DB belongs to the guest unless userspace is debugging. */
		return !(vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
	case DE_VECTOR:
	case OF_VECTOR:
	case BR_VECTOR:
	case UD_VECTOR:
	case DF_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
	case MF_VECTOR:
		return true;
	}
	return false;
}
5376
/*
 * Handle an exception that occurred while emulating real mode via vm86.
 * Returns 1 when handled in-kernel, 0 when emulation failed and the exit
 * must be propagated to userspace.
 */
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * Instruction with address size override prefix opcode 0x67
	 * Cause the #SS fault with 0 error code in VM86 mode.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
		if (kvm_emulate_instruction(vcpu, 0)) {
			/* Emulation may have executed a HLT; honor it here. */
			if (vcpu->arch.halt_request) {
				vcpu->arch.halt_request = 0;
				return kvm_emulate_halt_noskip(vcpu);
			}
			return 1;
		}
		return 0;
	}

	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
}
5403
/*
 * #MC exits are fully processed on the irqoff/entry paths; nothing is left
 * to do here but resume the guest.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	/* handled by vmx_vcpu_run() */
	return 1;
}
5409
5410 /*
5411 * If the host has split lock detection disabled, then #AC is
5412 * unconditionally injected into the guest, which is the pre split lock
5413 * detection behaviour.
5414 *
5415 * If the host has split lock detection enabled then #AC is
5416 * only injected into the guest when:
5417 * - Guest CPL == 3 (user mode)
5418 * - Guest has #AC detection enabled in CR0
5419 * - Guest EFLAGS has AC bit set
5420 */
vmx_guest_inject_ac(struct kvm_vcpu * vcpu)5421 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5422 {
5423 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5424 return true;
5425
5426 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
5427 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5428 }
5429
is_xfd_nm_fault(struct kvm_vcpu * vcpu)5430 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
5431 {
5432 return vcpu->arch.guest_fpu.fpstate->xfd &&
5433 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
5434 }
5435
/*
 * Handle an intercepted guest #PF.  With EPT, #PF is intercepted only for
 * async-PF bookkeeping, SGX EPCM quirks, or the allow_smaller_maxphyaddr
 * illegal-GPA case; otherwise the fault goes to the shadow-MMU page-fault
 * path.  Returns 1 when handled, or the kvm_handle_page_fault() result.
 */
static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code)
{
	/* For #PF, the exit qualification holds the faulting address (CR2). */
	unsigned long cr2 = vmx_get_exit_qual(vcpu);

	/* Async page faults always go through the generic #PF path. */
	if (vcpu->arch.apf.host_apf_flags)
		goto handle_pf;

	/* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */
	WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr);

	/*
	 * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX
	 * flag set in the error code (SGX1 hardware generates #GP(0)).  EPCM
	 * violations have nothing to do with shadow paging and can never be
	 * resolved by KVM; always reflect them into the guest.
	 */
	if (error_code & PFERR_SGX_MASK) {
		WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) ||
			     !cpu_feature_enabled(X86_FEATURE_SGX2));

		/* Guests without SGX2 expect #GP(0), not a #PF with PFEC.SGX. */
		if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
		else
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	/*
	 * If EPT is enabled, fixup and inject the #PF.  KVM intercepts #PFs
	 * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due
	 * to the GPA being legal with respect to host.MAXPHYADDR).
	 */
	if (enable_ept) {
		kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
		return 1;
	}

handle_pf:
	return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
5476
/*
 * Exit handler for EXIT_REASON_EXCEPTION_NMI: dispatch an intercepted guest
 * exception (or NMI) to the appropriate in-kernel handler, or build a
 * KVM_EXIT_DEBUG/KVM_EXIT_EXCEPTION userspace exit.  Returns 1 to resume the
 * guest, 0 to exit to userspace.
 */
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_run *kvm_run = vcpu->run;
	u32 intr_info, ex_no, error_code;
	unsigned long dr6;
	u32 vect_info;

	vect_info = vmx->idt_vectoring_info;
	intr_info = vmx_get_intr_info(vcpu);

	/*
	 * Machine checks are handled by handle_exception_irqoff(), or by
	 * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
	 * vmx_vcpu_enter_exit().
	 */
	if (is_machine_check(intr_info) || is_nmi(intr_info))
		return 1;

	/*
	 * Queue the exception here instead of in handle_nm_fault_irqoff().
	 * This ensures the nested_vmx check is not skipped so vmexit can
	 * be reflected to L1 (when it intercepts #NM) before reaching this
	 * point.
	 */
	if (is_nm_fault(intr_info)) {
		kvm_queue_exception_p(vcpu, NM_VECTOR,
			is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
		return 1;
	}

	if (is_invalid_opcode(intr_info))
		return handle_ud(vcpu);

	/* #VE should be reflected by hardware; reaching KVM is a KVM bug. */
	if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
		struct vmx_ve_information *ve_info = vmx->ve_info;

		WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
			  "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
		dump_vmcs(vcpu);
		kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
		return 1;
	}

	error_code = 0;
	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

	/* #GP is intercepted only for the VMware backdoor emulation. */
	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
		WARN_ON_ONCE(!enable_vmware_backdoor);

		/*
		 * VMware backdoor emulation on #GP interception only handles
		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
		 * error code on #GP.
		 */
		if (error_code) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
			return 1;
		}
		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
	}

	/*
	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
	 * MMIO, it is better to report an internal error.
	 * See the comments in vmx_handle_exit.
	 */
	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
		vcpu->run->internal.ndata = 4;
		vcpu->run->internal.data[0] = vect_info;
		vcpu->run->internal.data[1] = intr_info;
		vcpu->run->internal.data[2] = error_code;
		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (is_page_fault(intr_info))
		return vmx_handle_page_fault(vcpu, error_code);

	ex_no = intr_info & INTR_INFO_VECTOR_MASK;

	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
		return handle_rmode_exception(vcpu, ex_no, error_code);

	switch (ex_no) {
	case DB_VECTOR:
		dr6 = vmx_get_exit_qual(vcpu);
		if (!(vcpu->guest_debug &
		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
			/*
			 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
			 * instruction.  ICEBP generates a trap-like #DB, but
			 * despite its interception control being tied to #DB,
			 * is an instruction intercept, i.e. the VM-Exit occurs
			 * on the ICEBP itself.  Use the inner "skip" helper to
			 * avoid single-step #DB and MTF updates, as ICEBP is
			 * higher priority.  Note, skipping ICEBP still clears
			 * STI and MOVSS blocking.
			 *
			 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
			 * if single-step is enabled in RFLAGS and STI or MOVSS
			 * blocking is active, as the CPU doesn't set the bit
			 * on VM-Exit due to #DB interception.  VM-Entry has a
			 * consistency check that a single-step #DB is pending
			 * in this scenario as the previous instruction cannot
			 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
			 * don't modify RFLAGS), therefore the one instruction
			 * delay when activating single-step breakpoints must
			 * have already expired.  Note, the CPU sets/clears BS
			 * as appropriate for all other VM-Exits types.
			 */
			if (is_icebp(intr_info))
				WARN_ON(!skip_emulated_instruction(vcpu));
			else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
				 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
				  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
				vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
					    vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);

			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
			return 1;
		}
		kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
		fallthrough;
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject #BP from
		 * user space while in guest debugging mode. Reading it for
		 * #DB as well causes no harm, it is not used in that case.
		 */
		vmx->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
		kvm_run->debug.arch.exception = ex_no;
		break;
	case AC_VECTOR:
		if (vmx_guest_inject_ac(vcpu)) {
			kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
			return 1;
		}

		/*
		 * Handle split lock. Depending on detection mode this will
		 * either warn and disable split lock detection for this
		 * task or force SIGBUS on it.
		 */
		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
			return 1;
		fallthrough;
	default:
		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
		kvm_run->ex.exception = ex_no;
		kvm_run->ex.error_code = error_code;
		break;
	}
	return 0;
}
5640
handle_external_interrupt(struct kvm_vcpu * vcpu)5641 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5642 {
5643 ++vcpu->stat.irq_exits;
5644 return 1;
5645 }
5646
handle_triple_fault(struct kvm_vcpu * vcpu)5647 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5648 {
5649 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5650 vcpu->mmio_needed = 0;
5651 return 0;
5652 }
5653
handle_io(struct kvm_vcpu * vcpu)5654 static int handle_io(struct kvm_vcpu *vcpu)
5655 {
5656 unsigned long exit_qualification;
5657 int size, in, string;
5658 unsigned port;
5659
5660 exit_qualification = vmx_get_exit_qual(vcpu);
5661 string = (exit_qualification & 16) != 0;
5662
5663 ++vcpu->stat.io_exits;
5664
5665 if (string)
5666 return kvm_emulate_instruction(vcpu, 0);
5667
5668 port = exit_qualification >> 16;
5669 size = (exit_qualification & 7) + 1;
5670 in = (exit_qualification & 8) != 0;
5671
5672 return kvm_fast_pio(vcpu, size, port, in);
5673 }
5674
/* Emit the VMCALL opcode bytes (0f 01 c1) at the hypercall patch site. */
void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	static const unsigned char vmcall[] = { 0x0f, 0x01, 0xc1 };
	int i;

	for (i = 0; i < 3; i++)
		hypercall[i] = vmcall[i];
}
5684
5685 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
handle_set_cr0(struct kvm_vcpu * vcpu,unsigned long val)5686 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5687 {
5688 if (is_guest_mode(vcpu)) {
5689 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5690 unsigned long orig_val = val;
5691
5692 /*
5693 * We get here when L2 changed cr0 in a way that did not change
5694 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5695 * but did change L0 shadowed bits. So we first calculate the
5696 * effective cr0 value that L1 would like to write into the
5697 * hardware. It consists of the L2-owned bits from the new
5698 * value combined with the L1-owned bits from L1's guest_cr0.
5699 */
5700 val = (val & ~vmcs12->cr0_guest_host_mask) |
5701 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5702
5703 if (kvm_set_cr0(vcpu, val))
5704 return 1;
5705 vmcs_writel(CR0_READ_SHADOW, orig_val);
5706 return 0;
5707 } else {
5708 return kvm_set_cr0(vcpu, val);
5709 }
5710 }
5711
handle_set_cr4(struct kvm_vcpu * vcpu,unsigned long val)5712 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5713 {
5714 if (is_guest_mode(vcpu)) {
5715 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5716 unsigned long orig_val = val;
5717
5718 /* analogously to handle_set_cr0 */
5719 val = (val & ~vmcs12->cr4_guest_host_mask) |
5720 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5721 if (kvm_set_cr4(vcpu, val))
5722 return 1;
5723 vmcs_writel(CR4_READ_SHADOW, orig_val);
5724 return 0;
5725 } else
5726 return kvm_set_cr4(vcpu, val);
5727 }
5728
handle_desc(struct kvm_vcpu * vcpu)5729 static int handle_desc(struct kvm_vcpu *vcpu)
5730 {
5731 /*
5732 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
5733 * and other code needs to be updated if UMIP can be guest owned.
5734 */
5735 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
5736
5737 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
5738 return kvm_emulate_instruction(vcpu, 0);
5739 }
5740
/*
 * Handle a control-register access VM-Exit: MOV to/from CR0/CR3/CR4/CR8,
 * CLTS, and LMSW.  Returns 1 to resume the guest, 0 to exit to userspace,
 * or a negative error on a KVM-internal bug.
 */
static int handle_cr(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification, val;
	int cr;
	int reg;
	int err;
	int ret;

	exit_qualification = vmx_get_exit_qual(vcpu);
	/* Qualification layout: bits 3:0 = CR, 5:4 = access type, 11:8 = GPR. */
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		val = kvm_register_read(vcpu, reg);
		trace_kvm_cr_write(cr, val);
		switch (cr) {
		case 0:
			err = handle_set_cr0(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 3:
			/* CR3 exits shouldn't happen with unrestricted guest. */
			WARN_ON_ONCE(enable_unrestricted_guest);

			err = kvm_set_cr3(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 4:
			err = handle_set_cr4(vcpu, val);
			return kvm_complete_insn_gp(vcpu, err);
		case 8: {
			u8 cr8_prev = kvm_get_cr8(vcpu);
			u8 cr8 = (u8)val;
			err = kvm_set_cr8(vcpu, cr8);
			ret = kvm_complete_insn_gp(vcpu, err);
			if (lapic_in_kernel(vcpu))
				return ret;
			/* TPR unchanged or raised: no userspace action needed. */
			if (cr8_prev <= cr8)
				return ret;
			/*
			 * TODO: we might be squashing a
			 * KVM_GUESTDBG_SINGLESTEP-triggered
			 * KVM_EXIT_DEBUG here.
			 */
			vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
			return 0;
		}
		}
		break;
	case 2: /* clts */
		KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
		return -EIO;
	case 1: /*mov from cr*/
		switch (cr) {
		case 3:
			WARN_ON_ONCE(enable_unrestricted_guest);

			val = kvm_read_cr3(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
			return kvm_skip_emulated_instruction(vcpu);
		case 8:
			val = kvm_get_cr8(vcpu);
			kvm_register_write(vcpu, reg, val);
			trace_kvm_cr_read(cr, val);
			return kvm_skip_emulated_instruction(vcpu);
		}
		break;
	case 3: /* lmsw */
		/* LMSW writes only the low four bits of CR0. */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
		kvm_lmsw(vcpu, val);

		return kvm_skip_emulated_instruction(vcpu);
	default:
		break;
	}
	vcpu->run->exit_reason = 0;
	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
		    (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}
5820
/*
 * Handle a debug-register access VM-Exit.  err defaults to 1 so that a
 * CPL > 0 access completes via kvm_complete_insn_gp() with a #GP.
 */
static int handle_dr(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	int dr, dr7, reg;
	int err = 1;

	exit_qualification = vmx_get_exit_qual(vcpu);
	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

	/* First, if DR does not exist, trigger UD */
	if (!kvm_require_dr(vcpu, dr))
		return 1;

	/* DR accesses are privileged; inject #GP for CPL > 0. */
	if (vmx_get_cpl(vcpu) > 0)
		goto out;

	dr7 = vmcs_readl(GUEST_DR7);
	if (dr7 & DR7_GD) {
		/*
		 * As the vm-exit takes precedence over the debug trap, we
		 * need to emulate the latter, either for the host or the
		 * guest debugging itself.
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
			/* Host is debugging: report the #DB to userspace. */
			vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
			vcpu->run->debug.arch.dr7 = dr7;
			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
			vcpu->run->debug.arch.exception = DB_VECTOR;
			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
			return 0;
		} else {
			/* Guest is debugging itself: inject the #DB. */
			kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
			return 1;
		}
	}

	if (vcpu->guest_debug == 0) {
		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction.  The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
	if (exit_qualification & TYPE_MOV_FROM_DR) {
		kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
		err = 0;
	} else {
		err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
	}

out:
	return kvm_complete_insn_gp(vcpu, err);
}
5880
/*
 * Snapshot the hardware debug registers into vcpu state and re-enable
 * MOV-DR exiting (undoing the fast path set up by handle_dr()).
 */
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	get_debugreg(vcpu->arch.dr6, 6);
	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

	/*
	 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
	 * a stale dr6 from the guest.
	 */
	set_debugreg(DR6_RESERVED, 6);
}
5899
/* Propagate a new DR7 value to the guest's VMCS field. */
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
	vmcs_writel(GUEST_DR7, val);
}
5904
/* TPR dropped below the threshold: recompute the PPR and resume. */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
	kvm_apic_update_ppr(vcpu);
	return 1;
}
5910
handle_interrupt_window(struct kvm_vcpu * vcpu)5911 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5912 {
5913 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5914
5915 kvm_make_request(KVM_REQ_EVENT, vcpu);
5916
5917 ++vcpu->stat.irq_window_exits;
5918 return 1;
5919 }
5920
/* Emulate INVLPG: the exit qualification holds the linear address. */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
	kvm_mmu_invlpg(vcpu, vmx_get_exit_qual(vcpu));
	return kvm_skip_emulated_instruction(vcpu);
}
5928
handle_apic_access(struct kvm_vcpu * vcpu)5929 static int handle_apic_access(struct kvm_vcpu *vcpu)
5930 {
5931 if (likely(fasteoi)) {
5932 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5933 int access_type, offset;
5934
5935 access_type = exit_qualification & APIC_ACCESS_TYPE;
5936 offset = exit_qualification & APIC_ACCESS_OFFSET;
5937 /*
5938 * Sane guest uses MOV to write EOI, with written value
5939 * not cared. So make a short-circuit here by avoiding
5940 * heavy instruction emulation.
5941 */
5942 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5943 (offset == APIC_EOI)) {
5944 kvm_lapic_set_eoi(vcpu);
5945 return kvm_skip_emulated_instruction(vcpu);
5946 }
5947 }
5948 return kvm_emulate_instruction(vcpu, 0);
5949 }
5950
/* EOI-induced VM exit is trap-like and thus no need to adjust IP. */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	int vector = vmx_get_exit_qual(vcpu) & 0xff;

	kvm_apic_set_eoi_accelerated(vcpu, vector);
	return 1;
}
5960
handle_apic_write(struct kvm_vcpu * vcpu)5961 static int handle_apic_write(struct kvm_vcpu *vcpu)
5962 {
5963 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5964
5965 /*
5966 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5967 * hardware has done any necessary aliasing, offset adjustments, etc...
5968 * for the access. I.e. the correct value has already been written to
5969 * the vAPIC page for the correct 16-byte chunk. KVM needs only to
5970 * retrieve the register value and emulate the access.
5971 */
5972 u32 offset = exit_qualification & 0xff0;
5973
5974 kvm_apic_write_nodecode(vcpu, offset);
5975 return 1;
5976 }
5977
/*
 * Emulate a task-switch VM-Exit.  If the switch was triggered by event
 * delivery through a task gate, the pending event is dequeued first so it
 * isn't re-injected after the task switch is emulated.
 */
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	bool has_error_code = false;
	u32 error_code = 0;
	u16 tss_selector;
	int reason, type, idt_v, idt_index;

	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

	exit_qualification = vmx_get_exit_qual(vcpu);

	/* Bits 31:30 of the exit qualification encode the task-switch source. */
	reason = (u32)exit_qualification >> 30;
	if (reason == TASK_SWITCH_GATE && idt_v) {
		/* Un-queue the event that was being delivered through the gate. */
		switch (type) {
		case INTR_TYPE_NMI_INTR:
			vcpu->arch.nmi_injected = false;
			vmx_set_nmi_mask(vcpu, true);
			break;
		case INTR_TYPE_EXT_INTR:
		case INTR_TYPE_SOFT_INTR:
			kvm_clear_interrupt_queue(vcpu);
			break;
		case INTR_TYPE_HARD_EXCEPTION:
			if (vmx->idt_vectoring_info &
			    VECTORING_INFO_DELIVER_CODE_MASK) {
				has_error_code = true;
				error_code =
					vmcs_read32(IDT_VECTORING_ERROR_CODE);
			}
			fallthrough;
		case INTR_TYPE_SOFT_EXCEPTION:
			kvm_clear_exception_queue(vcpu);
			break;
		default:
			break;
		}
	}
	tss_selector = exit_qualification;

	/*
	 * Skip the instruction only when the switch wasn't caused by delivery
	 * of an exception/interrupt/NMI, i.e. when there is an instruction
	 * (CALL/JMP/IRET) to skip.
	 */
	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
		       type != INTR_TYPE_EXT_INTR &&
		       type != INTR_TYPE_NMI_INTR))
		WARN_ON(!skip_emulated_instruction(vcpu));

	/*
	 * TODO: What about debug traps on tss switch?
	 * Are we supposed to inject them and update dr6?
	 */
	return kvm_task_switch(vcpu, tss_selector,
			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
			       reason, has_error_code, error_code);
}
6034
/*
 * Handle an EPT-violation VM-Exit by routing the faulting GPA through KVM's
 * MMU (or the emulator for GPAs beyond the guest's physical address width).
 */
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	gpa_t gpa;

	/*
	 * EPT violation happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 * There are errata that may cause this bit to not be set:
	 * AAK134, BY25.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    enable_vnmi &&
	    (exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	trace_kvm_page_fault(vcpu, gpa, exit_qualification);

	/*
	 * Check that the GPA doesn't exceed physical memory limits, as that is
	 * a guest page fault.  We have to emulate the instruction here, because
	 * if the illegal address is that of a paging structure, then
	 * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
	 * would also use advanced VM-exit information for EPT violations to
	 * reconstruct the page fault error code.
	 */
	if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
		return kvm_emulate_instruction(vcpu, 0);

	return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
6067
/*
 * Handle an EPT-misconfiguration VM-Exit, which KVM uses to trap MMIO
 * accesses.  Tries the fast MMIO bus first, then falls back to the full
 * MMIO page-fault path.
 */
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
	gpa_t gpa;

	if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
		return 1;

	/*
	 * A nested guest cannot optimize MMIO vmexits, because we have an
	 * nGPA here instead of the required GPA.
	 */
	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	if (!is_guest_mode(vcpu) &&
	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return kvm_skip_emulated_instruction(vcpu);
	}

	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}
6088
handle_nmi_window(struct kvm_vcpu * vcpu)6089 static int handle_nmi_window(struct kvm_vcpu *vcpu)
6090 {
6091 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
6092 return -EIO;
6093
6094 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
6095 ++vcpu->stat.nmi_window_exits;
6096 kvm_make_request(KVM_REQ_EVENT, vcpu);
6097
6098 return 1;
6099 }
6100
/*
 * Returns true if emulation is required (due to the vCPU having invalid state
 * with unrestricted guest mode disabled) and KVM can't faithfully emulate the
 * current vCPU state.
 */
static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->vt.emulation_required)
		return false;

	/*
	 * It is architecturally impossible for emulation to be required when a
	 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
	 * guest state is invalid and unrestricted guest is disabled, i.e. KVM
	 * should synthesize VM-Fail instead of emulating L2 code.  This path is
	 * only reachable if userspace modifies L2 guest state after KVM has
	 * performed the nested VM-Enter consistency checks.
	 */
	if (vmx->nested.nested_run_pending)
		return true;

	/*
	 * KVM only supports emulating exceptions if the vCPU is in Real Mode.
	 * If emulation is required, KVM can't perform a successful VM-Enter to
	 * inject the exception.
	 */
	return !vmx->rmode.vm86_active &&
	       (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}
6132
/*
 * Emulate the guest instruction-by-instruction while its state is invalid
 * for direct VMX execution.  The loop is capped so that pending events and
 * work requests are serviced in a timely fashion.
 */
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool intr_window_requested;
	unsigned count = 130;	/* arbitrary bound on emulated instructions per call */

	intr_window_requested = exec_controls_get(vmx) &
				CPU_BASED_INTR_WINDOW_EXITING;

	while (vmx->vt.emulation_required && count-- != 0) {
		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
			return handle_interrupt_window(&vmx->vcpu);

		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
			return 1;

		/*
		 * Ensure that any updates to kvm->buses[] observed by the
		 * previous instruction (emulated or otherwise) are also
		 * visible to the instruction KVM is about to emulate.
		 */
		smp_rmb();

		if (!kvm_emulate_instruction(vcpu, 0))
			return 0;

		if (vmx_unhandleable_emulation_required(vcpu)) {
			kvm_prepare_emulation_failure_exit(vcpu);
			return 0;
		}

		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}

		/*
		 * Note, return 1 and not 0, vcpu_run() will invoke
		 * xfer_to_guest_mode() which will create a proper return
		 * code.
		 */
		if (__xfer_to_guest_mode_work_pending())
			return 1;
	}

	return 1;
}
6180
/*
 * Pre-run hook: bail out to userspace immediately if the vCPU is in a state
 * KVM can neither run nor emulate.
 */
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (!vmx_unhandleable_emulation_required(vcpu))
		return 1;

	kvm_prepare_emulation_failure_exit(vcpu);
	return 0;
}
6190
/*
 * Indicate a busy-waiting vcpu in spinlock.  We do not enable the PAUSE
 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		grow_ple_window(vcpu);

	/*
	 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
	 * never set PAUSE_EXITING and just set PLE if supported,
	 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
	 */
	kvm_vcpu_on_spin(vcpu, true);
	return kvm_skip_emulated_instruction(vcpu);
}
6209
/* Monitor-trap-flag exits require no action; simply resume the guest. */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}
6214
handle_invpcid(struct kvm_vcpu * vcpu)6215 static int handle_invpcid(struct kvm_vcpu *vcpu)
6216 {
6217 u32 vmx_instruction_info;
6218 unsigned long type;
6219 gva_t gva;
6220 struct {
6221 u64 pcid;
6222 u64 gla;
6223 } operand;
6224 int gpr_index;
6225
6226 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
6227 kvm_queue_exception(vcpu, UD_VECTOR);
6228 return 1;
6229 }
6230
6231 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6232 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6233 type = kvm_register_read(vcpu, gpr_index);
6234
6235 /* According to the Intel instruction reference, the memory operand
6236 * is read even if it isn't needed (e.g., for type==all)
6237 */
6238 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6239 vmx_instruction_info, false,
6240 sizeof(operand), &gva))
6241 return 1;
6242
6243 return kvm_handle_invpcid(vcpu, type, gva);
6244 }
6245
/*
 * Handle a PML-full VM-Exit.  The buffer itself was already flushed at the
 * start of VM-Exit processing; only virtual-NMI bookkeeping remains.
 */
static int handle_pml_full(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;

	trace_kvm_pml_full(vcpu->vcpu_id);

	exit_qualification = vmx_get_exit_qual(vcpu);

	/*
	 * PML buffer FULL happened while executing iret from NMI,
	 * "blocked by NMI" bit has to be set before next VM entry.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    enable_vnmi &&
	    (exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
			      GUEST_INTR_STATE_NMI);

	/*
	 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
	 * here.., and there's no userspace involvement needed for PML.
	 */
	return 1;
}
6270
/*
 * Fastpath handler for VMX-preemption-timer expiration.  Returns the
 * fastpath disposition; falls back to the slow path only for L2, where a
 * nested VM-Exit may need to be synthesized.
 */
static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
						   bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * In the *extremely* unlikely scenario that this is a spurious VM-Exit
	 * due to the timer expiring while it was "soft" disabled, just eat the
	 * exit and re-enter the guest.
	 */
	if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
		return EXIT_FASTPATH_REENTER_GUEST;

	/*
	 * If the timer expired because KVM used it to force an immediate exit,
	 * then mission accomplished.
	 */
	if (force_immediate_exit)
		return EXIT_FASTPATH_EXIT_HANDLED;

	/*
	 * If L2 is active, go down the slow path as emulating the guest timer
	 * expiration likely requires synthesizing a nested VM-Exit.
	 */
	if (is_guest_mode(vcpu))
		return EXIT_FASTPATH_NONE;

	kvm_lapic_expired_hv_timer(vcpu);
	return EXIT_FASTPATH_REENTER_GUEST;
}
6301
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	/*
	 * This non-fastpath handler is reached if and only if the preemption
	 * timer was being used to emulate a guest timer while L2 is active.
	 * All other scenarios are supposed to be handled in the fastpath.
	 */
	WARN_ON_ONCE(!is_guest_mode(vcpu));
	kvm_lapic_expired_hv_timer(vcpu);
	return 1;
}
6313
/*
 * When nested=0, all VMX instruction VM Exits filter here.  The handlers
 * are overwritten by nested_vmx_hardware_setup() when nested=1.
 */
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
	/* Without nested support, all VMX instructions are #UD for the guest. */
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}
6323
/* SEAMCALL/TDCALL exits (see the handler table) are #UD for a VMX guest. */
static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}
6329
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
	/*
	 * SGX virtualization is disabled.  There is no software enable bit for
	 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
	 * the guest from executing ENCLS (when SGX is supported by hardware).
	 */
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}
#endif /* CONFIG_X86_SGX_KVM */
6342
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
	/*
	 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
	 * VM-Exits.  Unconditionally set the flag here and leave the handling
	 * to vmx_handle_exit().
	 */
	to_vt(vcpu)->exit_reason.bus_lock_detected = true;
	return 1;
}
6353
handle_notify(struct kvm_vcpu * vcpu)6354 static int handle_notify(struct kvm_vcpu *vcpu)
6355 {
6356 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6357 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6358
6359 ++vcpu->stat.notify_window_exits;
6360
6361 /*
6362 * Notify VM exit happened while executing iret from NMI,
6363 * "blocked by NMI" bit has to be set before next VM entry.
6364 */
6365 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6366 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6367 GUEST_INTR_STATE_NMI);
6368
6369 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6370 context_invalid) {
6371 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6372 vcpu->run->notify.flags = context_invalid ?
6373 KVM_NOTIFY_CONTEXT_INVALID : 0;
6374 return 0;
6375 }
6376
6377 return 1;
6378 }
6379
vmx_get_msr_imm_reg(struct kvm_vcpu * vcpu)6380 static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
6381 {
6382 return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
6383 }
6384
/* RDMSR with immediate: the MSR index is carried in the exit qualification. */
static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
				     vmx_get_msr_imm_reg(vcpu));
}
6390
/* WRMSR with immediate: the MSR index is carried in the exit qualification. */
static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
				     vmx_get_msr_imm_reg(vcpu));
}
6396
/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.  The table is indexed by the basic
 * exit reason; reasons without an entry fall through to the unexpected-exit
 * path in the caller.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
	[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION] = handle_io,
	[EXIT_REASON_CR_ACCESS] = handle_cr,
	[EXIT_REASON_DR_ACCESS] = handle_dr,
	[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
	[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
	[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
	[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
	[EXIT_REASON_HLT] = kvm_emulate_halt,
	[EXIT_REASON_INVD] = kvm_emulate_invd,
	[EXIT_REASON_INVLPG] = handle_invlpg,
	[EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
	[EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
	[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
	[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
	[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
	[EXIT_REASON_VMPTRST] = handle_vmx_instruction,
	[EXIT_REASON_VMREAD] = handle_vmx_instruction,
	[EXIT_REASON_VMRESUME] = handle_vmx_instruction,
	[EXIT_REASON_VMWRITE] = handle_vmx_instruction,
	[EXIT_REASON_VMOFF] = handle_vmx_instruction,
	[EXIT_REASON_VMON] = handle_vmx_instruction,
	[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
	[EXIT_REASON_APIC_WRITE] = handle_apic_write,
	[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
	[EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
	[EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
	[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
	[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
	[EXIT_REASON_GDTR_IDTR] = handle_desc,
	[EXIT_REASON_LDTR_TR] = handle_desc,
	[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
	[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
	[EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
	[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
	[EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
	[EXIT_REASON_INVEPT] = handle_vmx_instruction,
	[EXIT_REASON_INVVPID] = handle_vmx_instruction,
	[EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
	[EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
	[EXIT_REASON_PML_FULL] = handle_pml_full,
	[EXIT_REASON_INVPCID] = handle_invpcid,
	[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
	[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
	[EXIT_REASON_ENCLS] = handle_encls,
	[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
	[EXIT_REASON_NOTIFY] = handle_notify,
	[EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
	[EXIT_REASON_TDCALL] = handle_tdx_instruction,
	[EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
	[EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};

static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);
6463
/*
 * Report the current exit reason, qualification, vectoring info, interrupt
 * info and error code through the supplied out-parameters.
 */
void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
		       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	*reason = vmx->vt.exit_reason.full;
	*info1 = vmx_get_exit_qual(vcpu);

	/* No vectoring/interrupt info is reported for a failed VM-Entry. */
	if (vmx->vt.exit_reason.failed_vmentry) {
		*info2 = 0;
		*intr_info = 0;
		*error_code = 0;
		return;
	}

	*info2 = vmx->idt_vectoring_info;
	*intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(*intr_info))
		*error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	else
		*error_code = 0;
}
6484
/* Report the VM-Entry interruption info and, if applicable, its error code. */
void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
{
	u32 info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);

	*intr_info = info;
	*error_code = is_exception_with_error_code(info) ?
		      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE) : 0;
}
6493
vmx_destroy_pml_buffer(struct vcpu_vmx * vmx)6494 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6495 {
6496 if (vmx->pml_pg) {
6497 __free_page(vmx->pml_pg);
6498 vmx->pml_pg = NULL;
6499 }
6500 }
6501
/*
 * Drain the Page-Modification-Log buffer into the dirty tracking machinery
 * and reset the hardware PML index.
 */
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u16 pml_idx, pml_tail_index;
	u64 *pml_buf;
	int i;

	pml_idx = vmcs_read16(GUEST_PML_INDEX);

	/* Do nothing if PML buffer is empty */
	if (pml_idx == PML_HEAD_INDEX)
		return;
	/*
	 * PML index always points to the next available PML buffer entity
	 * unless PML log has just overflowed.
	 */
	pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;

	/*
	 * PML log is written backwards: the CPU first writes the entry 511
	 * then the entry 510, and so on.
	 *
	 * Read the entries in the same order they were written, to ensure that
	 * the dirty ring is filled in the same order the CPU wrote them.
	 */
	pml_buf = page_address(vmx->pml_pg);

	for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
		u64 gpa;

		gpa = pml_buf[i];
		/* Hardware logs page-aligned GPAs; low bits must be clear. */
		WARN_ON(gpa & (PAGE_SIZE - 1));
		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
	}

	/* reset PML index */
	vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
6540
/* Mark every mapped vmcs12-related guest page as dirty for live migration. */
static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map);
}
6549
/*
 * Dump one guest segment register; 'sel' is the VMCS selector field, from
 * which the AR-bytes/limit/base fields are derived by fixed offsets.
 */
static void vmx_dump_sel(char *name, uint32_t sel)
{
	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read16(sel),
	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
}
6558
/*
 * Dump a guest descriptor-table register (GDTR/IDTR); 'limit' is the VMCS
 * limit field, from which the base field is derived by a fixed offset.
 */
static void vmx_dump_dtsel(char *name, uint32_t limit)
{
	pr_err("%s limit=0x%08x, base=0x%016lx\n",
	       name, vmcs_read32(limit),
	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}
6565
/* Dump an MSR autoload/autostore list (index/value pairs). */
static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
{
	unsigned int i;
	struct vmx_msr_entry *e;

	pr_err("MSR %s:\n", name);
	for (i = 0, e = m->val; i < m->nr; ++i, ++e)
		pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
}
6575
/*
 * Dump the loaded VMCS (guest state, host state and control fields) to the
 * kernel log, e.g. on a failed VM-Entry or an unexpected VM-Exit.  Gated by
 * the kvm_intel.dump_invalid_vmcs module param so that internal KVM/VMCS
 * state is only logged when explicitly requested.
 */
void dump_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmentry_ctl, vmexit_ctl;
	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
	u64 tertiary_exec_control;
	unsigned long cr4;
	int efer_slot;

	if (!dump_invalid_vmcs) {
		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
		return;
	}

	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	cr4 = vmcs_readl(GUEST_CR4);

	/* Secondary/tertiary controls are optional; treat "absent" as 0. */
	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	else
		secondary_exec_control = 0;

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
	else
		tertiary_exec_control = 0;

	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	if (cpu_has_vmx_ept()) {
		pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
	/*
	 * EFER can come from the VMCS (if VM-Entry loads it), from the MSR
	 * autoload list, or be reconstructed from KVM's cached value plus
	 * the "IA-32e mode guest" entry control.
	 */
	efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
	else if (efer_slot >= 0)
		pr_err("EFER= 0x%016llx (autoload)\n",
		       vmx->msr_autoload.guest.val[efer_slot].value);
	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer | (EFER_LMA | EFER_LME));
	else
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));
	if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
		vmx_dump_msrs("autostore", &vmx->msr_autostore);

	if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
		pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
		       vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
		       vmcs_readl(GUEST_INTR_SSP_TABLE));
	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
	if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
		pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
		       vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
		       vmcs_readl(HOST_INTR_SSP_TABLE));

	pr_err("*** Control State ***\n");
	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
	       cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
	pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
	       pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
	       vmcs_read32(EXCEPTION_BITMAP),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_EXIT_INTR_INFO),
	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
	pr_err("        reason=%08x qualification=%016lx\n",
	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
		pr_err("TSC Multiplier = 0x%016llx\n",
		       vmcs_read64(TSC_MULTIPLIER));
	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
			u16 status = vmcs_read16(GUEST_INTR_STATUS);
			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
		}
		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
	}
	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct vmx_ve_information *ve_info = vmx->ve_info;
		u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);

		/*
		 * If KVM is dumping the VMCS, then something has gone wrong
		 * already.  Dereferencing an address from the VMCS, which
		 * could very well be corrupted, is a terrible idea.  The
		 * virtual address is known so use it.
		 */
		pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
		       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
		       ve_info->exit_reason, ve_info->delivery,
		       ve_info->exit_qualification,
		       ve_info->guest_linear_address,
		       ve_info->guest_physical_address, ve_info->eptp_index);
	}
}
6776
6777 /*
6778 * The guest has exited. See if we can fix it or if we need userspace
6779 * assistance.
6780 */
/*
 * Returns 1 if the exit was handled and the vCPU may resume the guest,
 * 0 if KVM needs to exit to userspace (vcpu->run has been filled in),
 * and a negative errno on internal KVM errors (e.g. -EIO via KVM_BUG_ON).
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
	u32 vectoring_info = vmx->idt_vectoring_info;
	u16 exit_handler_index;

	/*
	 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
	 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
	 * querying dirty_bitmap, we only need to kick all vcpus out of guest
	 * mode as if vcpus is in root mode, the PML buffer must has been
	 * flushed already. Note, PML is never enabled in hardware while
	 * running L2.
	 */
	if (enable_pml && !is_guest_mode(vcpu))
		vmx_flush_pml_buffer(vcpu);

	/*
	 * KVM should never reach this point with a pending nested VM-Enter.
	 * More specifically, short-circuiting VM-Entry to emulate L2 due to
	 * invalid guest state should never happen as that means KVM knowingly
	 * allowed a nested VM-Enter with an invalid vmcs12. More below.
	 */
	if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
		return -EIO;

	if (is_guest_mode(vcpu)) {
		/*
		 * PML is never enabled when running L2, bail immediately if a
		 * PML full exit occurs as something is horribly wrong.
		 */
		if (exit_reason.basic == EXIT_REASON_PML_FULL)
			goto unexpected_vmexit;

		/*
		 * The host physical addresses of some pages of guest memory
		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
		 * Page). The CPU may write to these pages via their host
		 * physical address while L2 is running, bypassing any
		 * address-translation-based dirty tracking (e.g. EPT write
		 * protection).
		 *
		 * Mark them dirty on every exit from L2 to prevent them from
		 * getting out of sync with dirty tracking.
		 */
		nested_vmx_mark_all_vmcs12_pages_dirty(vcpu);

		/*
		 * Synthesize a triple fault if L2 state is invalid. In normal
		 * operation, nested VM-Enter rejects any attempt to enter L2
		 * with invalid state. However, those checks are skipped if
		 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
		 * L2 state is invalid, it means either L1 modified SMRAM state
		 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
		 * doing so is architecturally allowed in the RSM case, and is
		 * the least awful solution for the userspace case without
		 * risking false positives.
		 */
		if (vmx->vt.emulation_required) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
			return 1;
		}

		if (nested_vmx_reflect_vmexit(vcpu))
			return 1;
	}

	/* If guest state is invalid, start emulating.  L2 is handled above. */
	if (vmx->vt.emulation_required)
		return handle_invalid_guest_state(vcpu);

	/* VM-Entry itself failed; report the hardware reason to userspace. */
	if (exit_reason.failed_vmentry) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/* VMLAUNCH/VMRESUME failed; report the VM-instruction error. */
	if (unlikely(vmx->fail)) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * Exits that occur while delivering an event through the IDT are only
	 * expected for the reasons listed below; punt anything else to
	 * userspace as an unhandled event-vectoring exit.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
	     exit_reason.basic != EXIT_REASON_PML_FULL &&
	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
	     exit_reason.basic != EXIT_REASON_NOTIFY &&
	     exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
		kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (!vmx_interrupt_blocked(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU don't support us in finding the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled. So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	/* The fastpath already did all required work; resume the guest. */
	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
		goto unexpected_vmexit;
#ifdef CONFIG_MITIGATION_RETPOLINE
	/*
	 * Open code the hottest exit reasons to avoid the indirect call
	 * through the handler table, which is expensive with retpolines.
	 */
	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
		return kvm_emulate_wrmsr(vcpu);
	else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
		return handle_wrmsr_imm(vcpu);
	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
		return handle_preemption_timer(vcpu);
	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
		return handle_interrupt_window(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
		return handle_external_interrupt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_HLT)
		return kvm_emulate_halt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
		return handle_ept_misconfig(vcpu);
#endif

	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
						kvm_vmx_max_exit_handlers);
	if (!kvm_vmx_exit_handlers[exit_handler_index])
		goto unexpected_vmexit;

	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

unexpected_vmexit:
	dump_vmcs(vcpu);
	kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
	return 0;
}
6936
int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	int ret = __vmx_handle_exit(vcpu, exit_fastpath);

	if (!vmx_get_exit_reason(vcpu).bus_lock_detected)
		return ret;

	/*
	 * A bus lock was detected in the guest: flag it in the run struct
	 * and force an exit to user space (overriding a "resume guest"
	 * result) so user space is informed of the bus lock.
	 */
	if (ret > 0)
		vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

	vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
	return 0;
}
6954
vmx_update_cr8_intercept(struct kvm_vcpu * vcpu,int tpr,int irr)6955 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6956 {
6957 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6958 int tpr_threshold;
6959
6960 if (is_guest_mode(vcpu) &&
6961 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6962 return;
6963
6964 guard(vmx_vmcs01)(vcpu);
6965
6966 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6967 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6968 }
6969
/*
 * Reconfigure vmcs01's APIC virtualization controls (virtualize APIC
 * accesses vs. virtualize x2APIC mode) to track the guest's current local
 * APIC mode, and refresh the x2APIC MSR intercepts accordingly.
 */
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 sec_exec_control;

	if (!lapic_in_kernel(vcpu))
		return;

	/* Nothing to update if neither virtualization mode is supported. */
	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;

	guard(vmx_vmcs01)(vcpu);

	/* Clear both mode bits, then set the one matching the APIC mode. */
	sec_exec_control = secondary_exec_controls_get(vmx);
	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

	switch (kvm_get_apic_mode(vcpu)) {
	case LAPIC_MODE_INVALID:
		WARN_ONCE(true, "Invalid local APIC state");
		break;
	case LAPIC_MODE_DISABLED:
		break;
	case LAPIC_MODE_XAPIC:
		if (flexpriority_enabled) {
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

			/*
			 * Flush the TLB, reloading the APIC access page will
			 * only do so if its physical address has changed, but
			 * the guest may have inserted a non-APIC mapping into
			 * the TLB while the APIC access page was disabled.
			 *
			 * If L2 is active, immediately flush L1's TLB instead
			 * of requesting a flush of the current TLB, because
			 * the current TLB context is L2's.
			 */
			if (!is_guest_mode(vcpu))
				kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
			else if (!enable_ept)
				vpid_sync_context(vmx->vpid);
			else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa))
				vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa);
		}
		break;
	case LAPIC_MODE_X2APIC:
		if (cpu_has_vmx_virtualize_x2apic_mode())
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		break;
	}
	secondary_exec_controls_set(vmx, sec_exec_control);

	vmx_update_msr_bitmap_x2apic(vcpu);
}
7028
/*
 * Refresh APIC_ACCESS_ADDR in vmcs01 with the pfn currently backing the
 * APIC-access page memslot.  If the mapping is invalidated concurrently
 * (per the mmu_notifier sequence count), re-request a reload instead of
 * writing a potentially stale pfn.
 */
void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
	const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *slot;
	struct page *refcounted_page;
	unsigned long mmu_seq;
	kvm_pfn_t pfn;
	bool writable;

	/* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */
	guard(vmx_vmcs01)(vcpu);

	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
	      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

	/*
	 * Explicitly grab the memslot using KVM's internal slot ID to ensure
	 * KVM doesn't unintentionally grab a userspace memslot.  It _should_
	 * be impossible for userspace to create a memslot for the APIC when
	 * APICv is enabled, but paranoia won't hurt in this case.
	 */
	slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return;

	/*
	 * Ensure that the mmu_notifier sequence count is read before KVM
	 * retrieves the pfn from the primary MMU.  Note, the memslot is
	 * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
	 * in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	/*
	 * No need to retry if the memslot does not exist or is invalid.  KVM
	 * controls the APIC-access page memslot, and only deletes the memslot
	 * if APICv is permanently inhibited, i.e. the memslot won't reappear.
	 */
	pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return;

	read_lock(&vcpu->kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
	else
		vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));

	/*
	 * Do not pin the APIC access page in memory so that it can be freely
	 * migrated, the MMU notifier will call us again if it is migrated or
	 * swapped out.  KVM backs the memslot with anonymous memory, the pfn
	 * should always point at a refcounted page (if the pfn is valid).
	 */
	if (!WARN_ON_ONCE(!refcounted_page))
		kvm_release_page_clean(refcounted_page);

	/*
	 * No need for a manual TLB flush at this point, KVM has already done a
	 * flush if there were SPTEs pointing at the previous page.
	 */
	read_unlock(&vcpu->kvm->mmu_lock);
}
7096
vmx_hwapic_isr_update(struct kvm_vcpu * vcpu,int max_isr)7097 void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
7098 {
7099 u16 status;
7100 u8 old;
7101
7102 if (max_isr == -1)
7103 max_isr = 0;
7104
7105 /*
7106 * Always update SVI in vmcs01, as SVI is only relevant for L2 if and
7107 * only if Virtual Interrupt Delivery is enabled in vmcs12, and if VID
7108 * is enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC.
7109 */
7110 guard(vmx_vmcs01)(vcpu);
7111
7112 status = vmcs_read16(GUEST_INTR_STATUS);
7113 old = status >> 8;
7114 if (max_isr != old) {
7115 status &= 0xff;
7116 status |= max_isr << 8;
7117 vmcs_write16(GUEST_INTR_STATUS, status);
7118 }
7119 }
7120
vmx_set_rvi(int vector)7121 static void vmx_set_rvi(int vector)
7122 {
7123 u16 status;
7124 u8 old;
7125
7126 if (vector == -1)
7127 vector = 0;
7128
7129 status = vmcs_read16(GUEST_INTR_STATUS);
7130 old = (u8)status & 0xff;
7131 if ((u8)vector != old) {
7132 status &= ~0xff;
7133 status |= (u8)vector;
7134 vmcs_write16(GUEST_INTR_STATUS, status);
7135 }
7136 }
7137
/*
 * Sync pending posted interrupts from the PIR into the vAPIC's IRR and
 * return the highest pending interrupt vector (max IRR), or -EIO if APICv
 * is unexpectedly disabled.
 */
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	int max_irr;
	bool got_posted_interrupt;

	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
		return -EIO;

	if (pi_test_on(&vt->pi_desc)) {
		pi_clear_on(&vt->pi_desc);
		/*
		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
		got_posted_interrupt =
			kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
	} else {
		/* PID.ON clear: nothing posted, just find the current max IRR. */
		max_irr = kvm_lapic_find_highest_irr(vcpu);
		got_posted_interrupt = false;
	}

	/*
	 * Newly recognized interrupts are injected via either virtual interrupt
	 * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
	 * disabled in two cases:
	 *
	 * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
	 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
	 * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
	 * into L2, but KVM doesn't use virtual interrupt delivery to inject
	 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
	 *
	 * 2) If APICv is disabled for this vCPU, assigned devices may still
	 * attempt to post interrupts.  The posted interrupt vector will cause
	 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
	 */
	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
		vmx_set_rvi(max_irr);
	else if (got_posted_interrupt)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return max_irr;
}
7183
/* Load the 256-bit EOI-exit bitmap into the VMCS; no-op without APICv. */
void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (kvm_vcpu_apicv_active(vcpu)) {
		vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
		vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
		vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
		vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
	}
}
7194
7195 void vmx_do_interrupt_irqoff(unsigned long entry);
7196 void vmx_do_nmi_irqoff(void);
7197
handle_nm_fault_irqoff(struct kvm_vcpu * vcpu)7198 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
7199 {
7200 /*
7201 * Save xfd_err to guest_fpu before interrupt is enabled, so the
7202 * MSR value is not clobbered by the host activity before the guest
7203 * has chance to consume it.
7204 *
7205 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
7206 * interception may have been caused by L1 interception. Per the SDM,
7207 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
7208 *
7209 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
7210 * unlike CR2 and DR6, the value is not a payload that is attached to
7211 * the #NM exception.
7212 */
7213 if (is_xfd_nm_fault(vcpu))
7214 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
7215 }
7216
static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
	/* Async #PF flags must be read (and reset) before IRQs are enabled. */
	if (is_page_fault(intr_info)) {
		vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
		return;
	}

	/* #NM needs to snapshot XFD_ERR before IRQs are enabled. */
	if (is_nm_fault(intr_info)) {
		handle_nm_fault_irqoff(vcpu);
		return;
	}

	/* Machine checks are also handled before IRQs are enabled. */
	if (is_machine_check(intr_info))
		kvm_machine_check();
}
7229
/*
 * Invoke the host's IRQ handler for the external interrupt that triggered
 * the VM-Exit, with IRQs still disabled.
 */
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
					     u32 intr_info)
{
	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;

	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
		    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
		return;

	/*
	 * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
	 * when it's available even if FRED isn't fully enabled, e.g. even if
	 * FRED isn't supported in hardware, in order to avoid the indirect
	 * CALL in the non-FRED path.
	 */
	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
	if (IS_ENABLED(CONFIG_X86_FRED))
		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
	else
		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
	kvm_after_interrupt(vcpu);

	/* External interrupts are recognized on instruction boundaries. */
	vcpu->arch.at_instruction_boundary = true;
}
7254
vmx_handle_exit_irqoff(struct kvm_vcpu * vcpu)7255 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
7256 {
7257 if (to_vt(vcpu)->emulation_required)
7258 return;
7259
7260 switch (vmx_get_exit_reason(vcpu).basic) {
7261 case EXIT_REASON_EXTERNAL_INTERRUPT:
7262 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
7263 break;
7264 case EXIT_REASON_EXCEPTION_NMI:
7265 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
7266 break;
7267 case EXIT_REASON_MCE_DURING_VMENTRY:
7268 kvm_machine_check();
7269 break;
7270 default:
7271 break;
7272 }
7273 }
7274
7275 /*
7276 * The kvm parameter can be NULL (module initialization, or invocation before
7277 * VM creation). Be sure to check the kvm parameter before using it.
7278 */
vmx_has_emulated_msr(struct kvm * kvm,u32 index)7279 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
7280 {
7281 switch (index) {
7282 case MSR_IA32_SMBASE:
7283 if (!IS_ENABLED(CONFIG_KVM_SMM))
7284 return false;
7285 /*
7286 * We cannot do SMM unless we can run the guest in big
7287 * real mode.
7288 */
7289 return enable_unrestricted_guest || emulate_invalid_guest_state;
7290 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
7291 return nested;
7292 case MSR_AMD64_VIRT_SPEC_CTRL:
7293 case MSR_AMD64_TSC_RATIO:
7294 /* This is AMD only. */
7295 return false;
7296 default:
7297 return true;
7298 }
7299 }
7300
/*
 * Re-derive virtual-NMI blocking state after a VM-Exit, per the SDM rules
 * cited below, and restore the "blocking by NMI" interruptibility bit when
 * the exit interrupted a guest IRET.
 */
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;

	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	if (enable_vnmi) {
		/* Nothing to do if NMIs are already known to be unmasked. */
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;

		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
		 */
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		/* Without vNMI, track how long software NMI blocking lasted. */
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
}
7340
/*
 * Reconstruct KVM's software event queues from an event-information field
 * (IDT-vectoring info on VM-Exit, or VM-Entry interruption info when
 * cancelling an injection) so the event is re-delivered on the next
 * VM-Enter.  @instr_len_field / @error_code_field select the matching
 * instruction-length and error-code VMCS fields for the info source.
 */
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	/* Drop any stale queued events before requeueing from the VMCS. */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	/* An event needs re-injection; make sure event processing runs. */
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
		 */
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_HARD_EXCEPTION: {
		u32 error_code = 0;

		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
			error_code = vmcs_read32(error_code_field);

		kvm_requeue_exception(vcpu, vector,
				      idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
				      error_code);
		break;
	}
	case INTR_TYPE_SOFT_INTR:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}
7398
vmx_complete_interrupts(struct vcpu_vmx * vmx)7399 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7400 {
7401 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7402 VM_EXIT_INSTRUCTION_LEN,
7403 IDT_VECTORING_ERROR_CODE);
7404 }
7405
vmx_cancel_injection(struct kvm_vcpu * vcpu)7406 void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7407 {
7408 __vmx_complete_interrupts(vcpu,
7409 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7410 VM_ENTRY_INSTRUCTION_LEN,
7411 VM_ENTRY_EXCEPTION_ERROR_CODE);
7412
7413 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7414 }
7415
/*
 * Program the VM-Entry/VM-Exit MSR load lists for the perf MSRs reported by
 * perf_guest_get_msrs(), so guest and host values are atomically switched
 * by hardware.  Skipped when the vCPU has a mediated PMU.
 */
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;
	struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);

	if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu))
		return;

	pmu->host_cross_mapped_mask = 0;
	if (pmu->pebs_enable & pmu->global_ctrl)
		intel_pmu_cross_mapped_check(pmu);

	/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
	msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
	if (!msrs)
		return;

	/* Only atomically switch MSRs whose guest value differs from host. */
	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					      msrs[i].host);
}
7441
/*
 * Refresh KVM's cached guest IA32_PERF_GLOBAL_CTRL after VM-Exit.  Only
 * needed when the guest has unintercepted write access to the MSR, as the
 * guest may have modified the value while running.
 */
static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Writes intercepted => KVM's cached value is already up to date. */
	if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
		return;

	if (!cpu_has_save_perf_global_ctrl()) {
		/*
		 * Without hardware support for saving PERF_GLOBAL_CTRL on
		 * VM-Exit, the guest value is captured via the MSR auto-store
		 * list; propagate it to the VMCS guest field as well.
		 */
		int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore,
						       MSR_CORE_PERF_GLOBAL_CTRL);

		if (WARN_ON_ONCE(slot < 0))
			return;

		pmu->global_ctrl = vmx->msr_autostore.val[slot].value;
		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl);
		return;
	}

	pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
}
7464
/*
 * Program the VMX preemption timer for the upcoming VM-Enter.  With
 * @force_immediate_exit, arm the timer with zero so the CPU exits
 * immediately after entering the guest.
 */
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (force_immediate_exit) {
		/* A zero-valued timer yields an exit right after VM-Enter. */
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (vmx->hv_deadline_tsc != -1) {
		tscl = rdtsc();
		if (vmx->hv_deadline_tsc > tscl)
			/* set_hv_timer ensures the delta fits in 32-bits */
			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
					  cpu_preemption_timer_multi);
		else
			delta_tsc = 0;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
		/*
		 * No deadline: "soft disable" by programming the maximum
		 * value, and record that so the VMWRITE can be skipped on
		 * subsequent runs.
		 */
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
	}
}
7490
/*
 * Sync vmcs.HOST_RSP with the RSP that will be live at VM-Enter, skipping
 * the VMWRITE when the cached value already matches.  noinstr: runs in the
 * instrumentation-free VM-Enter path.
 */
void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
		vmx->loaded_vmcs->host_state.rsp = host_rsp;
		vmcs_writel(HOST_RSP, host_rsp);
	}
}
7498
/*
 * Restore the host's IA32_SPEC_CTRL after VM-Exit, optionally saving the
 * guest's value first (VMX_RUN_SAVE_SPEC_CTRL).  noinstr: runs in the
 * instrumentation-free window immediately after VM-Exit.
 */
void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
					unsigned int flags)
{
	u64 hostval = this_cpu_read(x86_spec_ctrl_current);

	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
		return;

	/* Capture the guest's SPEC_CTRL value if the caller requested it. */
	if (flags & VMX_RUN_SAVE_SPEC_CTRL)
		vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);

	/*
	 * If the guest/host SPEC_CTRL values differ, restore the host value.
	 *
	 * For legacy IBRS, the IBRS bit always needs to be written after
	 * transitioning from a less privileged predictor mode, regardless of
	 * whether the guest/host values differ.
	 */
	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
	    vmx->spec_ctrl != hostval)
		native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);

	barrier_nospec();
}
7523
/*
 * Attempt to handle the VM-Exit in the fastpath, i.e. without going through
 * the full exit handlers.  Returns EXIT_FASTPATH_NONE when the exit must
 * take the slow path.
 */
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
					     bool force_immediate_exit)
{
	/*
	 * If L2 is active, only some VMX preemption timer exits can be
	 * handled in the fastpath; all other exits must use the slow path.
	 */
	if (is_guest_mode(vcpu) &&
	    vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
		return EXIT_FASTPATH_NONE;

	switch (vmx_get_exit_reason(vcpu).basic) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_wrmsr(vcpu);
	case EXIT_REASON_MSR_WRITE_IMM:
		return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
						 vmx_get_msr_imm_reg(vcpu));
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
	case EXIT_REASON_HLT:
		return handle_fastpath_hlt(vcpu);
	case EXIT_REASON_INVD:
		return handle_fastpath_invd(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}
7551
/*
 * Invoke the kernel's NMI handler if this VM-Exit was caused by an NMI.
 * noinstr: runs before instrumentation is re-enabled after VM-Exit.
 */
noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
{
	/* Bail unless the exit is an exception/NMI exit carrying an NMI. */
	if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
	    !is_nmi(vmx_get_intr_info(vcpu)))
		return;

	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
	/* On FRED systems, enter the NMI handler via the FRED entry point. */
	if (cpu_feature_enabled(X86_FEATURE_FRED))
		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
	else
		vmx_do_nmi_irqoff();
	kvm_after_interrupt(vcpu);
}
7565
/*
 * The core VM-Enter/VM-Exit sequence.  Runs with IRQs disabled and
 * instrumentation off (noinstr), bracketed by guest_state_enter_irqoff()
 * and guest_state_exit_irqoff().
 */
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					unsigned int flags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	guest_state_enter_irqoff();

	/* L1D cache flush mitigation (L1TF). */
	vmx_l1d_flush(vcpu);

	vmx_disable_fb_clear(vmx);

	/* Load the guest's CR2, skipping the write if it already matches. */
	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				   flags);

	/* Capture the guest's CR2 and invalidate lazily-loaded registers. */
	vcpu->arch.cr2 = native_read_cr2();
	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;

	vmx->idt_vectoring_info = 0;

	vmx_enable_fb_clear(vmx);

	if (unlikely(vmx->fail)) {
		/* VM-Fail: synthesize a sentinel exit reason. */
		vmx->vt.exit_reason.full = 0xdead;
		goto out;
	}

	vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
	if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
		vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	/* Service an NMI-induced exit before IRQs are re-enabled. */
	vmx_handle_nmi(vcpu);

out:
	guest_state_exit_irqoff();
}
7604
/*
 * Main vCPU run entry: flush dirty state to the VMCS, enter the guest, and
 * process the immediate consequences of the VM-Exit.  Returns the fastpath
 * disposition for the exit (EXIT_FASTPATH_NONE means the caller must run
 * the full exit handlers).
 */
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/*
	 * Don't enter VMX if guest state is invalid, let the exit handler
	 * start emulation until we arrive back to a valid state.  Synthesize
	 * a consistency check VM-Exit due to invalid guest state and bail.
	 */
	if (unlikely(vmx->vt.emulation_required)) {
		vmx->fail = 0;

		vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
		vmx->vt.exit_reason.failed_vmentry = 1;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
		vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
		vmx->vt.exit_intr_info = 0;
		return EXIT_FASTPATH_NONE;
	}

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* Propagate a modified PLE window before entering the guest. */
	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
	 */
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

	/* Flush dirty RSP/RIP to the VMCS. */
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
	vcpu->arch.regs_dirty = 0;

	if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
		set_debugreg(vcpu->arch.dr6, 6);

	if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
		vmx_reload_guest_debugctl(vcpu);

	/*
	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
	 * it switches back to the current->mm, which can occur in KVM context
	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
	 * toggles a static key while handling a VM-Exit.
	 */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/*
	 * When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state.  Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case.
	 */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	pt_guest_enter(vmx);

	atomic_switch_perf_msrs(vmx);
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu, force_immediate_exit);
	else if (force_immediate_exit)
		/* No preemption timer: force the exit via a resched IPI. */
		smp_send_reschedule(vcpu->cpu);

	kvm_wait_lapic_expire(vcpu);

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));

	/* All fields are clean at this point */
	if (kvm_is_using_evmcs()) {
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
	}

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vcpu->arch.host_debugctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restore segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	pt_guest_exit(vmx);

	if (is_guest_mode(vcpu)) {
		/*
		 * Track VMLAUNCH/VMRESUME that have made past guest state
		 * checking.
		 */
		if (vmx->nested.nested_run_pending &&
		    !vmx_get_exit_reason(vcpu).failed_vmentry)
			++vcpu->stat.nested_run;

		vmx->nested.nested_run_pending = 0;
	}

	if (unlikely(vmx->fail))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
		return EXIT_FASTPATH_NONE;

	vmx->loaded_vmcs->launched = 1;

	vmx_refresh_guest_perf_global_control(vcpu);

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
7757
/*
 * Free all per-vCPU VMX resources allocated by vmx_vcpu_create().
 */
void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* The PML buffer is only allocated when PML is enabled. */
	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	/* Tear down nested state before freeing the loaded VMCS. */
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
	free_page((unsigned long)vmx->ve_info);
}
7769
/*
 * Allocate and initialize per-vCPU VMX state: VPID, PML buffer, vmcs01,
 * the #VE information page, and the IPI-virtualization PID-table entry.
 * On failure, previously-acquired resources are unwound in reverse order
 * via the goto labels.  Returns 0 on success, negative errno on failure.
 */
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vmx_uret_msr *tsx_ctrl;
	struct vcpu_vmx *vmx;
	int i, err;

	/* to_vmx() relies on 'vcpu' being the first field of vcpu_vmx. */
	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest), etc.
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	for (i = 0; i < kvm_nr_uret_msrs; ++i)
		vmx->guest_uret_msrs[i].mask = -1ull;
	if (boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
		 * Keep the host value unchanged to avoid changing CPUID bits
		 * under the host kernel's feet.
		 */
		tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (tsx_ctrl)
			tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	/*
	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
	 * feature only for vmcs01, KVM currently isn't equipped to realize any
	 * performance benefits from enabling it for vmcs02.
	 */
	if (kvm_is_using_evmcs() &&
	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		evmcs->hv_enlightenments_control.msr_bitmap = 1;
	}

	vmx->loaded_vmcs = &vmx->vmcs01;

	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = kvm_alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	err = -ENOMEM;
	if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct page *page;

		BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);

		/* ve_info must be page aligned. */
		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!page)
			goto free_vmcs;

		vmx->ve_info = page_to_virt(page);
	}

	if (vmx_can_use_ipiv(vcpu))
		WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
			   __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);

	return 0;

free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}
7869
7870 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7871 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7872
/*
 * Per-VM VMX initialization: disable PAUSE exiting when PLE is off, warn
 * about potential L1TF exposure, and size the PML dirty log.
 */
int vmx_vm_init(struct kvm *kvm)
{
	if (!ple_gap)
		kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);

	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
		case L1TF_MITIGATION_FLUSH_NOWARN:
			/* 'I explicitly don't care' is set */
			break;
		case L1TF_MITIGATION_AUTO:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
		case L1TF_MITIGATION_FULL:
			/*
			 * Warn upon starting the first VM in a potentially
			 * insecure environment.
			 */
			if (sched_smt_active())
				pr_warn_once(L1TF_MSG_SMT);
			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
				pr_warn_once(L1TF_MSG_L1D);
			break;
		case L1TF_MITIGATION_FULL_FORCE:
			/* Flush is enforced */
			break;
		}
	}

	if (enable_pml)
		kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
	return 0;
}
7907
vmx_ignore_guest_pat(struct kvm * kvm)7908 static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
7909 {
7910 /*
7911 * Non-coherent DMA devices need the guest to flush CPU properly.
7912 * In that case it is not possible to map all guest RAM as WB, so
7913 * always trust guest PAT.
7914 */
7915 return !kvm_arch_has_noncoherent_dma(kvm) &&
7916 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
7917 }
7918
vmx_get_mt_mask(struct kvm_vcpu * vcpu,gfn_t gfn,bool is_mmio)7919 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7920 {
7921 /*
7922 * Force UC for host MMIO regions, as allowing the guest to access MMIO
7923 * with cacheable accesses will result in Machine Checks.
7924 */
7925 if (is_mmio)
7926 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7927
7928 /* Force WB if ignoring guest PAT */
7929 if (vmx_ignore_guest_pat(vcpu->kvm))
7930 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7931
7932 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
7933 }
7934
/*
 * Update the secondary execution controls to @new_ctl while preserving the
 * dynamic bits.  Those bits are toggled at runtime; the rest are derived
 * from the hypervisor architecture and the guest's CPUID, so they must not
 * be clobbered here.
 */
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
	const u32 dynamic_bits = SECONDARY_EXEC_SHADOW_VMCS |
				 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				 SECONDARY_EXEC_DESC;
	u32 ctl = secondary_exec_controls_get(vmx);

	ctl &= dynamic_bits;
	ctl |= new_ctl & ~dynamic_bits;
	secondary_exec_controls_set(vmx, ctl);
}
7953
7954 /*
7955 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7956 * (indicating "allowed-1") if they are supported in the guest's CPUID.
7957 */
nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu * vcpu)7958 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7959 {
7960 struct vcpu_vmx *vmx = to_vmx(vcpu);
7961 struct kvm_cpuid_entry2 *entry;
7962
7963 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7964 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7965
7966 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7967 if (entry && (entry->_reg & (_cpuid_mask))) \
7968 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7969 } while (0)
7970
7971 entry = kvm_find_cpuid_entry(vcpu, 0x1);
7972 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7973 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7974 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7975 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7976 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7977 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7978 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7979 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7980 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7981 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7982 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7983 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7984 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7985 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
7986
7987 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7988 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7989 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7990 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7991 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7992 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7993 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
7994 cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
7995 cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
7996
7997 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
7998 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
7999
8000 #undef cr4_fixed1_update
8001 }
8002
/*
 * Cache the guest's Intel PT CPUID leaves (0x14) and derive the bitmask of
 * RTIT_CTL bits the guest is NOT allowed to set given those capabilities.
 */
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	/* Snapshot all PT CPUID sub-leaves; bail if any is missing. */
	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
		if (!best)
			return;
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
			RTIT_CTL_BRANCH_EN);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set, otherwise
	 * a #GP will be injected.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
						RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
							RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* unmask address range configure area */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
8071
/*
 * Refresh VMX-specific vCPU state after userspace has set the guest's
 * CPUID: user-return MSRs, secondary execution controls, nested CR-fixed
 * MSRs, Intel PT configuration, FEATURE_CONTROL valid bits, the CR4
 * guest/host mask, the ENCLS bitmap, and the exception bitmap.
 */
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * XSAVES is effectively enabled if and only if XSAVE is also exposed
	 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
	 * set if and only if XSAVE is supported.
	 */
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
		guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);

	vmx_setup_uret_msrs(vmx);

	if (cpu_has_secondary_exec_ctrls())
		vmcs_set_secondary_exec_control(vmx,
						vmx_secondary_exec_control(vmx));

	/* FEATURE_CONTROL's VMX bits are writable iff VMX is exposed. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		nested_vmx_cr_fixed1_bits_update(vcpu);

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
	    guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);

	if (boot_cpu_has(X86_FEATURE_RTM)) {
		struct vmx_uret_msr *msr;
		/* Disable RTM via TSX_CTRL when RTM isn't exposed. */
		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (msr) {
			bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
		}
	}

	set_cr4_guest_host_mask(vmx);

	vmx_write_encls_bitmap(vcpu, NULL);
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_SGX_LC_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_SGX_LC_ENABLED;

	/* Refresh #PF interception to account for MAXPHYADDR changes. */
	vmx_update_exception_bitmap(vcpu);
}
8133
/*
 * Compute the IA32_PERF_CAPABILITIES value KVM can expose to guests, based
 * on the host's capabilities and KVM's PMU configuration.  Returns 0 when
 * the PMU is disabled.
 */
static __init u64 vmx_get_perf_capabilities(void)
{
	u64 perf_cap = PERF_CAP_FW_WRITES;
	u64 host_perf_cap = 0;

	if (!enable_pmu)
		return 0;

	if (boot_cpu_has(X86_FEATURE_PDCM))
		rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);

	if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) &&
	    !enable_mediated_pmu) {
		x86_perf_get_lbr(&vmx_lbr_caps);

		/*
		 * KVM requires LBR callstack support, as the overhead due to
		 * context switching LBRs without said support is too high.
		 * See intel_pmu_create_guest_lbr_event() for more info.
		 */
		if (!vmx_lbr_caps.has_callstack)
			memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
		else if (vmx_lbr_caps.nr)
			perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
	}

	if (vmx_pebs_supported()) {
		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;

		/*
		 * Disallow adaptive PEBS as it is functionally broken, can be
		 * used by the guest to read *host* LBRs, and can be used to
		 * bypass userspace event filters.  To correctly and safely
		 * support adaptive PEBS, KVM needs to:
		 *
		 * 1. Account for the ADAPTIVE flag when (re)programming fixed
		 *    counters.
		 *
		 * 2. Gain support from perf (or take direct control of counter
		 *    programming) to support events without adaptive PEBS
		 *    enabled for the hardware counter.
		 *
		 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
		 *    adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
		 *
		 * 4. Document which PMU events are effectively exposed to the
		 *    guest via adaptive PEBS, and make adaptive PEBS mutually
		 *    exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
		 */
		perf_cap &= ~PERF_CAP_PEBS_BASELINE;
	}

	return perf_cap;
}
8188
/*
 * Adjust KVM's CPU capability caps for VMX: set/clear feature bits based
 * on hardware VMX capabilities and module parameters, then finalize.
 */
static __init void vmx_set_cpu_caps(void)
{
	kvm_initialize_cpu_caps();

	/* CPUID 0x1 */
	if (nested)
		kvm_cpu_cap_set(X86_FEATURE_VMX);

	/* CPUID 0x7 */
	if (kvm_mpx_supported())
		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
	if (!cpu_has_vmx_invpcid())
		kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
	if (vmx_pt_mode_is_host_guest())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
	if (vmx_pebs_supported()) {
		kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
		kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
	}

	if (!enable_pmu)
		kvm_cpu_cap_clear(X86_FEATURE_PDCM);
	kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();

	if (!enable_sgx) {
		kvm_cpu_cap_clear(X86_FEATURE_SGX);
		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
		kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
	}

	if (vmx_umip_emulated())
		kvm_cpu_cap_set(X86_FEATURE_UMIP);

	/* CPUID 0xD.1 */
	if (!cpu_has_vmx_xsaves())
		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

	/* CPUID 0x80000001 and 0x7 (RDPID) */
	if (!cpu_has_vmx_rdtscp()) {
		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
	}

	if (cpu_has_vmx_waitpkg())
		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);

	/*
	 * Disable CET if unrestricted_guest is unsupported as KVM doesn't
	 * enforce CET HW behaviors in emulator. On platforms with
	 * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
	 * fails, so disable CET in this case too.
	 */
	if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
	    !cpu_has_vmx_basic_no_hw_errcode_cc()) {
		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
		kvm_cpu_cap_clear(X86_FEATURE_IBT);
	}

	kvm_setup_xss_caps();
	kvm_finalize_cpu_caps();
}
8252
vmx_is_io_intercepted(struct kvm_vcpu * vcpu,struct x86_instruction_info * info,unsigned long * exit_qualification)8253 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
8254 struct x86_instruction_info *info,
8255 unsigned long *exit_qualification)
8256 {
8257 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8258 unsigned short port;
8259 int size;
8260 bool imm;
8261
8262 /*
8263 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
8264 * VM-exits depend on the 'unconditional IO exiting' VM-execution
8265 * control.
8266 *
8267 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
8268 */
8269 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
8270 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
8271
8272 if (info->intercept == x86_intercept_in ||
8273 info->intercept == x86_intercept_ins) {
8274 port = info->src_val;
8275 size = info->dst_bytes;
8276 imm = info->src_type == OP_IMM;
8277 } else {
8278 port = info->dst_val;
8279 size = info->src_bytes;
8280 imm = info->dst_type == OP_IMM;
8281 }
8282
8283
8284 *exit_qualification = ((unsigned long)port << 16) | (size - 1);
8285
8286 if (info->intercept == x86_intercept_ins ||
8287 info->intercept == x86_intercept_outs)
8288 *exit_qualification |= BIT(4);
8289
8290 if (info->rep_prefix)
8291 *exit_qualification |= BIT(5);
8292
8293 if (imm)
8294 *exit_qualification |= BIT(6);
8295
8296 return nested_vmx_check_io_bitmaps(vcpu, port, size);
8297 }
8298
/*
 * Check whether an instruction being emulated on behalf of L2 is
 * intercepted by L1, and synthesize the nested VM-Exit if so.  Returns
 * X86EMUL_CONTINUE (not intercepted), X86EMUL_INTERCEPTED (nested VM-Exit
 * triggered), X86EMUL_PROPAGATE_FAULT (exception filled in), or
 * X86EMUL_UNHANDLEABLE (intercept check not implemented / bad RIP delta).
 */
int vmx_check_intercept(struct kvm_vcpu *vcpu,
			struct x86_instruction_info *info,
			enum x86_intercept_stage stage,
			struct x86_exception *exception)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned long exit_qualification = 0;
	u32 vm_exit_reason;
	u64 exit_insn_len;

	switch (info->intercept) {
	case x86_intercept_rdpid:
		/*
		 * RDPID causes #UD if not enabled through secondary execution
		 * controls (ENABLE_RDTSCP).  Note, the implicit MSR access to
		 * TSC_AUX is NOT subject to interception, i.e. checking only
		 * the dedicated execution control is architecturally correct.
		 */
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
			exception->vector = UD_VECTOR;
			exception->error_code_valid = false;
			return X86EMUL_PROPAGATE_FAULT;
		}
		return X86EMUL_CONTINUE;

	case x86_intercept_in:
	case x86_intercept_ins:
	case x86_intercept_out:
	case x86_intercept_outs:
		if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
			return X86EMUL_CONTINUE;

		vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
		break;

	case x86_intercept_lgdt:
	case x86_intercept_lidt:
	case x86_intercept_lldt:
	case x86_intercept_ltr:
	case x86_intercept_sgdt:
	case x86_intercept_sidt:
	case x86_intercept_sldt:
	case x86_intercept_str:
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
			return X86EMUL_CONTINUE;

		/* LDTR/TR accesses use a different exit reason than GDTR/IDTR. */
		if (info->intercept == x86_intercept_lldt ||
		    info->intercept == x86_intercept_ltr ||
		    info->intercept == x86_intercept_sldt ||
		    info->intercept == x86_intercept_str)
			vm_exit_reason = EXIT_REASON_LDTR_TR;
		else
			vm_exit_reason = EXIT_REASON_GDTR_IDTR;
		/*
		 * FIXME: Decode the ModR/M to generate the correct exit
		 *        qualification for memory operands.
		 */
		break;

	case x86_intercept_hlt:
		if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
			return X86EMUL_CONTINUE;

		vm_exit_reason = EXIT_REASON_HLT;
		break;

	case x86_intercept_pause:
		/*
		 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
		 * with vanilla NOPs in the emulator.  Apply the interception
		 * check only to actual PAUSE instructions.  Don't check
		 * PAUSE-loop-exiting, software can't expect a given PAUSE to
		 * exit, i.e. KVM is within its rights to allow L2 to execute
		 * the PAUSE.
		 */
		if ((info->rep_prefix != REPE_PREFIX) ||
		    !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
			return X86EMUL_CONTINUE;

		vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
		break;

	/* TODO: check more intercepts... */
	default:
		return X86EMUL_UNHANDLEABLE;
	}

	/* Derive the instruction length from the emulator's RIP delta. */
	exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
	if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
		return X86EMUL_UNHANDLEABLE;

	__nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
			    exit_insn_len);
	return X86EMUL_INTERCEPTED;
}
8394
8395 #ifdef CONFIG_X86_64
/*
 * Compute (a << shift) / divisor without losing the bits shifted out of 'a',
 * i.e. using the full 128-bit intermediate value.  Returns 1 if the quotient
 * would overflow 64 bits (which would trigger #DE on divq), 0 on success with
 * the quotient stored in *result.
 *
 * NOTE(review): assumes 1 <= shift <= 63; "a >> (64 - shift)" is undefined
 * behavior for shift == 0.  The sole caller passes
 * kvm_caps.tsc_scaling_ratio_frac_bits — confirm it is always non-zero.
 */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	/* Split the 128-bit value (a << shift) into high:low halves. */
	u64 low = a << shift, high = a >> (64 - shift);

	/* To avoid the overflow on divq */
	if (high >= divisor)
		return 1;

	/* Low hold the result, high hold rem which is discarded */
	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
	    "rm" (divisor), "0" (low), "1" (high));
	*result = low;

	return 0;
}
8413
vmx_set_hv_timer(struct kvm_vcpu * vcpu,u64 guest_deadline_tsc,bool * expired)8414 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8415 bool *expired)
8416 {
8417 struct vcpu_vmx *vmx;
8418 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8419 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8420
8421 vmx = to_vmx(vcpu);
8422 tscl = rdtsc();
8423 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8424 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8425 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8426 ktimer->timer_advance_ns);
8427
8428 if (delta_tsc > lapic_timer_advance_cycles)
8429 delta_tsc -= lapic_timer_advance_cycles;
8430 else
8431 delta_tsc = 0;
8432
8433 /* Convert to host delta tsc if tsc scaling is enabled */
8434 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8435 delta_tsc && u64_shl_div_u64(delta_tsc,
8436 kvm_caps.tsc_scaling_ratio_frac_bits,
8437 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8438 return -ERANGE;
8439
8440 /*
8441 * If the delta tsc can't fit in the 32 bit after the multi shift,
8442 * we can't use the preemption timer.
8443 * It's possible that it fits on later vmentries, but checking
8444 * on every vmentry is costly so we just use an hrtimer.
8445 */
8446 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8447 return -ERANGE;
8448
8449 vmx->hv_deadline_tsc = tscl + delta_tsc;
8450 *expired = !delta_tsc;
8451 return 0;
8452 }
8453
vmx_cancel_hv_timer(struct kvm_vcpu * vcpu)8454 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8455 {
8456 to_vmx(vcpu)->hv_deadline_tsc = -1;
8457 }
8458 #endif
8459
vmx_update_cpu_dirty_logging(struct kvm_vcpu * vcpu)8460 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8461 {
8462 struct vcpu_vmx *vmx = to_vmx(vcpu);
8463
8464 if (WARN_ON_ONCE(!enable_pml))
8465 return;
8466
8467 guard(vmx_vmcs01)(vcpu);
8468
8469 /*
8470 * Note, nr_memslots_dirty_logging can be changed concurrent with this
8471 * code, but in that case another update request will be made and so
8472 * the guest will never run with a stale PML value.
8473 */
8474 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8475 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8476 else
8477 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8478 }
8479
vmx_setup_mce(struct kvm_vcpu * vcpu)8480 void vmx_setup_mce(struct kvm_vcpu *vcpu)
8481 {
8482 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8483 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8484 FEAT_CTL_LMCE_ENABLED;
8485 else
8486 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8487 ~FEAT_CTL_LMCE_ENABLED;
8488 }
8489
8490 #ifdef CONFIG_KVM_SMM
vmx_smi_allowed(struct kvm_vcpu * vcpu,bool for_injection)8491 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8492 {
8493 /* we need a nested vmexit to enter SMM, postpone if run is pending */
8494 if (to_vmx(vcpu)->nested.nested_run_pending)
8495 return -EBUSY;
8496 return !is_smm(vcpu);
8497 }
8498
vmx_enter_smm(struct kvm_vcpu * vcpu,union kvm_smram * smram)8499 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8500 {
8501 struct vcpu_vmx *vmx = to_vmx(vcpu);
8502
8503 /*
8504 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8505 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong
8506 * SMI and RSM only modify state that is saved and restored via SMRAM.
8507 * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8508 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8509 */
8510 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8511 if (vmx->nested.smm.guest_mode)
8512 nested_vmx_vmexit(vcpu, -1, 0, 0);
8513
8514 vmx->nested.smm.vmxon = vmx->nested.vmxon;
8515 vmx->nested.vmxon = false;
8516 vmx_clear_hlt(vcpu);
8517 return 0;
8518 }
8519
vmx_leave_smm(struct kvm_vcpu * vcpu,const union kvm_smram * smram)8520 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8521 {
8522 struct vcpu_vmx *vmx = to_vmx(vcpu);
8523 int ret;
8524
8525 if (vmx->nested.smm.vmxon) {
8526 vmx->nested.vmxon = true;
8527 vmx->nested.smm.vmxon = false;
8528 }
8529
8530 if (vmx->nested.smm.guest_mode) {
8531 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8532 if (ret)
8533 return ret;
8534
8535 vmx->nested.nested_run_pending = 1;
8536 vmx->nested.smm.guest_mode = false;
8537 }
8538 return 0;
8539 }
8540
vmx_enable_smi_window(struct kvm_vcpu * vcpu)8541 void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8542 {
8543 /* RSM will cause a vmexit anyway. */
8544 }
8545 #endif
8546
vmx_apic_init_signal_blocked(struct kvm_vcpu * vcpu)8547 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8548 {
8549 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8550 }
8551
vmx_migrate_timers(struct kvm_vcpu * vcpu)8552 void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8553 {
8554 if (is_guest_mode(vcpu)) {
8555 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8556
8557 if (hrtimer_try_to_cancel(timer) == 1)
8558 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8559 }
8560 }
8561
vmx_hardware_unsetup(void)8562 void vmx_hardware_unsetup(void)
8563 {
8564 kvm_set_posted_intr_wakeup_handler(NULL);
8565
8566 if (nested)
8567 nested_vmx_hardware_unsetup();
8568
8569 free_kvm_area();
8570 }
8571
vmx_vm_destroy(struct kvm * kvm)8572 void vmx_vm_destroy(struct kvm *kvm)
8573 {
8574 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8575
8576 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8577 }
8578
8579 /*
8580 * Note, the SDM states that the linear address is masked *after* the modified
8581 * canonicality check, whereas KVM masks (untags) the address and then performs
8582 * a "normal" canonicality check. Functionally, the two methods are identical,
8583 * and when the masking occurs relative to the canonicality check isn't visible
8584 * to software, i.e. KVM's behavior doesn't violate the SDM.
8585 */
vmx_get_untagged_addr(struct kvm_vcpu * vcpu,gva_t gva,unsigned int flags)8586 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
8587 {
8588 int lam_bit;
8589 unsigned long cr3_bits;
8590
8591 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
8592 return gva;
8593
8594 if (!is_64_bit_mode(vcpu))
8595 return gva;
8596
8597 /*
8598 * Bit 63 determines if the address should be treated as user address
8599 * or a supervisor address.
8600 */
8601 if (!(gva & BIT_ULL(63))) {
8602 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
8603 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
8604 return gva;
8605
8606 /* LAM_U48 is ignored if LAM_U57 is set. */
8607 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
8608 } else {
8609 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
8610 return gva;
8611
8612 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
8613 }
8614
8615 /*
8616 * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
8617 * Bit 63 is retained from the raw virtual address so that untagging
8618 * doesn't change a user access to a supervisor access, and vice versa.
8619 */
8620 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
8621 }
8622
vmx_handle_intel_pt_intr(void)8623 static unsigned int vmx_handle_intel_pt_intr(void)
8624 {
8625 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8626
8627 /* '0' on failure so that the !PT case can use a RET0 static call. */
8628 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8629 return 0;
8630
8631 kvm_make_request(KVM_REQ_PMI, vcpu);
8632 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8633 (unsigned long *)&vcpu->arch.pmu.global_status);
8634 return 1;
8635 }
8636
vmx_setup_user_return_msrs(void)8637 static __init void vmx_setup_user_return_msrs(void)
8638 {
8639
8640 /*
8641 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8642 * will emulate SYSCALL in legacy mode if the vendor string in guest
8643 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
8644 * support this emulation, MSR_STAR is included in the list for i386,
8645 * but is never loaded into hardware. MSR_CSTAR is also never loaded
8646 * into hardware and is here purely for emulation purposes.
8647 */
8648 const u32 vmx_uret_msrs_list[] = {
8649 #ifdef CONFIG_X86_64
8650 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8651 #endif
8652 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8653 MSR_IA32_TSX_CTRL,
8654 };
8655 int i;
8656
8657 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8658
8659 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8660 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8661 }
8662
vmx_setup_me_spte_mask(void)8663 static void __init vmx_setup_me_spte_mask(void)
8664 {
8665 u64 me_mask = 0;
8666
8667 /*
8668 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
8669 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
8670 * boot_cpu_data.x86_phys_bits holds the actual physical address
8671 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
8672 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
8673 */
8674 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
8675 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8676 kvm_host.maxphyaddr - 1);
8677
8678 /*
8679 * Unlike SME, host kernel doesn't support setting up any
8680 * MKTME KeyID on Intel platforms. No memory encryption
8681 * bits should be included into the SPTE.
8682 */
8683 kvm_mmu_set_me_spte_mask(0, me_mask);
8684 }
8685
vmx_hardware_setup(void)8686 __init int vmx_hardware_setup(void)
8687 {
8688 unsigned long host_bndcfgs;
8689 struct desc_ptr dt;
8690 int r;
8691
8692 store_idt(&dt);
8693 host_idt_base = dt.address;
8694
8695 vmx_setup_user_return_msrs();
8696
8697
8698 if (boot_cpu_has(X86_FEATURE_NX))
8699 kvm_enable_efer_bits(EFER_NX);
8700
8701 if (boot_cpu_has(X86_FEATURE_MPX)) {
8702 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
8703 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8704 }
8705
8706 if (!cpu_has_vmx_mpx())
8707 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8708 XFEATURE_MASK_BNDCSR);
8709
8710 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8711 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8712 enable_vpid = 0;
8713
8714 if (!cpu_has_vmx_ept() ||
8715 !cpu_has_vmx_ept_4levels() ||
8716 !cpu_has_vmx_ept_mt_wb() ||
8717 !cpu_has_vmx_invept_global())
8718 enable_ept = 0;
8719
8720 /* NX support is required for shadow paging. */
8721 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8722 pr_err_ratelimited("NX (Execute Disable) not supported\n");
8723 return -EOPNOTSUPP;
8724 }
8725
8726 /*
8727 * Shadow paging doesn't have a (further) performance penalty
8728 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
8729 * by default
8730 */
8731 if (!enable_ept)
8732 allow_smaller_maxphyaddr = true;
8733
8734 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8735 enable_ept_ad_bits = 0;
8736
8737 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8738 enable_unrestricted_guest = 0;
8739
8740 if (!cpu_has_vmx_flexpriority())
8741 flexpriority_enabled = 0;
8742
8743 if (!cpu_has_virtual_nmis())
8744 enable_vnmi = 0;
8745
8746 #ifdef CONFIG_X86_SGX_KVM
8747 if (!cpu_has_vmx_encls_vmexit())
8748 enable_sgx = false;
8749 #endif
8750
8751 /*
8752 * set_apic_access_page_addr() is used to reload apic access
8753 * page upon invalidation. No need to do anything if not
8754 * using the APIC_ACCESS_ADDR VMCS field.
8755 */
8756 if (!flexpriority_enabled)
8757 vt_x86_ops.set_apic_access_page_addr = NULL;
8758
8759 if (!cpu_has_vmx_tpr_shadow())
8760 vt_x86_ops.update_cr8_intercept = NULL;
8761
8762 #if IS_ENABLED(CONFIG_HYPERV)
8763 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8764 && enable_ept) {
8765 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8766 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8767 }
8768 #endif
8769
8770 if (!cpu_has_vmx_ple()) {
8771 ple_gap = 0;
8772 ple_window = 0;
8773 ple_window_grow = 0;
8774 ple_window_max = 0;
8775 ple_window_shrink = 0;
8776 }
8777
8778 if (!cpu_has_vmx_apicv())
8779 enable_apicv = 0;
8780 if (!enable_apicv)
8781 vt_x86_ops.sync_pir_to_irr = NULL;
8782
8783 if (!enable_apicv || !cpu_has_vmx_ipiv())
8784 enable_ipiv = false;
8785
8786 if (cpu_has_vmx_tsc_scaling())
8787 kvm_caps.has_tsc_control = true;
8788
8789 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8790 kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8791 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8792 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8793
8794 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8795
8796 if (enable_ept)
8797 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8798 cpu_has_vmx_ept_execute_only());
8799 else
8800 vt_x86_ops.get_mt_mask = NULL;
8801
8802 /*
8803 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8804 * bits to shadow_zero_check.
8805 */
8806 vmx_setup_me_spte_mask();
8807
8808 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
8809 ept_caps_to_lpage_level(vmx_capability.ept));
8810
8811 /*
8812 * Only enable PML when hardware supports PML feature, and both EPT
8813 * and EPT A/D bit features are enabled -- PML depends on them to work.
8814 */
8815 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8816 enable_pml = 0;
8817
8818 if (!cpu_has_vmx_preemption_timer())
8819 enable_preemption_timer = false;
8820
8821 if (enable_preemption_timer) {
8822 u64 use_timer_freq = 5000ULL * 1000 * 1000;
8823
8824 cpu_preemption_timer_multi =
8825 vmx_misc_preemption_timer_rate(vmcs_config.misc);
8826
8827 if (tsc_khz)
8828 use_timer_freq = (u64)tsc_khz * 1000;
8829 use_timer_freq >>= cpu_preemption_timer_multi;
8830
8831 /*
8832 * KVM "disables" the preemption timer by setting it to its max
8833 * value. Don't use the timer if it might cause spurious exits
8834 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8835 */
8836 if (use_timer_freq > 0xffffffffu / 10)
8837 enable_preemption_timer = false;
8838 }
8839
8840 if (!enable_preemption_timer) {
8841 vt_x86_ops.set_hv_timer = NULL;
8842 vt_x86_ops.cancel_hv_timer = NULL;
8843 }
8844
8845 kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8846 kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8847
8848 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8849 return -EINVAL;
8850 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8851 pt_mode = PT_MODE_SYSTEM;
8852 if (pt_mode == PT_MODE_HOST_GUEST)
8853 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8854 else
8855 vt_init_ops.handle_intel_pt_intr = NULL;
8856
8857 setup_default_sgx_lepubkeyhash();
8858
8859 vmx_set_cpu_caps();
8860
8861 /*
8862 * Configure nested capabilities after core CPU capabilities so that
8863 * nested support can be conditional on base support, e.g. so that KVM
8864 * can hide/show features based on kvm_cpu_cap_has().
8865 */
8866 if (nested) {
8867 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8868 if (r)
8869 return r;
8870 }
8871
8872 r = alloc_kvm_area();
8873 if (r)
8874 goto err_kvm_area;
8875
8876 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8877
8878 /*
8879 * On Intel CPUs that lack self-snoop feature, letting the guest control
8880 * memory types may result in unexpected behavior. So always ignore guest
8881 * PAT on those CPUs and map VM as writeback, not allowing userspace to
8882 * disable the quirk.
8883 *
8884 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
8885 * supported, UC is slow enough to cause issues with some older guests (e.g.
8886 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
8887 * map the video RAM, causing wayland desktop to fail to get started
8888 * correctly). To avoid breaking those older guests that rely on KVM to force
8889 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
8890 * safer (for performance) default behavior.
8891 *
8892 * On top of this, non-coherent DMA devices need the guest to flush CPU
8893 * caches properly. This also requires honoring guest PAT, and is forced
8894 * independent of the quirk in vmx_ignore_guest_pat().
8895 */
8896 if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
8897 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
8898
8899 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
8900
8901 return 0;
8902
8903 err_kvm_area:
8904 if (nested)
8905 nested_vmx_hardware_unsetup();
8906 return r;
8907 }
8908
vmx_exit(void)8909 void vmx_exit(void)
8910 {
8911 allow_smaller_maxphyaddr = false;
8912
8913 vmx_cleanup_l1d_flush();
8914
8915 kvm_x86_vendor_exit();
8916 }
8917
vmx_init(void)8918 int __init vmx_init(void)
8919 {
8920 int r, cpu;
8921
8922 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
8923
8924 if (!kvm_is_vmx_supported())
8925 return -EOPNOTSUPP;
8926
8927 /*
8928 * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
8929 * i.e. there's nothing to unwind if a later step fails.
8930 */
8931 hv_init_evmcs();
8932
8933 /*
8934 * Parse the VMCS config and VMX capabilities before anything else, so
8935 * that the information is available to all setup flows.
8936 */
8937 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8938 return -EIO;
8939
8940 r = kvm_x86_vendor_init(&vt_init_ops);
8941 if (r)
8942 return r;
8943
8944 /* Must be called after common x86 init so enable_ept is setup. */
8945 r = vmx_setup_l1d_flush();
8946 if (r)
8947 goto err_l1d_flush;
8948
8949 for_each_possible_cpu(cpu) {
8950 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8951
8952 pi_init_cpu(cpu);
8953 }
8954
8955 vmx_check_vmcs12_offsets();
8956
8957 return 0;
8958
8959 err_l1d_flush:
8960 kvm_x86_vendor_exit();
8961 return r;
8962 }
8963