1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
14 */
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17 #include <linux/highmem.h>
18 #include <linux/hrtimer.h>
19 #include <linux/kernel.h>
20 #include <linux/kvm_host.h>
21 #include <linux/module.h>
22 #include <linux/moduleparam.h>
23 #include <linux/mod_devicetable.h>
24 #include <linux/mm.h>
25 #include <linux/objtool.h>
26 #include <linux/sched.h>
27 #include <linux/sched/smt.h>
28 #include <linux/slab.h>
29 #include <linux/tboot.h>
30 #include <linux/trace_events.h>
31
32 #include <asm/apic.h>
33 #include <asm/asm.h>
34 #include <asm/cpu.h>
35 #include <asm/cpu_device_id.h>
36 #include <asm/debugreg.h>
37 #include <asm/desc.h>
38 #include <asm/fpu/api.h>
39 #include <asm/fpu/xstate.h>
40 #include <asm/fred.h>
41 #include <asm/idtentry.h>
42 #include <asm/io.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/reboot.h>
45 #include <asm/perf_event.h>
46 #include <asm/mmu_context.h>
47 #include <asm/mshyperv.h>
48 #include <asm/msr.h>
49 #include <asm/mwait.h>
50 #include <asm/spec-ctrl.h>
51 #include <asm/virt.h>
52 #include <asm/vmx.h>
53
54 #include <trace/events/ipi.h>
55
56 #include "capabilities.h"
57 #include "common.h"
58 #include "cpuid.h"
59 #include "hyperv.h"
60 #include "kvm_onhyperv.h"
61 #include "irq.h"
62 #include "kvm_cache_regs.h"
63 #include "lapic.h"
64 #include "mmu.h"
65 #include "nested.h"
66 #include "pmu.h"
67 #include "sgx.h"
68 #include "trace.h"
69 #include "vmcs.h"
70 #include "vmcs12.h"
71 #include "vmx.h"
72 #include "x86.h"
73 #include "x86_ops.h"
74 #include "smm.h"
75 #include "vmx_onhyperv.h"
76 #include "posted_intr.h"
77
78 #include "mmu/spte.h"
79
80 MODULE_AUTHOR("Qumranet");
81 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
82 MODULE_LICENSE("GPL");
83
84 #ifdef MODULE
85 static const struct x86_cpu_id vmx_cpu_id[] = {
86 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
87 {}
88 };
89 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
90 #endif
91
92 bool __read_mostly enable_vpid = 1;
93 module_param_named(vpid, enable_vpid, bool, 0444);
94
95 static bool __read_mostly enable_vnmi = 1;
96 module_param_named(vnmi, enable_vnmi, bool, 0444);
97
98 bool __read_mostly flexpriority_enabled = 1;
99 module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
100
101 bool __read_mostly enable_ept = 1;
102 module_param_named(ept, enable_ept, bool, 0444);
103
104 bool __read_mostly enable_unrestricted_guest = 1;
105 module_param_named(unrestricted_guest,
106 enable_unrestricted_guest, bool, 0444);
107
108 bool __read_mostly enable_ept_ad_bits = 1;
109 module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
110
111 static bool __read_mostly emulate_invalid_guest_state = true;
112 module_param(emulate_invalid_guest_state, bool, 0444);
113
114 static bool __read_mostly fasteoi = 1;
115 module_param(fasteoi, bool, 0444);
116
117 module_param(enable_apicv, bool, 0444);
118 module_param(enable_ipiv, bool, 0444);
119
120 module_param(enable_device_posted_irqs, bool, 0444);
121
/*
 * If nested=1, nested virtualization is supported, i.e. guests may use
 * VMX and be hypervisors for their own guests.  If nested=0, guests may
 * not use VMX instructions.
 */
127 static bool __read_mostly nested = 1;
128 module_param(nested, bool, 0444);
129
130 bool __read_mostly enable_pml = 1;
131 module_param_named(pml, enable_pml, bool, 0444);
132
133 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
134 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
135
136 static bool __read_mostly dump_invalid_vmcs = 0;
137 module_param(dump_invalid_vmcs, bool, 0644);
138
139 #define MSR_BITMAP_MODE_X2APIC 1
140 #define MSR_BITMAP_MODE_X2APIC_APICV 2
141
142 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
143
144 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
145 static int __read_mostly cpu_preemption_timer_multi;
146 static bool __read_mostly enable_preemption_timer = 1;
147 #ifdef CONFIG_X86_64
148 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
149 #endif
150
151 extern bool __read_mostly allow_smaller_maxphyaddr;
152 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
153
154 module_param(enable_mediated_pmu, bool, 0444);
155
156 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
157 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
158 #define KVM_VM_CR0_ALWAYS_ON \
159 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
160
161 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
162 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
163 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
164
165 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
166
167 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
168 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
169 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
170 RTIT_STATUS_BYTECNT))
171
/*
 * These two parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop.  Also indicates whether PLE is
 *             enabled.  According to tests, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to
 *             execute in a PAUSE loop.  Tests indicate that most spinlocks
 *             are held for less than 2^12 cycles.
 * Time is measured with a counter that runs at the same rate as the TSC;
 * see SDM volume 3b, sections 21.6.13 and 22.1.3.
 */
183 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
184 module_param(ple_gap, uint, 0444);
185
186 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
187 module_param(ple_window, uint, 0444);
188
189 /* Default doubles per-vcpu window every exit. */
190 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
191 module_param(ple_window_grow, uint, 0444);
192
193 /* Default resets per-vcpu window every exit to ple_window. */
194 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
195 module_param(ple_window_shrink, uint, 0444);
196
197 /* Default is to compute the maximum so we can never overflow. */
198 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
199 module_param(ple_window_max, uint, 0444);
200
201 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
202 int __read_mostly pt_mode = PT_MODE_SYSTEM;
203 #ifdef CONFIG_BROKEN
204 module_param(pt_mode, int, S_IRUGO);
205 #endif
206
207 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
208
209 #ifdef CONFIG_CPU_MITIGATIONS
210 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
211 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
212 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
213
214 /* Storage for pre module init parameter parsing */
215 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
216
217 static const struct {
218 const char *option;
219 bool for_parse;
220 } vmentry_l1d_param[] = {
221 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
222 [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
223 [VMENTER_L1D_FLUSH_COND] = {"cond", true},
224 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
225 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
226 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
227 };
228
229 #define L1D_CACHE_ORDER 4
230 static void *vmx_l1d_flush_pages;
231
static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
233 {
234 struct page *page;
235 unsigned int i;
236
237 if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
238 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
239 return 0;
240 }
241
242 if (!enable_ept) {
243 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
244 return 0;
245 }
246
247 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
248 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
249 return 0;
250 }
251
252 /* If set to auto use the default l1tf mitigation method */
253 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
254 switch (l1tf_mitigation) {
255 case L1TF_MITIGATION_OFF:
256 l1tf = VMENTER_L1D_FLUSH_NEVER;
257 break;
258 case L1TF_MITIGATION_AUTO:
259 case L1TF_MITIGATION_FLUSH_NOWARN:
260 case L1TF_MITIGATION_FLUSH:
261 case L1TF_MITIGATION_FLUSH_NOSMT:
262 l1tf = VMENTER_L1D_FLUSH_COND;
263 break;
264 case L1TF_MITIGATION_FULL:
265 case L1TF_MITIGATION_FULL_FORCE:
266 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
267 break;
268 }
269 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
270 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
271 }
272
273 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
274 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
275 /*
276 * This allocation for vmx_l1d_flush_pages is not tied to a VM
277 * lifetime and so should not be charged to a memcg.
278 */
279 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
280 if (!page)
281 return -ENOMEM;
282 vmx_l1d_flush_pages = page_address(page);
283
284 /*
285 * Initialize each page with a different pattern in
286 * order to protect against KSM in the nested
287 * virtualization case.
288 */
289 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
290 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
291 PAGE_SIZE);
292 }
293 }
294
295 l1tf_vmx_mitigation = l1tf;
296
297 if (l1tf != VMENTER_L1D_FLUSH_NEVER)
298 static_branch_enable(&vmx_l1d_should_flush);
299 else
300 static_branch_disable(&vmx_l1d_should_flush);
301
302 if (l1tf == VMENTER_L1D_FLUSH_COND)
303 static_branch_enable(&vmx_l1d_flush_cond);
304 else
305 static_branch_disable(&vmx_l1d_flush_cond);
306 return 0;
307 }
308
static int vmx_setup_l1d_flush(void)
310 {
/*
 * Hand in the mitigation value that was stored by the pre-module-init
 * parameter parser.  If no parameter was given, it contains 'auto', which
 * is turned into the default 'cond' mitigation mode.
 */
316 return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
317 }
318
static void vmx_cleanup_l1d_flush(void)
320 {
321 if (vmx_l1d_flush_pages) {
322 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
323 vmx_l1d_flush_pages = NULL;
324 }
325 /* Restore state so sysfs ignores VMX */
326 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
327 }
328
static int vmentry_l1d_flush_parse(const char *s)
330 {
331 unsigned int i;
332
333 if (s) {
334 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
335 if (vmentry_l1d_param[i].for_parse &&
336 sysfs_streq(s, vmentry_l1d_param[i].option))
337 return i;
338 }
339 }
340 return -EINVAL;
341 }
342
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
344 {
345 int l1tf, ret;
346
347 l1tf = vmentry_l1d_flush_parse(s);
348 if (l1tf < 0)
349 return l1tf;
350
351 if (!boot_cpu_has(X86_BUG_L1TF))
352 return 0;
353
354 /*
355 * Has vmx_init() run already? If not then this is the pre init
356 * parameter parsing. In that case just store the value and let
357 * vmx_init() do the proper setup after enable_ept has been
358 * established.
359 */
360 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
361 vmentry_l1d_flush_param = l1tf;
362 return 0;
363 }
364
365 mutex_lock(&vmx_l1d_flush_mutex);
366 ret = __vmx_setup_l1d_flush(l1tf);
367 mutex_unlock(&vmx_l1d_flush_mutex);
368 return ret;
369 }
370
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
372 {
373 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
374 return sysfs_emit(s, "???\n");
375
376 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
377 }
378
379 /*
380 * Software based L1D cache flush which is used when microcode providing
381 * the cache control MSR is not loaded.
382 *
383 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
384 * flush it is required to read in 64 KiB because the replacement algorithm
385 * is not exactly LRU. This could be sized at runtime via topology
386 * information but as all relevant affected CPUs have 32KiB L1D cache size
387 * there is no point in doing so.
388 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
390 {
391 int size = PAGE_SIZE << L1D_CACHE_ORDER;
392
393 if (!static_branch_unlikely(&vmx_l1d_should_flush))
394 return;
395
396 /*
397 * This code is only executed when the flush mode is 'cond' or
398 * 'always'
399 */
400 if (static_branch_likely(&vmx_l1d_flush_cond)) {
401 /*
402 * Clear the per-cpu flush bit, it gets set again if the vCPU
403 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
404 * exits to userspace, or if KVM reaches one of the unsafe
405 * VMEXIT handlers, e.g. if KVM calls into the emulator,
406 * or from the interrupt handlers.
407 */
408 if (!kvm_get_cpu_l1tf_flush_l1d())
409 return;
410 kvm_clear_cpu_l1tf_flush_l1d();
411 }
412
413 vcpu->stat.l1d_flush++;
414
415 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
416 native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
417 return;
418 }
419
420 asm volatile(
421 /* First ensure the pages are in the TLB */
422 "xorl %%eax, %%eax\n"
423 ".Lpopulate_tlb:\n\t"
424 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
425 "addl $4096, %%eax\n\t"
426 "cmpl %%eax, %[size]\n\t"
427 "jne .Lpopulate_tlb\n\t"
428 "xorl %%eax, %%eax\n\t"
429 "cpuid\n\t"
430 /* Now fill the cache */
431 "xorl %%eax, %%eax\n"
432 ".Lfill_cache:\n"
433 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
434 "addl $64, %%eax\n\t"
435 "cmpl %%eax, %[size]\n\t"
436 "jne .Lfill_cache\n\t"
437 "lfence\n"
438 :: [flush_pages] "r" (vmx_l1d_flush_pages),
439 [size] "r" (size)
440 : "eax", "ebx", "ecx", "edx");
441 }
442
#else /* CONFIG_CPU_MITIGATIONS */
static int vmx_setup_l1d_flush(void)
445 {
446 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
447 return 0;
448 }
static void vmx_cleanup_l1d_flush(void)
450 {
451 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
452 }
static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
454 {
455
456 }
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
458 {
459 pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
460 return 0;
461 }
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
463 {
464 return sysfs_emit(s, "never\n");
465 }
466 #endif
467
468 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
469 .set = vmentry_l1d_flush_set,
470 .get = vmentry_l1d_flush_get,
471 };
472 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
473
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
475 {
476 u64 msr;
477
478 if (!vmx->disable_fb_clear)
479 return;
480
481 msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
482 msr |= FB_CLEAR_DIS;
483 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
484 /* Cache the MSR value to avoid reading it later */
485 vmx->msr_ia32_mcu_opt_ctrl = msr;
486 }
487
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
489 {
490 if (!vmx->disable_fb_clear)
491 return;
492
493 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
494 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
495 }
496
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
498 {
499 /*
500 * Disable VERW's behavior of clearing CPU buffers for the guest if the
501 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
502 * the mitigation. Disabling the clearing behavior provides a
503 * performance boost for guests that aren't aware that manually clearing
504 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
505 * and VM-Exit.
506 */
507 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
508 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
509 !boot_cpu_has_bug(X86_BUG_MDS) &&
510 !boot_cpu_has_bug(X86_BUG_TAA);
511
512 /*
513 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
514 * at VMEntry. Skip the MSR read/write when a guest has no use case to
515 * execute VERW.
516 */
517 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
518 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
519 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
520 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
521 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
522 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
523 vmx->disable_fb_clear = false;
524 }
525
526 static u32 vmx_segment_access_rights(struct kvm_segment *var);
527
528 void vmx_vmexit(void);
529
530 #define vmx_insn_failed(fmt...) \
531 do { \
532 WARN_ONCE(1, fmt); \
533 pr_warn_ratelimited(fmt); \
534 } while (0)
535
noinline void vmread_error(unsigned long field)
537 {
538 vmx_insn_failed("vmread failed: field=%lx\n", field);
539 }
540
541 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
543 {
544 if (fault) {
545 kvm_spurious_fault();
546 } else {
547 instrumentation_begin();
548 vmread_error(field);
549 instrumentation_end();
550 }
551 }
552 #endif
553
noinline void vmwrite_error(unsigned long field, unsigned long value)
555 {
556 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
557 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
558 }
559
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
561 {
562 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
563 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
564 }
565
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
567 {
568 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
569 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
570 }
571
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
573 {
574 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
575 ext, vpid, gva);
576 }
577
noinline void invept_error(unsigned long ext, u64 eptp)
579 {
580 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
581 }
582
583 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This
 * is needed when a CPU is brought down and we need to VMCLEAR all VMCSs
 * loaded on it.
 */
588 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
589
590 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
591 static DEFINE_SPINLOCK(vmx_vpid_lock);
592
593 struct vmcs_config vmcs_config __ro_after_init;
594 struct vmx_capability vmx_capability __ro_after_init;
595
596 #define VMX_SEGMENT_FIELD(seg) \
597 [VCPU_SREG_##seg] = { \
598 .selector = GUEST_##seg##_SELECTOR, \
599 .base = GUEST_##seg##_BASE, \
600 .limit = GUEST_##seg##_LIMIT, \
601 .ar_bytes = GUEST_##seg##_AR_BYTES, \
602 }
603
604 static const struct kvm_vmx_segment_field {
605 unsigned selector;
606 unsigned base;
607 unsigned limit;
608 unsigned ar_bytes;
609 } kvm_vmx_segment_fields[] = {
610 VMX_SEGMENT_FIELD(CS),
611 VMX_SEGMENT_FIELD(DS),
612 VMX_SEGMENT_FIELD(ES),
613 VMX_SEGMENT_FIELD(FS),
614 VMX_SEGMENT_FIELD(GS),
615 VMX_SEGMENT_FIELD(SS),
616 VMX_SEGMENT_FIELD(TR),
617 VMX_SEGMENT_FIELD(LDTR),
618 };
619
620
621 static unsigned long host_idt_base;
622
623 #if IS_ENABLED(CONFIG_HYPERV)
624 static bool __read_mostly enlightened_vmcs = true;
625 module_param(enlightened_vmcs, bool, 0444);
626
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
628 {
629 struct hv_enlightened_vmcs *evmcs;
630 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
631
632 if (partition_assist_page == INVALID_PAGE)
633 return -ENOMEM;
634
635 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
636
637 evmcs->partition_assist_page = partition_assist_page;
638 evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
639 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
640
641 return 0;
642 }
643
static __init void hv_init_evmcs(void)
645 {
646 int cpu;
647
648 if (!enlightened_vmcs)
649 return;
650
651 /*
652 * Enlightened VMCS usage should be recommended and the host needs
653 * to support eVMCS v1 or above.
654 */
655 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
656 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
657 KVM_EVMCS_VERSION) {
658
659 /* Check that we have assist pages on all online CPUs */
660 for_each_online_cpu(cpu) {
661 if (!hv_get_vp_assist_page(cpu)) {
662 enlightened_vmcs = false;
663 break;
664 }
665 }
666
667 if (enlightened_vmcs) {
668 pr_info("Using Hyper-V Enlightened VMCS\n");
669 static_branch_enable(&__kvm_is_using_evmcs);
670 }
671
672 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
673 vt_x86_ops.enable_l2_tlb_flush
674 = hv_enable_l2_tlb_flush;
675 } else {
676 enlightened_vmcs = false;
677 }
678 }
679
static void hv_reset_evmcs(void)
681 {
682 struct hv_vp_assist_page *vp_ap;
683
684 if (!kvm_is_using_evmcs())
685 return;
686
/*
 * KVM should enable eVMCS if and only if all CPUs have a VP assist
 * page, and should reject CPU onlining if eVMCS is enabled but the
 * CPU doesn't have a VP assist page allocated.
 */
692 vp_ap = hv_get_vp_assist_page(smp_processor_id());
693 if (WARN_ON_ONCE(!vp_ap))
694 return;
695
696 /*
697 * Reset everything to support using non-enlightened VMCS access later
698 * (e.g. when we reload the module with enlightened_vmcs=0)
699 */
700 vp_ap->nested_control.features.directhypercall = 0;
701 vp_ap->current_nested_vmcs = 0;
702 vp_ap->enlighten_vmentry = 0;
703 }
704
705 #else /* IS_ENABLED(CONFIG_HYPERV) */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
708 #endif /* IS_ENABLED(CONFIG_HYPERV) */
709
/*
 * Comment format: document - errata name - stepping - processor name.
 * Taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
715 static u32 vmx_preemption_cpu_tfms[] = {
716 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
717 0x000206E6,
718 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */
719 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
720 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
721 0x00020652,
722 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
723 0x00020655,
724 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
725 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
726 /*
727 * 320767.pdf - AAP86 - B1 -
728 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
729 */
730 0x000106E5,
731 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
732 0x000106A0,
733 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
734 0x000106A1,
735 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
736 0x000106A4,
737 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
738 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
739 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
740 0x000106A5,
741 /* Xeon E3-1220 V2 */
742 0x000306A8,
743 };
744
static inline bool cpu_has_broken_vmx_preemption_timer(void)
746 {
747 u32 eax = cpuid_eax(0x00000001), i;
748
749 /* Clear the reserved bits */
750 eax &= ~(0x3U << 14 | 0xfU << 28);
751 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
752 if (eax == vmx_preemption_cpu_tfms[i])
753 return true;
754
755 return false;
756 }
757
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
759 {
760 return flexpriority_enabled && lapic_in_kernel(vcpu);
761 }
762
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
764 {
765 int i;
766
767 i = kvm_find_user_return_msr(msr);
768 if (i >= 0)
769 return &vmx->guest_uret_msrs[i];
770 return NULL;
771 }
772
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data)
775 {
776 unsigned int slot = msr - vmx->guest_uret_msrs;
777 int ret = 0;
778
779 if (msr->load_into_hardware) {
780 preempt_disable();
781 ret = kvm_set_user_return_msr(slot, data, msr->mask);
782 preempt_enable();
783 }
784 if (!ret)
785 msr->data = data;
786 return ret;
787 }
788
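/*
 * Emergency disable (e.g. crash/kexec with VMs running): VMCLEAR every VMCS
 * loaded on this CPU so the CPU can't write back stale VMCS state and
 * corrupt memory after the current kernel hands off the machine.
 */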
void vmx_emergency_disable_virtualization_cpu(void)
790 {
791 int cpu = raw_smp_processor_id();
792 struct loaded_vmcs *v;
793
794 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
795 loaded_vmcss_on_cpu_link) {
796 vmcs_clear(v->vmcs);
797 if (v->shadow_vmcs)
798 vmcs_clear(v->shadow_vmcs);
799 }
800 }
801
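/*
 * Runs on the CPU associated with the target loaded_vmcs (via IPI from
 * loaded_vmcs_clear()): VMCLEAR the VMCS and remove it from this CPU's
 * loaded_vmcss_on_cpu list.
 */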
static void __loaded_vmcs_clear(void *arg)
803 {
804 struct loaded_vmcs *loaded_vmcs = arg;
805 int cpu = raw_smp_processor_id();
806
807 if (loaded_vmcs->cpu != cpu)
808 return; /* vcpu migration can race with cpu offline */
809 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
810 per_cpu(current_vmcs, cpu) = NULL;
811
812 vmcs_clear(loaded_vmcs->vmcs);
813 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
814 vmcs_clear(loaded_vmcs->shadow_vmcs);
815
816 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
817
818 /*
819 * Ensure all writes to loaded_vmcs, including deleting it from its
820 * current percpu list, complete before setting loaded_vmcs->cpu to
821 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
822 * and add loaded_vmcs to its percpu list before it's deleted from this
823 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
824 */
825 smp_wmb();
826
827 loaded_vmcs->cpu = -1;
828 loaded_vmcs->launched = 0;
829 }
830
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
832 {
833 int cpu = loaded_vmcs->cpu;
834
835 if (cpu != -1)
836 smp_call_function_single(cpu,
837 __loaded_vmcs_clear, loaded_vmcs, 1);
838 }
839
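/*
 * Returns true if @field of @seg is already valid in the segment cache, and
 * marks it valid either way; on a miss the caller re-reads the field from
 * the VMCS. The entire cache is reset when VCPU_EXREG_SEGMENTS has been
 * invalidated.
 */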
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
unsigned field)
842 {
843 bool ret;
844 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
845
846 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
847 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
848 vmx->segment_cache.bitmask = 0;
849 }
850 ret = vmx->segment_cache.bitmask & mask;
851 vmx->segment_cache.bitmask |= mask;
852 return ret;
853 }
854
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
856 {
857 u16 *p = &vmx->segment_cache.seg[seg].selector;
858
859 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
860 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
861 return *p;
862 }
863
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
865 {
866 ulong *p = &vmx->segment_cache.seg[seg].base;
867
868 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
869 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
870 return *p;
871 }
872
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
874 {
875 u32 *p = &vmx->segment_cache.seg[seg].limit;
876
877 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
878 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
879 return *p;
880 }
881
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
883 {
884 u32 *p = &vmx->segment_cache.seg[seg].ar;
885
886 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
887 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
888 return *p;
889 }
890
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
892 {
893 u32 eb;
894
895 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
896 (1u << DB_VECTOR) | (1u << AC_VECTOR);
897 /*
898 * #VE isn't used for VMX. To test against unexpected changes
899 * related to #VE for VMX, intercept unexpected #VE and warn on it.
900 */
901 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
902 eb |= 1u << VE_VECTOR;
903 /*
904 * Guest access to VMware backdoor ports could legitimately
905 * trigger #GP because of TSS I/O permission bitmap.
906 * We intercept those #GP and allow access to them anyway
907 * as VMware does.
908 */
909 if (enable_vmware_backdoor)
910 eb |= (1u << GP_VECTOR);
911 if ((vcpu->guest_debug &
912 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
913 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
914 eb |= 1u << BP_VECTOR;
915 if (to_vmx(vcpu)->rmode.vm86_active)
916 eb = ~0;
917 if (!vmx_need_pf_intercept(vcpu))
918 eb &= ~(1u << PF_VECTOR);
919
920 /* When we are running a nested L2 guest and L1 specified for it a
921 * certain exception bitmap, we must trap the same exceptions and pass
922 * them to L1. When running L2, we will only handle the exceptions
923 * specified above if L1 did not want them.
924 */
925 if (is_guest_mode(vcpu))
926 eb |= get_vmcs12(vcpu)->exception_bitmap;
927 else {
928 int mask = 0, match = 0;
929
930 if (enable_ept && (eb & (1u << PF_VECTOR))) {
931 /*
932 * If EPT is enabled, #PF is currently only intercepted
933 * if MAXPHYADDR is smaller on the guest than on the
934 * host. In that case we only care about present,
935 * non-reserved faults. For vmcs02, however, PFEC_MASK
936 * and PFEC_MATCH are set in prepare_vmcs02_rare.
937 */
938 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
939 match = PFERR_PRESENT_MASK;
940 }
941 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
942 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
943 }
944
945 /*
946 * Disabling xfd interception indicates that dynamic xfeatures
947 * might be used in the guest. Always trap #NM in this case
948 * to save guest xfd_err timely.
949 */
950 if (vcpu->arch.xfd_no_write_intercept)
951 eb |= (1u << NM_VECTOR);
952
953 vmcs_write32(EXCEPTION_BITMAP, eb);
954 }
955
/*
 * Check if writes to the given MSR are intercepted by the currently loaded
 * MSR bitmap.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
960 {
961 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
962 return true;
963
964 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
965 }
966
unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
968 {
969 unsigned int flags = 0;
970
971 if (vmx->loaded_vmcs->launched)
972 flags |= VMX_RUN_VMRESUME;
973
974 /*
975 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
976 * to change it directly without causing a vmexit. In that case read
977 * it after vmexit and store it in vmx->spec_ctrl.
978 */
979 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
980 flags |= VMX_RUN_SAVE_SPEC_CTRL;
981
982 if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
983 kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
984 flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
985
986 return flags;
987 }
988
static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
991 {
992 vm_entry_controls_clearbit(vmx, entry);
993 vm_exit_controls_clearbit(vmx, exit);
994 }
995
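/* Find the slot of @msr in an atomic switch (load/store) MSR area, or -ENOENT. */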
static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
997 {
998 unsigned int i;
999
1000 for (i = 0; i < m->nr; ++i) {
1001 if (m->val[i].index == msr)
1002 return i;
1003 }
1004 return -ENOENT;
1005 }
1006
static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr,
unsigned long vmcs_count_field)
1009 {
1010 int i;
1011
1012 i = vmx_find_loadstore_msr_slot(m, msr);
1013 if (i < 0)
1014 return;
1015
1016 --m->nr;
1017 m->val[i] = m->val[m->nr];
1018 vmcs_write32(vmcs_count_field, m->nr);
1019 }
1020
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1022 {
1023 struct msr_autoload *m = &vmx->msr_autoload;
1024
1025 switch (msr) {
1026 case MSR_EFER:
1027 if (cpu_has_load_ia32_efer()) {
1028 clear_atomic_switch_msr_special(vmx,
1029 VM_ENTRY_LOAD_IA32_EFER,
1030 VM_EXIT_LOAD_IA32_EFER);
1031 return;
1032 }
1033 break;
1034 case MSR_CORE_PERF_GLOBAL_CTRL:
1035 if (cpu_has_load_perf_global_ctrl()) {
1036 clear_atomic_switch_msr_special(vmx,
1037 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1038 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1039 return;
1040 }
1041 break;
1042 }
1043
1044 vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT);
1045 vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT);
1046 }
1047
static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit,
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
u64 guest_val, u64 host_val)
1052 {
1053 vmcs_write64(guest_val_vmcs, guest_val);
1054 if (host_val_vmcs != HOST_IA32_EFER)
1055 vmcs_write64(host_val_vmcs, host_val);
1056 vm_entry_controls_setbit(vmx, entry);
1057 vm_exit_controls_setbit(vmx, exit);
1058 }
1059
static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value,
unsigned long vmcs_count_field, struct kvm *kvm)
1062 {
1063 int i;
1064
1065 i = vmx_find_loadstore_msr_slot(m, msr);
1066 if (i < 0) {
1067 if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm))
1068 return;
1069
1070 i = m->nr++;
1071 m->val[i].index = msr;
1072 vmcs_write32(vmcs_count_field, m->nr);
1073 }
1074 m->val[i].value = value;
1075 }
1076
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val)
1079 {
1080 struct msr_autoload *m = &vmx->msr_autoload;
1081 struct kvm *kvm = vmx->vcpu.kvm;
1082
1083 switch (msr) {
1084 case MSR_EFER:
1085 if (cpu_has_load_ia32_efer()) {
1086 add_atomic_switch_msr_special(vmx,
1087 VM_ENTRY_LOAD_IA32_EFER,
1088 VM_EXIT_LOAD_IA32_EFER,
1089 GUEST_IA32_EFER,
1090 HOST_IA32_EFER,
1091 guest_val, host_val);
1092 return;
1093 }
1094 break;
1095 case MSR_CORE_PERF_GLOBAL_CTRL:
1096 if (cpu_has_load_perf_global_ctrl()) {
1097 add_atomic_switch_msr_special(vmx,
1098 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1099 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1100 GUEST_IA32_PERF_GLOBAL_CTRL,
1101 HOST_IA32_PERF_GLOBAL_CTRL,
1102 guest_val, host_val);
1103 return;
1104 }
1105 break;
1106 case MSR_IA32_PEBS_ENABLE:
1107 /* PEBS needs a quiescent period after being disabled (to write
1108 * a record). Disabling PEBS through VMX MSR swapping doesn't
1109 * provide that period, so a CPU could write host's record into
1110 * guest's memory.
1111 */
1112 wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
1113 }
1114
1115 vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm);
1116 vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm);
1117 }
1118
static bool update_transition_efer(struct vcpu_vmx *vmx)
1120 {
1121 u64 guest_efer = vmx->vcpu.arch.efer;
1122 u64 ignore_bits = 0;
1123 int i;
1124
1125 /* Shadow paging assumes NX to be available. */
1126 if (!enable_ept)
1127 guest_efer |= EFER_NX;
1128
1129 /*
1130 * LMA and LME handled by hardware; SCE meaningless outside long mode.
1131 */
1132 ignore_bits |= EFER_SCE;
1133 #ifdef CONFIG_X86_64
1134 ignore_bits |= EFER_LMA | EFER_LME;
1135 /* SCE is meaningful only in long mode on Intel */
1136 if (guest_efer & EFER_LMA)
1137 ignore_bits &= ~(u64)EFER_SCE;
1138 #endif
1139
1140 /*
1141 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1142 * On CPUs that support "load IA32_EFER", always switch EFER
1143 * atomically, since it's faster than switching it manually.
1144 */
1145 if (cpu_has_load_ia32_efer() ||
1146 (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
1147 if (!(guest_efer & EFER_LMA))
1148 guest_efer &= ~EFER_LME;
1149 if (guest_efer != kvm_host.efer)
1150 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer);
1151 else
1152 clear_atomic_switch_msr(vmx, MSR_EFER);
1153 return false;
1154 }
1155
1156 i = kvm_find_user_return_msr(MSR_EFER);
1157 if (i < 0)
1158 return false;
1159
1160 clear_atomic_switch_msr(vmx, MSR_EFER);
1161
1162 guest_efer &= ~ignore_bits;
1163 guest_efer |= kvm_host.efer & ignore_bits;
1164
1165 vmx->guest_uret_msrs[i].data = guest_efer;
1166 vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1167
1168 return true;
1169 }
1170
static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
1172 {
1173 vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT,
1174 vmx->vcpu.kvm);
1175 }
1176
static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
1178 {
1179 vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT);
1180 }
1181
1182 #ifdef CONFIG_X86_32
1183 /*
1184 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1185 * VMCS rather than the segment table. KVM uses this helper to figure
1186 * out the current bases to poke them into the VMCS before entry.
1187 */
static unsigned long segment_base(u16 selector)
1189 {
1190 struct desc_struct *table;
1191 unsigned long v;
1192
1193 if (!(selector & ~SEGMENT_RPL_MASK))
1194 return 0;
1195
1196 table = get_current_gdt_ro();
1197
1198 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1199 u16 ldt_selector = kvm_read_ldt();
1200
1201 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1202 return 0;
1203
1204 table = (struct desc_struct *)segment_base(ldt_selector);
1205 }
1206 v = get_desc_base(&table[selector >> 3]);
1207 return v;
1208 }
1209 #endif
1210
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1212 {
1213 return vmx_pt_mode_is_host_guest() &&
1214 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1215 }
1216
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1218 {
1219 /* The base must be 128-byte aligned and a legal physical address. */
1220 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
1221 }
1222
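/*
 * Load/save the non-CTL Processor Trace MSRs. Tracing is disabled (RTIT_CTL
 * cleared) before these are swapped between host and guest context.
 */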
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1224 {
1225 u32 i;
1226
1227 wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
1228 wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1229 wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1230 wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1231 for (i = 0; i < addr_range; i++) {
1232 wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1233 wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1234 }
1235 }
1236
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1238 {
1239 u32 i;
1240
1241 rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
1242 rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1243 rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1244 rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1245 for (i = 0; i < addr_range; i++) {
1246 rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1247 rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1248 }
1249 }
1250
static void pt_guest_enter(struct vcpu_vmx *vmx)
1252 {
1253 if (vmx_pt_mode_is_system())
1254 return;
1255
1256 /*
1257 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1258 * Save host state before VM entry.
1259 */
1260 rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1261 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1262 wrmsrq(MSR_IA32_RTIT_CTL, 0);
1263 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1264 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1265 }
1266 }
1267
static void pt_guest_exit(struct vcpu_vmx *vmx)
1269 {
1270 if (vmx_pt_mode_is_system())
1271 return;
1272
1273 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1274 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1275 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1276 }
1277
1278 /*
1279 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1280 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
1281 */
1282 if (vmx->pt_desc.host.ctl)
1283 wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1284 }
1285
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base)
1288 {
1289 if (unlikely(fs_sel != host->fs_sel)) {
1290 if (!(fs_sel & 7))
1291 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1292 else
1293 vmcs_write16(HOST_FS_SELECTOR, 0);
1294 host->fs_sel = fs_sel;
1295 }
1296 if (unlikely(gs_sel != host->gs_sel)) {
1297 if (!(gs_sel & 7))
1298 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1299 else
1300 vmcs_write16(HOST_GS_SELECTOR, 0);
1301 host->gs_sel = gs_sel;
1302 }
1303 if (unlikely(fs_base != host->fs_base)) {
1304 vmcs_writel(HOST_FS_BASE, fs_base);
1305 host->fs_base = fs_base;
1306 }
1307 if (unlikely(gs_base != host->gs_base)) {
1308 vmcs_writel(HOST_GS_BASE, gs_base);
1309 host->gs_base = gs_base;
1310 }
1311 }
1312
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1314 {
1315 struct vcpu_vmx *vmx = to_vmx(vcpu);
1316 struct vcpu_vt *vt = to_vt(vcpu);
1317 struct vmcs_host_state *host_state;
1318 #ifdef CONFIG_X86_64
1319 int cpu = raw_smp_processor_id();
1320 #endif
1321 unsigned long fs_base, gs_base;
1322 u16 fs_sel, gs_sel;
1323 int i;
1324
1325 /*
1326 * Note that guest MSRs to be saved/restored can also be changed
1327 * when guest state is loaded. This happens when guest transitions
1328 * to/from long-mode by setting MSR_EFER.LMA.
1329 */
1330 if (!vmx->guest_uret_msrs_loaded) {
1331 vmx->guest_uret_msrs_loaded = true;
1332 for (i = 0; i < kvm_nr_uret_msrs; ++i) {
1333 if (!vmx->guest_uret_msrs[i].load_into_hardware)
1334 continue;
1335
1336 kvm_set_user_return_msr(i,
1337 vmx->guest_uret_msrs[i].data,
1338 vmx->guest_uret_msrs[i].mask);
1339 }
1340 }
1341
1342 if (vmx->nested.need_vmcs12_to_shadow_sync)
1343 nested_sync_vmcs12_to_shadow(vcpu);
1344
1345 if (vt->guest_state_loaded)
1346 return;
1347
1348 host_state = &vmx->loaded_vmcs->host_state;
1349
1350 /*
1351 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1352 * allow segment selectors with cpl > 0 or ti == 1.
1353 */
1354 host_state->ldt_sel = kvm_read_ldt();
1355
1356 #ifdef CONFIG_X86_64
1357 savesegment(ds, host_state->ds_sel);
1358 savesegment(es, host_state->es_sel);
1359
1360 gs_base = cpu_kernelmode_gs_base(cpu);
1361 if (likely(is_64bit_mm(current->mm))) {
1362 current_save_fsgs();
1363 fs_sel = current->thread.fsindex;
1364 gs_sel = current->thread.gsindex;
1365 fs_base = current->thread.fsbase;
1366 vt->msr_host_kernel_gs_base = current->thread.gsbase;
1367 } else {
1368 savesegment(fs, fs_sel);
1369 savesegment(gs, gs_sel);
1370 fs_base = read_msr(MSR_FS_BASE);
1371 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1372 }
1373
1374 wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1375 #else
1376 savesegment(fs, fs_sel);
1377 savesegment(gs, gs_sel);
1378 fs_base = segment_base(fs_sel);
1379 gs_base = segment_base(gs_sel);
1380 #endif
1381
1382 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1383 vt->guest_state_loaded = true;
1384 }
1385
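/*
 * Undo vmx_prepare_switch_to_guest(): restore the host segment registers,
 * LDT and KERNEL_GS_BASE that were clobbered while guest state was loaded.
 */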
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1387 {
1388 struct vmcs_host_state *host_state;
1389
1390 if (!vmx->vt.guest_state_loaded)
1391 return;
1392
1393 host_state = &vmx->loaded_vmcs->host_state;
1394
1395 ++vmx->vcpu.stat.host_state_reload;
1396
1397 #ifdef CONFIG_X86_64
1398 rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1399 #endif
1400 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1401 kvm_load_ldt(host_state->ldt_sel);
1402 #ifdef CONFIG_X86_64
1403 load_gs_index(host_state->gs_sel);
1404 #else
1405 loadsegment(gs, host_state->gs_sel);
1406 #endif
1407 }
1408 if (host_state->fs_sel & 7)
1409 loadsegment(fs, host_state->fs_sel);
1410 #ifdef CONFIG_X86_64
1411 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1412 loadsegment(ds, host_state->ds_sel);
1413 loadsegment(es, host_state->es_sel);
1414 }
1415 #endif
1416 invalidate_tss_limit();
1417 #ifdef CONFIG_X86_64
1418 wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
1419 #endif
1420 load_fixmap_gdt(raw_smp_processor_id());
1421 vmx->vt.guest_state_loaded = false;
1422 vmx->guest_uret_msrs_loaded = false;
1423 }
1424
1425 #ifdef CONFIG_X86_64
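/*
 * Read/write an MSR whose guest value may be loaded in hardware. If guest
 * state is currently loaded, access the MSR directly and keep the software
 * cache in sync; otherwise only the cached value is used.
 */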
static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
1427 {
1428 preempt_disable();
1429 if (vmx->vt.guest_state_loaded)
1430 *cache = read_msr(msr);
1431 preempt_enable();
1432 return *cache;
1433 }
1434
static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
u64 *cache)
1437 {
1438 preempt_disable();
1439 if (vmx->vt.guest_state_loaded)
1440 wrmsrns(msr, data);
1441 preempt_enable();
1442 *cache = data;
1443 }
1444
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1446 {
1447 return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
1448 &vmx->msr_guest_kernel_gs_base);
1449 }
1450
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1452 {
1453 vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
1454 &vmx->msr_guest_kernel_gs_base);
1455 }
1456 #endif
1457
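/*
 * Dynamically resize the vCPU's PLE window, clamped to ple_window_max, and
 * mark the VMCS field dirty when the value changes; shrink_ple_window()
 * below moves it back toward the base ple_window value.
 */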
static void grow_ple_window(struct kvm_vcpu *vcpu)
1459 {
1460 struct vcpu_vmx *vmx = to_vmx(vcpu);
1461 unsigned int old = vmx->ple_window;
1462
1463 vmx->ple_window = __grow_ple_window(old, ple_window,
1464 ple_window_grow,
1465 ple_window_max);
1466
1467 if (vmx->ple_window != old) {
1468 vmx->ple_window_dirty = true;
1469 trace_kvm_ple_window_update(vcpu->vcpu_id,
1470 vmx->ple_window, old);
1471 }
1472 }
1473
static void shrink_ple_window(struct kvm_vcpu *vcpu)
1475 {
1476 struct vcpu_vmx *vmx = to_vmx(vcpu);
1477 unsigned int old = vmx->ple_window;
1478
1479 vmx->ple_window = __shrink_ple_window(old, ple_window,
1480 ple_window_shrink,
1481 ple_window);
1482
1483 if (vmx->ple_window != old) {
1484 vmx->ple_window_dirty = true;
1485 trace_kvm_ple_window_update(vcpu->vcpu_id,
1486 vmx->ple_window, old);
1487 }
1488 }
1489
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
1491 {
1492 struct vcpu_vmx *vmx = to_vmx(vcpu);
1493 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1494 struct vmcs *prev;
1495
1496 if (!already_loaded) {
1497 loaded_vmcs_clear(vmx->loaded_vmcs);
1498 local_irq_disable();
1499
/*
 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
 * this cpu's percpu list, otherwise it may not yet be deleted
 * from its previous cpu's percpu list. Pairs with the
 * smp_wmb() in __loaded_vmcs_clear().
 */
1506 smp_rmb();
1507
1508 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1509 &per_cpu(loaded_vmcss_on_cpu, cpu));
1510 local_irq_enable();
1511 }
1512
1513 prev = per_cpu(current_vmcs, cpu);
1514 if (prev != vmx->loaded_vmcs->vmcs) {
1515 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1516 vmcs_load(vmx->loaded_vmcs->vmcs);
1517 }
1518
1519 if (!already_loaded) {
1520 void *gdt = get_current_gdt_ro();
1521
1522 /*
1523 * Flush all EPTP/VPID contexts, the new pCPU may have stale
1524 * TLB entries from its previous association with the vCPU.
1525 */
1526 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1527
1528 /*
1529 * Linux uses per-cpu TSS and GDT, so set these when switching
1530 * processors. See 22.2.4.
1531 */
1532 vmcs_writel(HOST_TR_BASE,
1533 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1534 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
1535
1536 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
1537 /* 22.2.3 */
1538 vmcs_writel(HOST_IA32_SYSENTER_ESP,
1539 (unsigned long)(cpu_entry_stack(cpu) + 1));
1540 }
1541
1542 vmx->loaded_vmcs->cpu = cpu;
1543 }
1544 }
1545
1546 /*
1547 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1548 * vcpu mutex is already taken.
1549 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1551 {
1552 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
1553 shrink_ple_window(vcpu);
1554
1555 vmx_vcpu_load_vmcs(vcpu, cpu);
1556
1557 vmx_vcpu_pi_load(vcpu, cpu);
1558 }
1559
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1561 {
1562 vmx_vcpu_pi_put(vcpu);
1563
1564 vmx_prepare_switch_to_host(to_vmx(vcpu));
1565 }
1566
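/*
 * Point the vCPU at a different loaded VMCS (e.g. vmcs01 vs. the nested
 * vmcs02) and make it current on this pCPU.
 */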
static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu,
struct loaded_vmcs *vmcs)
1569 {
1570 struct vcpu_vmx *vmx = to_vmx(vcpu);
1571 int cpu;
1572
1573 cpu = get_cpu();
1574 vmx->loaded_vmcs = vmcs;
1575 vmx_vcpu_load_vmcs(vcpu, cpu);
1576 put_cpu();
1577 }
1578
static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
1580 {
1581 struct vcpu_vmx *vmx = to_vmx(vcpu);
1582
1583 if (!is_guest_mode(vcpu)) {
1584 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
1585 return;
1586 }
1587
1588 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02);
1589 vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01);
1590 }
1591
static void vmx_put_vmcs01(struct kvm_vcpu *vcpu)
1593 {
1594 if (!is_guest_mode(vcpu))
1595 return;
1596
1597 vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02);
1598 }
DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *,
vmx_load_vmcs01(_T), vmx_put_vmcs01(_T))
1601
1602 bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1603 {
1604 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1605 }
1606
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1608 {
1609 struct vcpu_vmx *vmx = to_vmx(vcpu);
1610 unsigned long rflags, save_rflags;
1611
1612 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1613 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1614 rflags = vmcs_readl(GUEST_RFLAGS);
1615 if (vmx->rmode.vm86_active) {
1616 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1617 save_rflags = vmx->rmode.save_rflags;
1618 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1619 }
1620 vmx->rflags = rflags;
1621 }
1622 return vmx->rflags;
1623 }
1624
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1626 {
1627 struct vcpu_vmx *vmx = to_vmx(vcpu);
1628 unsigned long old_rflags;
1629
1630 /*
1631 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1632 * is an unrestricted guest in order to mark L2 as needing emulation
1633 * if L1 runs L2 as a restricted guest.
1634 */
1635 if (is_unrestricted_guest(vcpu)) {
1636 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1637 vmx->rflags = rflags;
1638 vmcs_writel(GUEST_RFLAGS, rflags);
1639 return;
1640 }
1641
1642 old_rflags = vmx_get_rflags(vcpu);
1643 vmx->rflags = rflags;
1644 if (vmx->rmode.vm86_active) {
1645 vmx->rmode.save_rflags = rflags;
1646 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1647 }
1648 vmcs_writel(GUEST_RFLAGS, rflags);
1649
1650 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1651 vmx->vt.emulation_required = vmx_emulation_required(vcpu);
1652 }
1653
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1655 {
1656 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1657 }
1658
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1660 {
1661 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1662 int ret = 0;
1663
1664 if (interruptibility & GUEST_INTR_STATE_STI)
1665 ret |= KVM_X86_SHADOW_INT_STI;
1666 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1667 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1668
1669 return ret;
1670 }
1671
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1673 {
1674 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1675 u32 interruptibility = interruptibility_old;
1676
1677 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1678
1679 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1680 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1681 else if (mask & KVM_X86_SHADOW_INT_STI)
1682 interruptibility |= GUEST_INTR_STATE_STI;
1683
1684 if ((interruptibility != interruptibility_old))
1685 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1686 }
1687
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1689 {
1690 struct vcpu_vmx *vmx = to_vmx(vcpu);
1691 unsigned long value;
1692
/*
 * Any MSR write that attempts to change bits marked reserved will
 * cause a #GP fault.
 */
1697 if (data & vmx->pt_desc.ctl_bitmask)
1698 return 1;
1699
1700 /*
1701 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1702 * result in a #GP unless the same write also clears TraceEn.
1703 */
1704 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1705 (data & RTIT_CTL_TRACEEN) &&
1706 data != vmx->pt_desc.guest.ctl)
1707 return 1;
1708
/*
 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA and
 * FabricEn will cause a #GP if
 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
 */
1714 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1715 !(data & RTIT_CTL_FABRIC_EN) &&
1716 !intel_pt_validate_cap(vmx->pt_desc.caps,
1717 PT_CAP_single_range_output))
1718 return 1;
1719
/*
 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
 * uses an encoding marked reserved will cause a #GP fault.
 */
1724 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1725 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1726 !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1727 RTIT_CTL_MTC_RANGE_OFFSET, &value))
1728 return 1;
1729 value = intel_pt_validate_cap(vmx->pt_desc.caps,
1730 PT_CAP_cycle_thresholds);
1731 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1732 !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1733 RTIT_CTL_CYC_THRESH_OFFSET, &value))
1734 return 1;
1735 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1736 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1737 !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1738 RTIT_CTL_PSB_FREQ_OFFSET, &value))
1739 return 1;
1740
1741 /*
1742 * ADDRx_CFG encodings greater than 2 are reserved, as is any non-zero
1743 * encoding for an unimplemented address range; either causes a #GP fault.
1744 */
1745 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1746 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1747 return 1;
1748 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1749 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1750 return 1;
1751 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1752 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1753 return 1;
1754 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1755 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1756 return 1;
1757
1758 return 0;
1759 }
1760
1761 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1762 void *insn, int insn_len)
1763 {
1764 /*
1765 * Emulation of instructions in SGX enclaves is impossible as RIP does
1766 * not point at the failing instruction, and even if it did, the code
1767 * stream is inaccessible. Inject #UD instead of exiting to userspace
1768 * so that guest userspace can't DoS the guest simply by triggering
1769 * emulation (enclaves are CPL3 only).
1770 */
1771 if (vmx_get_exit_reason(vcpu).enclave_mode) {
1772 kvm_queue_exception(vcpu, UD_VECTOR);
1773 return X86EMUL_PROPAGATE_FAULT;
1774 }
1775
1776 /* Check that emulation is possible during event vectoring */
1777 if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
1778 !kvm_can_emulate_event_vectoring(emul_type))
1779 return X86EMUL_UNHANDLEABLE_VECTORING;
1780
1781 return X86EMUL_CONTINUE;
1782 }
1783
1784 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1785 {
1786 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
1787 unsigned long rip, orig_rip;
1788 u32 instr_len;
1789
1790 /*
1791 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1792 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1793 * set when EPT misconfig occurs. In practice, real hardware updates
1794 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1795 * (namely Hyper-V) don't set it due to it being undefined behavior,
1796 * i.e. we end up advancing IP with some random value.
1797 */
1798 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1799 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1800 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1801
1802 /*
1803 * Emulating an enclave's instructions isn't supported as KVM
1804 * cannot access the enclave's memory or its true RIP, e.g. the
1805 * vmcs.GUEST_RIP points at the exit point of the enclave, not
1806 * the RIP that actually triggered the VM-Exit. But, because
1807 * most instructions that cause VM-Exit will #UD in an enclave,
1808 * most instruction-based VM-Exits simply do not occur.
1809 *
1810 * There are a few exceptions, notably the debug instructions
1811 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1812 * and generate #DB/#BP as expected, which KVM might intercept.
1813 * But again, the CPU does the dirty work and saves an instr
1814 * length of zero so VMMs don't shoot themselves in the foot.
1815 * WARN if KVM tries to skip a non-zero length instruction on
1816 * a VM-Exit from an enclave.
1817 */
1818 if (!instr_len)
1819 goto rip_updated;
1820
1821 WARN_ONCE(exit_reason.enclave_mode,
1822 "skipping instruction after SGX enclave VM-Exit");
1823
1824 orig_rip = kvm_rip_read(vcpu);
1825 rip = orig_rip + instr_len;
1826 #ifdef CONFIG_X86_64
1827 /*
1828 * We need to mask out the high 32 bits of RIP if not in 64-bit
1829 * mode, but just finding out that we are in 64-bit mode is
1830 * quite expensive. Only do it if there was a carry.
1831 */
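/*
 * Example: orig_rip = 0xffffffff with a 2-byte instruction gives
 * rip = 0x100000001, so rip ^ orig_rip = 0x1fffffffe and shifting
 * right by 31 yields 3, i.e. bits 31 and 32 both toggled. Outside
 * 64-bit mode the upper half of RIP is zero, so this pattern only
 * appears when the addition carried out of bit 31.
 */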
1832 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1833 rip = (u32)rip;
1834 #endif
1835 kvm_rip_write(vcpu, rip);
1836 } else {
1837 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1838 return 0;
1839 }
1840
1841 rip_updated:
1842 /* skipping an emulated instruction also counts */
1843 vmx_set_interrupt_shadow(vcpu, 0);
1844
1845 return 1;
1846 }
1847
1848 /*
1849 * Recognizes a pending MTF VM-exit and records the nested state for later
1850 * delivery.
1851 */
1852 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1853 {
1854 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1855 struct vcpu_vmx *vmx = to_vmx(vcpu);
1856
1857 if (!is_guest_mode(vcpu))
1858 return;
1859
1860 /*
1861 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1862 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1863 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1864 * intercepted #DB deliberately avoids single-step #DB and MTF updates
1865 * as ICEBP is higher priority than both. As instruction emulation is
1866 * completed at this point (i.e. KVM is at the instruction boundary),
1867 * any #DB exception pending delivery must be a debug-trap of lower
1868 * priority than MTF. Record the pending MTF state to be delivered in
1869 * vmx_check_nested_events().
1870 */
1871 if (nested_cpu_has_mtf(vmcs12) &&
1872 (!vcpu->arch.exception.pending ||
1873 vcpu->arch.exception.vector == DB_VECTOR) &&
1874 (!vcpu->arch.exception_vmexit.pending ||
1875 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1876 vmx->nested.mtf_pending = true;
1877 kvm_make_request(KVM_REQ_EVENT, vcpu);
1878 } else {
1879 vmx->nested.mtf_pending = false;
1880 }
1881 }
1882
1883 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1884 {
1885 vmx_update_emulated_instruction(vcpu);
1886 return skip_emulated_instruction(vcpu);
1887 }
1888
1889 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1890 {
1891 /*
1892 * Ensure that we clear the HLT state in the VMCS. We don't need to
1893 * explicitly skip the instruction because if the HLT state is set,
1894 * then the instruction is already executing and RIP has already been
1895 * advanced.
1896 */
1897 if (kvm_hlt_in_guest(vcpu->kvm) &&
1898 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1899 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1900 }
1901
1902 void vmx_inject_exception(struct kvm_vcpu *vcpu)
1903 {
1904 struct kvm_queued_exception *ex = &vcpu->arch.exception;
1905 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1906 struct vcpu_vmx *vmx = to_vmx(vcpu);
1907
1908 kvm_deliver_exception_payload(vcpu, ex);
1909
1910 if (ex->has_error_code) {
1911 /*
1912 * Despite the error code being architecturally defined as 32
1913 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1914 * VMX don't actually support setting bits 31:16. Hardware
1915 * will (should) never provide a bogus error code, but AMD CPUs
1916 * do generate error codes with bits 31:16 set, and so KVM's
1917 * ABI lets userspace shove in arbitrary 32-bit values. Drop
1918 * the upper bits to avoid VM-Fail; losing information that
1919 * doesn't really exist is preferable to killing the VM.
1920 */
1921 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1922 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1923 }
1924
1925 if (vmx->rmode.vm86_active) {
1926 int inc_eip = 0;
1927 if (kvm_exception_is_soft(ex->vector))
1928 inc_eip = vcpu->arch.event_exit_inst_len;
1929 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1930 return;
1931 }
1932
1933 WARN_ON_ONCE(vmx->vt.emulation_required);
1934
1935 if (kvm_exception_is_soft(ex->vector)) {
1936 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1937 vmx->vcpu.arch.event_exit_inst_len);
1938 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1939 } else
1940 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1941
1942 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1943
1944 vmx_clear_hlt(vcpu);
1945 }
1946
1947 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1948 bool load_into_hardware)
1949 {
1950 struct vmx_uret_msr *uret_msr;
1951
1952 uret_msr = vmx_find_uret_msr(vmx, msr);
1953 if (!uret_msr)
1954 return;
1955
1956 uret_msr->load_into_hardware = load_into_hardware;
1957 }
1958
1959 /*
1960 * Configure the user return MSRs to automatically save, load, and restore MSRs
1961 * that need to be shoved into hardware when running the guest. Note, omitting
1962 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1963 * loaded into hardware when running the guest.
1964 */
1965 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1966 {
1967 #ifdef CONFIG_X86_64
1968 bool load_syscall_msrs;
1969
1970 /*
1971 * The SYSCALL MSRs are only needed on long mode guests, and only
1972 * when EFER.SCE is set.
1973 */
1974 load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1975 (vmx->vcpu.arch.efer & EFER_SCE);
1976
1977 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1978 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1979 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1980 #endif
1981 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1982
1983 vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1984 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1985 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
1986
1987 /*
1988 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1989 * kernel and old userspace. If those guests run on a tsx=off host, do
1990 * allow guests to use TSX_CTRL, but don't change the value in hardware
1991 * so that TSX remains always disabled.
1992 */
1993 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1994
1995 /*
1996 * The set of MSRs to load may have changed, reload MSRs before the
1997 * next VM-Enter.
1998 */
1999 vmx->guest_uret_msrs_loaded = false;
2000 }
2001
2002 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
2003 {
2004 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2005
2006 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
2007 return vmcs12->tsc_offset;
2008
2009 return 0;
2010 }
2011
2012 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
2013 {
2014 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2015
2016 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
2017 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
2018 return vmcs12->tsc_multiplier;
2019
2020 return kvm_caps.default_tsc_scaling_ratio;
2021 }
2022
2023 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
2024 {
2025 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2026 }
2027
2028 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
2029 {
2030 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2031 }
2032
2033 /*
2034 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
2035 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
2036 * backwards compatibility even though KVM doesn't support emulating SMX. And
2037 * because userspace can set "VMX in SMX", the guest must also be allowed to set it,
2038 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
2039 */
2040 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
2041 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
2042 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
2043 FEAT_CTL_SGX_LC_ENABLED | \
2044 FEAT_CTL_SGX_ENABLED | \
2045 FEAT_CTL_LMCE_ENABLED)
2046
2047 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
2048 struct msr_data *msr)
2049 {
2050 uint64_t valid_bits;
2051
2052 /*
2053 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
2054 * exposed to the guest.
2055 */
2056 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
2057 ~KVM_SUPPORTED_FEATURE_CONTROL);
2058
2059 if (!msr->host_initiated &&
2060 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
2061 return false;
2062
2063 if (msr->host_initiated)
2064 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
2065 else
2066 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
2067
2068 return !(msr->data & ~valid_bits);
2069 }
2070
2071 int vmx_get_feature_msr(u32 msr, u64 *data)
2072 {
2073 switch (msr) {
2074 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2075 if (!nested)
2076 return 1;
2077 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
2078 default:
2079 return KVM_MSR_RET_UNSUPPORTED;
2080 }
2081 }
2082
2083 /*
2084 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
2085 * Returns 0 on success, non-0 otherwise.
2086 * Assumes vcpu_load() was already called.
2087 */
2088 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2089 {
2090 struct vcpu_vmx *vmx = to_vmx(vcpu);
2091 struct vmx_uret_msr *msr;
2092 u32 index;
2093
2094 switch (msr_info->index) {
2095 #ifdef CONFIG_X86_64
2096 case MSR_FS_BASE:
2097 msr_info->data = vmcs_readl(GUEST_FS_BASE);
2098 break;
2099 case MSR_GS_BASE:
2100 msr_info->data = vmcs_readl(GUEST_GS_BASE);
2101 break;
2102 case MSR_KERNEL_GS_BASE:
2103 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
2104 break;
2105 #endif
2106 case MSR_EFER:
2107 return kvm_get_msr_common(vcpu, msr_info);
2108 case MSR_IA32_TSX_CTRL:
2109 if (!msr_info->host_initiated &&
2110 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2111 return 1;
2112 goto find_uret_msr;
2113 case MSR_IA32_UMWAIT_CONTROL:
2114 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2115 return 1;
2116
2117 msr_info->data = vmx->msr_ia32_umwait_control;
2118 break;
2119 case MSR_IA32_SPEC_CTRL:
2120 if (!msr_info->host_initiated &&
2121 !guest_has_spec_ctrl_msr(vcpu))
2122 return 1;
2123
2124 msr_info->data = to_vmx(vcpu)->spec_ctrl;
2125 break;
2126 case MSR_IA32_SYSENTER_CS:
2127 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2128 break;
2129 case MSR_IA32_SYSENTER_EIP:
2130 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2131 break;
2132 case MSR_IA32_SYSENTER_ESP:
2133 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2134 break;
2135 case MSR_IA32_BNDCFGS:
2136 if (!kvm_mpx_supported() ||
2137 (!msr_info->host_initiated &&
2138 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
2139 return 1;
2140 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2141 break;
2142 case MSR_IA32_MCG_EXT_CTL:
2143 if (!msr_info->host_initiated &&
2144 !(vmx->msr_ia32_feature_control &
2145 FEAT_CTL_LMCE_ENABLED))
2146 return 1;
2147 msr_info->data = vcpu->arch.mcg_ext_ctl;
2148 break;
2149 case MSR_IA32_FEAT_CTL:
2150 msr_info->data = vmx->msr_ia32_feature_control;
2151 break;
2152 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2153 if (!msr_info->host_initiated &&
2154 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
2155 return 1;
2156 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2157 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2158 break;
2159 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2160 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
2161 return 1;
2162 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2163 &msr_info->data))
2164 return 1;
2165 #ifdef CONFIG_KVM_HYPERV
2166 /*
2167 * Enlightened VMCS v1 doesn't have certain VMCS fields but
2168 * instead of just ignoring the features, different Hyper-V
2169 * versions are either trying to use them and fail or do some
2170 * sanity checking and refuse to boot. Filter all unsupported
2171 * features out.
2172 */
2173 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
2174 nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2175 &msr_info->data);
2176 #endif
2177 break;
2178 case MSR_IA32_RTIT_CTL:
2179 if (!vmx_pt_mode_is_host_guest())
2180 return 1;
2181 msr_info->data = vmx->pt_desc.guest.ctl;
2182 break;
2183 case MSR_IA32_RTIT_STATUS:
2184 if (!vmx_pt_mode_is_host_guest())
2185 return 1;
2186 msr_info->data = vmx->pt_desc.guest.status;
2187 break;
2188 case MSR_IA32_RTIT_CR3_MATCH:
2189 if (!vmx_pt_mode_is_host_guest() ||
2190 !intel_pt_validate_cap(vmx->pt_desc.caps,
2191 PT_CAP_cr3_filtering))
2192 return 1;
2193 msr_info->data = vmx->pt_desc.guest.cr3_match;
2194 break;
2195 case MSR_IA32_RTIT_OUTPUT_BASE:
2196 if (!vmx_pt_mode_is_host_guest() ||
2197 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2198 PT_CAP_topa_output) &&
2199 !intel_pt_validate_cap(vmx->pt_desc.caps,
2200 PT_CAP_single_range_output)))
2201 return 1;
2202 msr_info->data = vmx->pt_desc.guest.output_base;
2203 break;
2204 case MSR_IA32_RTIT_OUTPUT_MASK:
2205 if (!vmx_pt_mode_is_host_guest() ||
2206 (!intel_pt_validate_cap(vmx->pt_desc.caps,
2207 PT_CAP_topa_output) &&
2208 !intel_pt_validate_cap(vmx->pt_desc.caps,
2209 PT_CAP_single_range_output)))
2210 return 1;
2211 msr_info->data = vmx->pt_desc.guest.output_mask;
2212 break;
2213 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2214 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2215 if (!vmx_pt_mode_is_host_guest() ||
2216 (index >= 2 * vmx->pt_desc.num_address_ranges))
2217 return 1;
2218 if (index % 2)
2219 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2220 else
2221 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2222 break;
2223 case MSR_IA32_S_CET:
2224 msr_info->data = vmcs_readl(GUEST_S_CET);
2225 break;
2226 case MSR_KVM_INTERNAL_GUEST_SSP:
2227 msr_info->data = vmcs_readl(GUEST_SSP);
2228 break;
2229 case MSR_IA32_INT_SSP_TAB:
2230 msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
2231 break;
2232 case MSR_IA32_DEBUGCTLMSR:
2233 msr_info->data = vmx_guest_debugctl_read();
2234 break;
2235 default:
2236 find_uret_msr:
2237 msr = vmx_find_uret_msr(vmx, msr_info->index);
2238 if (msr) {
2239 msr_info->data = msr->data;
2240 break;
2241 }
2242 return kvm_get_msr_common(vcpu, msr_info);
2243 }
2244
2245 return 0;
2246 }
2247
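/*
 * SYSENTER_EIP/ESP hold linear addresses; on a vCPU that cannot enter
 * 64-bit mode only the lower 32 bits are architecturally meaningful, so
 * drop the upper bits before the value is stashed in vmcs12 and the VMCS.
 */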
2248 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2249 u64 data)
2250 {
2251 #ifdef CONFIG_X86_64
2252 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
2253 return (u32)data;
2254 #endif
2255 return (unsigned long)data;
2256 }
2257
2258 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2259 {
2260 u64 debugctl = 0;
2261
2262 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2263 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2264 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2265
2266 if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
2267 (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2268 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2269
2270 if (boot_cpu_has(X86_FEATURE_RTM) &&
2271 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
2272 debugctl |= DEBUGCTLMSR_RTM_DEBUG;
2273
2274 return debugctl;
2275 }
2276
2277 bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
2278 {
2279 u64 invalid;
2280
2281 invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
2282 if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
2283 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
2284 invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
2285 }
2286 return !invalid;
2287 }
2288
2289 /*
2290 * Writes msr value into the appropriate "register".
2291 * Returns 0 on success, non-0 otherwise.
2292 * Assumes vcpu_load() was already called.
2293 */
2294 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2295 {
2296 struct vcpu_vmx *vmx = to_vmx(vcpu);
2297 struct vmx_uret_msr *msr;
2298 int ret = 0;
2299 u32 msr_index = msr_info->index;
2300 u64 data = msr_info->data;
2301 u32 index;
2302
2303 switch (msr_index) {
2304 case MSR_EFER:
2305 ret = kvm_set_msr_common(vcpu, msr_info);
2306 break;
2307 #ifdef CONFIG_X86_64
2308 case MSR_FS_BASE:
2309 vmx_segment_cache_clear(vmx);
2310 vmcs_writel(GUEST_FS_BASE, data);
2311 break;
2312 case MSR_GS_BASE:
2313 vmx_segment_cache_clear(vmx);
2314 vmcs_writel(GUEST_GS_BASE, data);
2315 break;
2316 case MSR_KERNEL_GS_BASE:
2317 vmx_write_guest_kernel_gs_base(vmx, data);
2318 break;
2319 case MSR_IA32_XFD:
2320 ret = kvm_set_msr_common(vcpu, msr_info);
2321 /*
2322 * Always intercepting WRMSR could incur non-negligible
2323 * overhead given xfd might be changed frequently in
2324 * guest context switch. Disable write interception
2325 * upon the first write with a non-zero value (indicating
2326 * potential usage on dynamic xfeatures). Also update
2327 * exception bitmap to trap #NM for proper virtualization
2328 * of guest xfd_err.
2329 */
2330 if (!ret && data) {
2331 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2332 MSR_TYPE_RW);
2333 vcpu->arch.xfd_no_write_intercept = true;
2334 vmx_update_exception_bitmap(vcpu);
2335 }
2336 break;
2337 #endif
2338 case MSR_IA32_SYSENTER_CS:
2339 if (is_guest_mode(vcpu))
2340 get_vmcs12(vcpu)->guest_sysenter_cs = data;
2341 vmcs_write32(GUEST_SYSENTER_CS, data);
2342 break;
2343 case MSR_IA32_SYSENTER_EIP:
2344 if (is_guest_mode(vcpu)) {
2345 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2346 get_vmcs12(vcpu)->guest_sysenter_eip = data;
2347 }
2348 vmcs_writel(GUEST_SYSENTER_EIP, data);
2349 break;
2350 case MSR_IA32_SYSENTER_ESP:
2351 if (is_guest_mode(vcpu)) {
2352 data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2353 get_vmcs12(vcpu)->guest_sysenter_esp = data;
2354 }
2355 vmcs_writel(GUEST_SYSENTER_ESP, data);
2356 break;
2357 case MSR_IA32_DEBUGCTLMSR:
2358 if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
2359 return 1;
2360
2361 data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2362
2363 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2364 VM_EXIT_SAVE_DEBUG_CONTROLS)
2365 get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2366
2367 vmx_guest_debugctl_write(vcpu, data);
2368
2369 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2370 (data & DEBUGCTLMSR_LBR))
2371 intel_pmu_create_guest_lbr_event(vcpu);
2372 return 0;
2373 case MSR_IA32_BNDCFGS:
2374 if (!kvm_mpx_supported() ||
2375 (!msr_info->host_initiated &&
2376 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
2377 return 1;
2378 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
2379 (data & MSR_IA32_BNDCFGS_RSVD))
2380 return 1;
2381
2382 if (is_guest_mode(vcpu) &&
2383 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2384 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2385 get_vmcs12(vcpu)->guest_bndcfgs = data;
2386
2387 vmcs_write64(GUEST_BNDCFGS, data);
2388 break;
2389 case MSR_IA32_UMWAIT_CONTROL:
2390 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2391 return 1;
2392
2393 /* Reserved bit 1 and the upper 32 bits [63:32] must be zero. */
2394 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2395 return 1;
2396
2397 vmx->msr_ia32_umwait_control = data;
2398 break;
2399 case MSR_IA32_SPEC_CTRL:
2400 if (!msr_info->host_initiated &&
2401 !guest_has_spec_ctrl_msr(vcpu))
2402 return 1;
2403
2404 if (kvm_spec_ctrl_test_value(data))
2405 return 1;
2406
2407 vmx->spec_ctrl = data;
2408 if (!data)
2409 break;
2410
2411 /*
2412 * For non-nested:
2413 * When it's written (to non-zero) for the first time, pass
2414 * it through.
2415 *
2416 * For nested:
2417 * The handling of the MSR bitmap for L2 guests is done in
2418 * nested_vmx_prepare_msr_bitmap. We should not touch the
2419 * vmcs02.msr_bitmap here since it gets completely overwritten
2420 * in the merging. We update the vmcs01 here for L1 as well
2421 * since it will end up touching the MSR anyway now.
2422 */
2423 vmx_disable_intercept_for_msr(vcpu,
2424 MSR_IA32_SPEC_CTRL,
2425 MSR_TYPE_RW);
2426 break;
2427 case MSR_IA32_TSX_CTRL:
2428 if (!msr_info->host_initiated &&
2429 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2430 return 1;
2431 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2432 return 1;
2433 goto find_uret_msr;
2434 case MSR_IA32_CR_PAT:
2435 ret = kvm_set_msr_common(vcpu, msr_info);
2436 if (ret)
2437 break;
2438
2439 if (is_guest_mode(vcpu) &&
2440 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2441 get_vmcs12(vcpu)->guest_ia32_pat = data;
2442
2443 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
2444 vmcs_write64(GUEST_IA32_PAT, data);
2445 break;
2446 case MSR_IA32_MCG_EXT_CTL:
2447 if ((!msr_info->host_initiated &&
2448 !(to_vmx(vcpu)->msr_ia32_feature_control &
2449 FEAT_CTL_LMCE_ENABLED)) ||
2450 (data & ~MCG_EXT_CTL_LMCE_EN))
2451 return 1;
2452 vcpu->arch.mcg_ext_ctl = data;
2453 break;
2454 case MSR_IA32_FEAT_CTL:
2455 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2456 return 1;
2457
2458 vmx->msr_ia32_feature_control = data;
2459 if (msr_info->host_initiated && data == 0)
2460 vmx_leave_nested(vcpu);
2461
2462 /* SGX may be enabled/disabled by guest's firmware */
2463 vmx_write_encls_bitmap(vcpu, NULL);
2464 break;
2465 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2466 /*
2467 * On real hardware, the LE hash MSRs are writable before
2468 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2469 * at which point SGX related bits in IA32_FEATURE_CONTROL
2470 * become writable.
2471 *
2472 * KVM does not emulate SGX activation for simplicity, so
2473 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2474 * is unlocked. This is technically not architectural
2475 * behavior, but it's close enough.
2476 */
2477 if (!msr_info->host_initiated &&
2478 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
2479 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2480 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2481 return 1;
2482 vmx->msr_ia32_sgxlepubkeyhash
2483 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2484 break;
2485 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2486 if (!msr_info->host_initiated)
2487 return 1; /* they are read-only */
2488 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
2489 return 1;
2490 return vmx_set_vmx_msr(vcpu, msr_index, data);
2491 case MSR_IA32_RTIT_CTL:
2492 if (!vmx_pt_mode_is_host_guest() ||
2493 vmx_rtit_ctl_check(vcpu, data) ||
2494 vmx->nested.vmxon)
2495 return 1;
2496 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2497 vmx->pt_desc.guest.ctl = data;
2498 pt_update_intercept_for_msr(vcpu);
2499 break;
2500 case MSR_IA32_RTIT_STATUS:
2501 if (!pt_can_write_msr(vmx))
2502 return 1;
2503 if (data & MSR_IA32_RTIT_STATUS_MASK)
2504 return 1;
2505 vmx->pt_desc.guest.status = data;
2506 break;
2507 case MSR_IA32_RTIT_CR3_MATCH:
2508 if (!pt_can_write_msr(vmx))
2509 return 1;
2510 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2511 PT_CAP_cr3_filtering))
2512 return 1;
2513 vmx->pt_desc.guest.cr3_match = data;
2514 break;
2515 case MSR_IA32_RTIT_OUTPUT_BASE:
2516 if (!pt_can_write_msr(vmx))
2517 return 1;
2518 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2519 PT_CAP_topa_output) &&
2520 !intel_pt_validate_cap(vmx->pt_desc.caps,
2521 PT_CAP_single_range_output))
2522 return 1;
2523 if (!pt_output_base_valid(vcpu, data))
2524 return 1;
2525 vmx->pt_desc.guest.output_base = data;
2526 break;
2527 case MSR_IA32_RTIT_OUTPUT_MASK:
2528 if (!pt_can_write_msr(vmx))
2529 return 1;
2530 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2531 PT_CAP_topa_output) &&
2532 !intel_pt_validate_cap(vmx->pt_desc.caps,
2533 PT_CAP_single_range_output))
2534 return 1;
2535 vmx->pt_desc.guest.output_mask = data;
2536 break;
2537 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2538 if (!pt_can_write_msr(vmx))
2539 return 1;
2540 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2541 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2542 return 1;
2543 if (is_noncanonical_msr_address(data, vcpu))
2544 return 1;
2545 if (index % 2)
2546 vmx->pt_desc.guest.addr_b[index / 2] = data;
2547 else
2548 vmx->pt_desc.guest.addr_a[index / 2] = data;
2549 break;
2550 case MSR_IA32_S_CET:
2551 vmcs_writel(GUEST_S_CET, data);
2552 break;
2553 case MSR_KVM_INTERNAL_GUEST_SSP:
2554 vmcs_writel(GUEST_SSP, data);
2555 break;
2556 case MSR_IA32_INT_SSP_TAB:
2557 vmcs_writel(GUEST_INTR_SSP_TABLE, data);
2558 break;
2559 case MSR_IA32_PERF_CAPABILITIES:
2560 if (data & PERF_CAP_LBR_FMT) {
2561 if ((data & PERF_CAP_LBR_FMT) !=
2562 (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
2563 return 1;
2564 if (!cpuid_model_is_consistent(vcpu))
2565 return 1;
2566 }
2567 if (data & PERF_CAP_PEBS_FORMAT) {
2568 if ((data & PERF_CAP_PEBS_MASK) !=
2569 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2570 return 1;
2571 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
2572 return 1;
2573 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
2574 return 1;
2575 if (!cpuid_model_is_consistent(vcpu))
2576 return 1;
2577 }
2578 ret = kvm_set_msr_common(vcpu, msr_info);
2579 break;
2580
2581 default:
2582 find_uret_msr:
2583 msr = vmx_find_uret_msr(vmx, msr_index);
2584 if (msr)
2585 ret = vmx_set_guest_uret_msr(vmx, msr, data);
2586 else
2587 ret = kvm_set_msr_common(vcpu, msr_info);
2588 }
2589
2590 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2591 if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2592 vmx_update_fb_clear_dis(vcpu, vmx);
2593
2594 return ret;
2595 }
2596
2597 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2598 {
2599 unsigned long guest_owned_bits;
2600
2601 kvm_register_mark_available(vcpu, reg);
2602
2603 switch (reg) {
2604 case VCPU_REGS_RSP:
2605 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2606 break;
2607 case VCPU_REGS_RIP:
2608 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2609 break;
2610 case VCPU_EXREG_PDPTR:
2611 if (enable_ept)
2612 ept_save_pdptrs(vcpu);
2613 break;
2614 case VCPU_EXREG_CR0:
2615 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2616
2617 vcpu->arch.cr0 &= ~guest_owned_bits;
2618 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2619 break;
2620 case VCPU_EXREG_CR3:
2621 /*
2622 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2623 * CR3 is loaded into hardware, not the guest's CR3.
2624 */
2625 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2626 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2627 break;
2628 case VCPU_EXREG_CR4:
2629 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2630
2631 vcpu->arch.cr4 &= ~guest_owned_bits;
2632 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2633 break;
2634 default:
2635 KVM_BUG_ON(1, vcpu->kvm);
2636 break;
2637 }
2638 }
2639
2640 /*
2641 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2642 * directly instead of going through cpu_has(), to ensure KVM is trapping
2643 * ENCLS whenever it's supported in hardware. It does not matter whether
2644 * the host OS supports or has enabled SGX.
2645 */
2646 static bool cpu_has_sgx(void)
2647 {
2648 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2649 }
2650
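/*
 * Each VMX capability MSR reports allowed-0 settings in its low 32 bits
 * (a set bit there means the control must be 1) and allowed-1 settings in
 * its high 32 bits (a clear bit there means the control must be 0).
 * Example: with ctl_min = CPU_BASED_HLT_EXITING and ctl_opt =
 * CPU_BASED_USE_TSC_OFFSETTING, a CPU that permits both yields both bits
 * in *result, while a CPU whose allowed-1 word clears HLT exiting drops a
 * required bit and the ctl_min & ~ctl check below fails with -EIO.
 */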
2651 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2652 {
2653 u32 vmx_msr_low, vmx_msr_high;
2654 u32 ctl = ctl_min | ctl_opt;
2655
2656 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2657
2658 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2659 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2660
2661 /* Ensure minimum (required) set of control bits are supported. */
2662 if (ctl_min & ~ctl)
2663 return -EIO;
2664
2665 *result = ctl;
2666 return 0;
2667 }
2668
2669 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2670 {
2671 u64 allowed;
2672
2673 rdmsrq(msr, allowed);
2674
2675 return ctl_opt & allowed;
2676 }
2677
2678 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
2679 ({ \
2680 int i, r = 0; \
2681 \
2682 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
2683 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
2684 \
2685 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
2686 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
2687 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
2688 \
2689 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
2690 continue; \
2691 \
2692 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
2693 "entry = %llx (%llx), exit = %llx (%llx)\n", \
2694 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
2695 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
2696 \
2697 if (error_on_inconsistent_vmcs_config) \
2698 r = -EIO; \
2699 \
2700 entry_controls &= ~n_ctrl; \
2701 exit_controls &= ~x_ctrl; \
2702 } \
2703 r; \
2704 })
2705
2706 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2707 struct vmx_capability *vmx_cap)
2708 {
2709 u32 _pin_based_exec_control = 0;
2710 u32 _cpu_based_exec_control = 0;
2711 u32 _cpu_based_2nd_exec_control = 0;
2712 u64 _cpu_based_3rd_exec_control = 0;
2713 u32 _vmexit_control = 0;
2714 u32 _vmentry_control = 0;
2715 u64 basic_msr;
2716 u64 misc_msr;
2717
2718 /*
2719 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2720 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2721 * intercepts writes to PAT and EFER, i.e. never enables those controls.
2722 */
2723 struct {
2724 u32 entry_control;
2725 u32 exit_control;
2726 } const vmcs_entry_exit_pairs[] = {
2727 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2728 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2729 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2730 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2731 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2732 { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
2733 };
2734
2735 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2736
2737 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2738 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2739 MSR_IA32_VMX_PROCBASED_CTLS,
2740 &_cpu_based_exec_control))
2741 return -EIO;
2742 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2743 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2744 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2745 MSR_IA32_VMX_PROCBASED_CTLS2,
2746 &_cpu_based_2nd_exec_control))
2747 return -EIO;
2748 }
2749 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
2750 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2751
2752 #ifndef CONFIG_X86_64
2753 if (!(_cpu_based_2nd_exec_control &
2754 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2755 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2756 #endif
2757
2758 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2759 _cpu_based_2nd_exec_control &= ~(
2760 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2761 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2762 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2763
2764 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2765 &vmx_cap->ept, &vmx_cap->vpid);
2766
2767 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2768 vmx_cap->ept) {
2769 pr_warn_once("EPT CAP should not exist if not support "
2770 "1-setting enable EPT VM-execution control\n");
2771
2772 if (error_on_inconsistent_vmcs_config)
2773 return -EIO;
2774
2775 vmx_cap->ept = 0;
2776 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2777 }
2778 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2779 vmx_cap->vpid) {
2780 pr_warn_once("VPID CAP should not exist if not support "
2781 "1-setting enable VPID VM-execution control\n");
2782
2783 if (error_on_inconsistent_vmcs_config)
2784 return -EIO;
2785
2786 vmx_cap->vpid = 0;
2787 }
2788
2789 if (!cpu_has_sgx())
2790 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2791
2792 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2793 _cpu_based_3rd_exec_control =
2794 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2795 MSR_IA32_VMX_PROCBASED_CTLS3);
2796
2797 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2798 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2799 MSR_IA32_VMX_EXIT_CTLS,
2800 &_vmexit_control))
2801 return -EIO;
2802
2803 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2804 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2805 MSR_IA32_VMX_PINBASED_CTLS,
2806 &_pin_based_exec_control))
2807 return -EIO;
2808
2809 if (cpu_has_broken_vmx_preemption_timer())
2810 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2811 if (!(_cpu_based_2nd_exec_control &
2812 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2813 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2814
2815 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2816 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2817 MSR_IA32_VMX_ENTRY_CTLS,
2818 &_vmentry_control))
2819 return -EIO;
2820
2821 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
2822 _vmentry_control, _vmexit_control))
2823 return -EIO;
2824
2825 /*
2826 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2827 * can't be used due to an errata where VM Exit may incorrectly clear
2828 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
2829 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2830 */
2831 switch (boot_cpu_data.x86_vfm) {
2832 case INTEL_NEHALEM_EP: /* AAK155 */
2833 case INTEL_NEHALEM: /* AAP115 */
2834 case INTEL_WESTMERE: /* AAT100 */
2835 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
2836 case INTEL_NEHALEM_EX: /* BA97 */
2837 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2838 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2839 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2840 "does not work properly. Using workaround\n");
2841 break;
2842 default:
2843 break;
2844 }
2845
2846 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
2847
2848 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2849 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
2850 return -EIO;
2851
2852 #ifdef CONFIG_X86_64
2853 /*
2854 * KVM expects to be able to shove all legal physical addresses into
2855 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
2856 * 0 for processors that support Intel 64 architecture".
2857 */
2858 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
2859 return -EIO;
2860 #endif
2861
2862 /* Require Write-Back (WB) memory type for VMCS accesses. */
2863 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
2864 return -EIO;
2865
2866 rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
2867
2868 vmcs_conf->basic = basic_msr;
2869 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2870 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2871 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2872 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2873 vmcs_conf->vmexit_ctrl = _vmexit_control;
2874 vmcs_conf->vmentry_ctrl = _vmentry_control;
2875 vmcs_conf->misc = misc_msr;
2876
2877 #if IS_ENABLED(CONFIG_HYPERV)
2878 if (enlightened_vmcs)
2879 evmcs_sanitize_exec_ctrls(vmcs_conf);
2880 #endif
2881
2882 return 0;
2883 }
2884
2885 static bool __kvm_is_vmx_supported(void)
2886 {
2887 int cpu = smp_processor_id();
2888
2889 if (!(cpuid_ecx(1) & feature_bit(VMX))) {
2890 pr_err("VMX not supported by CPU %d\n", cpu);
2891 return false;
2892 }
2893
2894 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL)) {
2895 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2896 return false;
2897 }
2898
2899 if (!this_cpu_has(X86_FEATURE_VMX)) {
2900 pr_err("VMX not fully enabled on CPU %d. Check kernel logs and/or BIOS\n", cpu);
2901 return false;
2902 }
2903
2904 return true;
2905 }
2906
2907 static bool kvm_is_vmx_supported(void)
2908 {
2909 bool supported;
2910
2911 migrate_disable();
2912 supported = __kvm_is_vmx_supported();
2913 migrate_enable();
2914
2915 return supported;
2916 }
2917
2918 int vmx_check_processor_compat(void)
2919 {
2920 int cpu = raw_smp_processor_id();
2921 struct vmcs_config vmcs_conf;
2922 struct vmx_capability vmx_cap;
2923
2924 if (!__kvm_is_vmx_supported())
2925 return -EIO;
2926
2927 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2928 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2929 return -EIO;
2930 }
2931 if (nested)
2932 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2933
2934 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2935 u32 *gold = (void *)&vmcs_config;
2936 u32 *mine = (void *)&vmcs_conf;
2937 int i;
2938
2939 BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32));
2940
2941 pr_err("VMCS config on CPU %d doesn't match reference config:", cpu);
2942 for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) {
2943 if (gold[i] == mine[i])
2944 continue;
2945
2946 pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x",
2947 i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]);
2948 }
2949 pr_cont("\n");
2950 return -EIO;
2951 }
2952 return 0;
2953 }
2954
2955 int vmx_enable_virtualization_cpu(void)
2956 {
2957 int cpu = raw_smp_processor_id();
2958
2959 /*
2960 * This can happen if we hot-added a CPU but failed to allocate
2961 * VP assist page for it.
2962 */
2963 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
2964 return -EFAULT;
2965
2966 return x86_virt_get_ref(X86_FEATURE_VMX);
2967 }
2968
2969 static void vmclear_local_loaded_vmcss(void)
2970 {
2971 int cpu = raw_smp_processor_id();
2972 struct loaded_vmcs *v, *n;
2973
2974 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2975 loaded_vmcss_on_cpu_link)
2976 __loaded_vmcs_clear(v);
2977 }
2978
2979 void vmx_disable_virtualization_cpu(void)
2980 {
2981 vmclear_local_loaded_vmcss();
2982
2983 x86_virt_put_ref(X86_FEATURE_VMX);
2984
2985 hv_reset_evmcs();
2986 }
2987
2988 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2989 {
2990 int node = cpu_to_node(cpu);
2991 struct page *pages;
2992 struct vmcs *vmcs;
2993
2994 pages = __alloc_pages_node(node, flags, 0);
2995 if (!pages)
2996 return NULL;
2997 vmcs = page_address(pages);
2998 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
2999
3000 /* KVM supports Enlightened VMCS v1 only */
3001 if (kvm_is_using_evmcs())
3002 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
3003 else
3004 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
3005
3006 if (shadow)
3007 vmcs->hdr.shadow_vmcs = 1;
3008 return vmcs;
3009 }
3010
3011 void free_vmcs(struct vmcs *vmcs)
3012 {
3013 free_page((unsigned long)vmcs);
3014 }
3015
3016 /*
3017 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3018 */
3019 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3020 {
3021 if (!loaded_vmcs->vmcs)
3022 return;
3023 loaded_vmcs_clear(loaded_vmcs);
3024 free_vmcs(loaded_vmcs->vmcs);
3025 loaded_vmcs->vmcs = NULL;
3026 if (loaded_vmcs->msr_bitmap)
3027 free_page((unsigned long)loaded_vmcs->msr_bitmap);
3028 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3029 }
3030
3031 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3032 {
3033 loaded_vmcs->vmcs = alloc_vmcs(false);
3034 if (!loaded_vmcs->vmcs)
3035 return -ENOMEM;
3036
3037 vmcs_clear(loaded_vmcs->vmcs);
3038
3039 loaded_vmcs->shadow_vmcs = NULL;
3040 loaded_vmcs->hv_timer_soft_disabled = false;
3041 loaded_vmcs->cpu = -1;
3042 loaded_vmcs->launched = 0;
3043
3044 if (cpu_has_vmx_msr_bitmap()) {
3045 loaded_vmcs->msr_bitmap = (unsigned long *)
3046 __get_free_page(GFP_KERNEL_ACCOUNT);
3047 if (!loaded_vmcs->msr_bitmap)
3048 goto out_vmcs;
3049 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3050 }
3051
3052 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3053 memset(&loaded_vmcs->controls_shadow, 0,
3054 sizeof(struct vmcs_controls_shadow));
3055
3056 return 0;
3057
3058 out_vmcs:
3059 free_loaded_vmcs(loaded_vmcs);
3060 return -ENOMEM;
3061 }
3062
3063 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3064 struct kvm_segment *save)
3065 {
3066 if (!emulate_invalid_guest_state) {
3067 /*
3068 * CS and SS RPL should be equal during guest entry according
3069 * to VMX spec, but in reality it is not always so. Since vcpu
3070 * is in the middle of the transition from real mode to
3071 * protected mode it is safe to assume that RPL 0 is a good
3072 * default value.
3073 */
3074 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3075 save->selector &= ~SEGMENT_RPL_MASK;
3076 save->dpl = save->selector & SEGMENT_RPL_MASK;
3077 save->s = 1;
3078 }
3079 __vmx_set_segment(vcpu, save, seg);
3080 }
3081
3082 static void enter_pmode(struct kvm_vcpu *vcpu)
3083 {
3084 unsigned long flags;
3085 struct vcpu_vmx *vmx = to_vmx(vcpu);
3086
3087 /*
3088 * Update the real mode segment cache. It may not be up-to-date if a
3089 * segment register was written while the vCPU was in guest mode.
3090 */
3091 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3092 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3093 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3094 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3095 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3096 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3097
3098 vmx->rmode.vm86_active = 0;
3099
3100 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3101
3102 flags = vmcs_readl(GUEST_RFLAGS);
3103 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3104 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3105 vmcs_writel(GUEST_RFLAGS, flags);
3106
3107 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3108 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3109
3110 vmx_update_exception_bitmap(vcpu);
3111
3112 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3113 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3114 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3115 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3116 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3117 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3118 }
3119
3120 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3121 {
3122 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3123 struct kvm_segment var = *save;
3124
3125 var.dpl = 0x3;
3126 if (seg == VCPU_SREG_CS)
3127 var.type = 0x3;
3128
3129 if (!emulate_invalid_guest_state) {
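/*
 * Derive a virtual-8086 style segment from the protected-mode state:
 * in real mode the linear base is selector * 16, so recover the
 * selector as base >> 4 and snap the base to a paragraph boundary,
 * e.g. base 0x9e010 yields selector 0x9e01 with a 64KiB limit (0xffff).
 */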
3130 var.selector = var.base >> 4;
3131 var.base = var.base & 0xffff0;
3132 var.limit = 0xffff;
3133 var.g = 0;
3134 var.db = 0;
3135 var.present = 1;
3136 var.s = 1;
3137 var.l = 0;
3138 var.unusable = 0;
3139 var.type = 0x3;
3140 var.avl = 0;
3141 if (save->base & 0xf)
3142 pr_warn_once("segment base is not paragraph aligned "
3143 "when entering protected mode (seg=%d)", seg);
3144 }
3145
3146 vmcs_write16(sf->selector, var.selector);
3147 vmcs_writel(sf->base, var.base);
3148 vmcs_write32(sf->limit, var.limit);
3149 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3150 }
3151
3152 static void enter_rmode(struct kvm_vcpu *vcpu)
3153 {
3154 unsigned long flags;
3155 struct vcpu_vmx *vmx = to_vmx(vcpu);
3156 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3157
3158 /*
3159 * KVM should never use VM86 to virtualize Real Mode when L2 is active,
3160 * as using VM86 is unnecessary if unrestricted guest is enabled, and
3161 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
3162 * should VM-Fail and KVM should reject userspace attempts to stuff
3163 * CR0.PG=0 when L2 is active.
3164 */
3165 WARN_ON_ONCE(is_guest_mode(vcpu));
3166
3167 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3168 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3169 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3170 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3171 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3172 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3173 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3174
3175 vmx->rmode.vm86_active = 1;
3176
3177 vmx_segment_cache_clear(vmx);
3178
3179 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3180 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3181 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3182
3183 flags = vmcs_readl(GUEST_RFLAGS);
3184 vmx->rmode.save_rflags = flags;
3185
3186 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3187
3188 vmcs_writel(GUEST_RFLAGS, flags);
3189 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3190 vmx_update_exception_bitmap(vcpu);
3191
3192 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3193 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3194 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3195 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3196 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3197 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3198 }
3199
3200 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3201 {
3202 struct vcpu_vmx *vmx = to_vmx(vcpu);
3203
3204 /* Nothing to do if hardware doesn't support EFER. */
3205 if (!vmx_find_uret_msr(vmx, MSR_EFER))
3206 return 0;
3207
3208 vcpu->arch.efer = efer;
3209 #ifdef CONFIG_X86_64
3210 if (efer & EFER_LMA)
3211 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3212 else
3213 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3214 #else
3215 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3216 return 1;
3217 #endif
3218
3219 vmx_setup_uret_msrs(vmx);
3220 return 0;
3221 }
3222
3223 #ifdef CONFIG_X86_64
3224
3225 static void enter_lmode(struct kvm_vcpu *vcpu)
3226 {
3227 u32 guest_tr_ar;
3228
3229 vmx_segment_cache_clear(to_vmx(vcpu));
3230
3231 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3232 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3233 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3234 __func__);
3235 vmcs_write32(GUEST_TR_AR_BYTES,
3236 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3237 | VMX_AR_TYPE_BUSY_64_TSS);
3238 }
3239 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3240 }
3241
3242 static void exit_lmode(struct kvm_vcpu *vcpu)
3243 {
3244 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3245 }
3246
3247 #endif
3248
3249 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3250 {
3251 struct vcpu_vmx *vmx = to_vmx(vcpu);
3252
3253 /*
3254 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3255 * the CPU is not required to invalidate guest-physical mappings on
3256 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
3257 * associated with the root EPT structure and not any particular VPID
3258 * (INVVPID also isn't required to invalidate guest-physical mappings).
3259 */
3260 if (enable_ept) {
3261 ept_sync_global();
3262 } else if (enable_vpid) {
3263 if (cpu_has_vmx_invvpid_global()) {
3264 vpid_sync_vcpu_global();
3265 } else {
3266 vpid_sync_vcpu_single(vmx->vpid);
3267 vpid_sync_vcpu_single(vmx->nested.vpid02);
3268 }
3269 }
3270 }
3271
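/*
 * Return the VPID that tags the vCPU's current TLB entries: L2's vpid02
 * when a nested guest is active and L1 enabled VPID for it, otherwise the
 * vCPU's own vpid.
 */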
3272 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3273 {
3274 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
3275 return nested_get_vpid02(vcpu);
3276 return to_vmx(vcpu)->vpid;
3277 }
3278
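/*
 * EPTP layout (SDM Vol. 3): bits 2:0 memory type (6 = WB), bits 5:3 the
 * page-walk length minus one, bit 6 enables accessed/dirty flags, and the
 * upper bits hold the root table's physical address. Example: a 4-level
 * root at 0x1234000 with A/D enabled encodes as
 * 0x1234000 | 0x6 | 0x18 | 0x40 = 0x123405e.
 */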
3279 static u64 construct_eptp(hpa_t root_hpa)
3280 {
3281 u64 eptp = root_hpa | VMX_EPTP_MT_WB;
3282 struct kvm_mmu_page *root;
3283
3284 if (kvm_mmu_is_dummy_root(root_hpa))
3285 return eptp | VMX_EPTP_PWL_4;
3286
3287 /*
3288 * EPT roots should always have an associated MMU page. Return a "bad"
3289 * EPTP to induce VM-Fail instead of continuing on in an unknown state.
3290 */
3291 root = root_to_sp(root_hpa);
3292 if (WARN_ON_ONCE(!root))
3293 return INVALID_PAGE;
3294
3295 eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3296
3297 if (enable_ept_ad_bits && !root->role.ad_disabled)
3298 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3299
3300 return eptp;
3301 }
3302
3303 static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
3304 {
3305 u64 eptp = construct_eptp(root_hpa);
3306
3307 if (VALID_PAGE(eptp))
3308 ept_sync_context(eptp);
3309 else
3310 ept_sync_global();
3311 }
3312
3313 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3314 {
3315 struct kvm_mmu *mmu = vcpu->arch.mmu;
3316 u64 root_hpa = mmu->root.hpa;
3317
3318 /* No flush required if the current context is invalid. */
3319 if (!VALID_PAGE(root_hpa))
3320 return;
3321
3322 if (enable_ept)
3323 vmx_flush_tlb_ept_root(root_hpa);
3324 else
3325 vpid_sync_context(vmx_get_current_vpid(vcpu));
3326 }
3327
3328 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3329 {
3330 /*
3331 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3332 * vmx_flush_tlb_guest() for an explanation of why this is ok.
3333 */
3334 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3335 }
3336
3337 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3338 {
3339 /*
3340 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3341 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
3342 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3343 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3344 * i.e. no explicit INVVPID is necessary.
3345 */
3346 vpid_sync_context(vmx_get_current_vpid(vcpu));
3347 }
3348
3349 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3350 {
3351 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3352
3353 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3354 return;
3355
3356 if (is_pae_paging(vcpu)) {
3357 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3358 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3359 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3360 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3361 }
3362 }
3363
3364 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3365 {
3366 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3367
3368 if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3369 return;
3370
3371 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3372 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3373 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3374 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3375
3376 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3377 }
3378
3379 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3380 CPU_BASED_CR3_STORE_EXITING)
3381
bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3383 {
3384 if (is_guest_mode(vcpu))
3385 return nested_guest_cr0_valid(vcpu, cr0);
3386
3387 if (to_vmx(vcpu)->nested.vmxon)
3388 return nested_host_cr0_valid(vcpu, cr0);
3389
3390 return true;
3391 }
3392
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3394 {
3395 struct vcpu_vmx *vmx = to_vmx(vcpu);
3396 unsigned long hw_cr0, old_cr0_pg;
3397 u32 tmp;
3398
3399 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3400
3401 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3402 if (enable_unrestricted_guest)
3403 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3404 else {
3405 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3406 if (!enable_ept)
3407 hw_cr0 |= X86_CR0_WP;
3408
3409 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3410 enter_pmode(vcpu);
3411
3412 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3413 enter_rmode(vcpu);
3414 }
3415
3416 vmcs_writel(CR0_READ_SHADOW, cr0);
3417 vmcs_writel(GUEST_CR0, hw_cr0);
3418 vcpu->arch.cr0 = cr0;
3419 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3420
3421 #ifdef CONFIG_X86_64
3422 if (vcpu->arch.efer & EFER_LME) {
3423 if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3424 enter_lmode(vcpu);
3425 else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3426 exit_lmode(vcpu);
3427 }
3428 #endif
3429
3430 if (enable_ept && !enable_unrestricted_guest) {
3431 /*
3432 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
3433 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3434 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3435 * KVM's CR3 is installed.
3436 */
3437 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3438 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3439
3440 /*
3441 * When running with EPT but not unrestricted guest, KVM must
3442 * intercept CR3 accesses when paging is _disabled_. This is
3443 * necessary because restricted guests can't actually run with
3444 * paging disabled, and so KVM stuffs its own CR3 in order to
* run the guest with identity mapped page tables.
3446 *
3447 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3448 * update, it may be stale with respect to CR3 interception,
3449 * e.g. after nested VM-Enter.
3450 *
3451 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3452 * stores to forward them to L1, even if KVM does not need to
3453 * intercept them to preserve its identity mapped page tables.
3454 */
3455 if (!(cr0 & X86_CR0_PG)) {
3456 exec_controls_setbit(vmx, CR3_EXITING_BITS);
3457 } else if (!is_guest_mode(vcpu)) {
3458 exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3459 } else {
3460 tmp = exec_controls_get(vmx);
3461 tmp &= ~CR3_EXITING_BITS;
3462 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3463 exec_controls_set(vmx, tmp);
3464 }
3465
3466 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
3467 if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3468 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3469
3470 /*
3471 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3472 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3473 */
3474 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3475 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
3476 }
3477
3478 /* depends on vcpu->arch.cr0 to be set to a new value */
3479 vmx->vt.emulation_required = vmx_emulation_required(vcpu);
3480 }
3481
static int vmx_get_max_ept_level(void)
3483 {
3484 if (cpu_has_vmx_ept_5levels())
3485 return 5;
3486 return 4;
3487 }
3488
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3490 {
3491 struct kvm *kvm = vcpu->kvm;
3492 bool update_guest_cr3 = true;
3493 unsigned long guest_cr3;
3494
3495 if (enable_ept) {
3496 KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
3497 root_level != root_to_sp(root_hpa)->role.level);
3498 vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
3499
3500 hv_track_root_tdp(vcpu, root_hpa);
3501
3502 if (!enable_unrestricted_guest && !is_paging(vcpu))
3503 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3504 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3505 guest_cr3 = vcpu->arch.cr3;
3506 else /* vmcs.GUEST_CR3 is already up-to-date. */
3507 update_guest_cr3 = false;
3508 vmx_ept_load_pdptrs(vcpu);
3509 } else {
3510 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
3511 kvm_get_active_cr3_lam_bits(vcpu);
3512 }
3513
3514 if (update_guest_cr3)
3515 vmcs_writel(GUEST_CR3, guest_cr3);
3516 }
3517
bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3519 {
3520 /*
3521 * We operate under the default treatment of SMM, so VMX cannot be
3522 * enabled under SMM. Note, whether or not VMXE is allowed at all,
3523 * i.e. is a reserved bit, is handled by common x86 code.
3524 */
3525 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3526 return false;
3527
3528 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3529 return false;
3530
3531 return true;
3532 }
3533
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3535 {
3536 unsigned long old_cr4 = kvm_read_cr4(vcpu);
3537 struct vcpu_vmx *vmx = to_vmx(vcpu);
3538 unsigned long hw_cr4;
3539
3540 /*
3541 * Pass through host's Machine Check Enable value to hw_cr4, which
3542 * is in force while we are in guest mode. Do not let guests control
3543 * this bit, even if host CR4.MCE == 0.
3544 */
3545 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3546 if (enable_unrestricted_guest)
3547 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3548 else if (vmx->rmode.vm86_active)
3549 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3550 else
3551 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3552
3553 if (vmx_umip_emulated()) {
3554 if (cr4 & X86_CR4_UMIP) {
3555 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3556 hw_cr4 &= ~X86_CR4_UMIP;
3557 } else if (!is_guest_mode(vcpu) ||
3558 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3559 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3560 }
3561 }
3562
3563 vcpu->arch.cr4 = cr4;
3564 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3565
3566 if (!enable_unrestricted_guest) {
3567 if (enable_ept) {
3568 if (!is_paging(vcpu)) {
3569 hw_cr4 &= ~X86_CR4_PAE;
3570 hw_cr4 |= X86_CR4_PSE;
3571 } else if (!(cr4 & X86_CR4_PAE)) {
3572 hw_cr4 &= ~X86_CR4_PAE;
3573 }
3574 }
3575
3576 /*
3577 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3578 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3579 * to be manually disabled when guest switches to non-paging
3580 * mode.
3581 *
3582 * If !enable_unrestricted_guest, the CPU is always running
3583 * with CR0.PG=1 and CR4 needs to be modified.
3584 * If enable_unrestricted_guest, the CPU automatically
3585 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3586 */
3587 if (!is_paging(vcpu))
3588 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3589 }
3590
3591 vmcs_writel(CR4_READ_SHADOW, cr4);
3592 vmcs_writel(GUEST_CR4, hw_cr4);
3593
3594 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
3595 vcpu->arch.cpuid_dynamic_bits_dirty = true;
3596 }
3597
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3599 {
3600 struct vcpu_vmx *vmx = to_vmx(vcpu);
3601 u32 ar;
3602
3603 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3604 *var = vmx->rmode.segs[seg];
3605 if (seg == VCPU_SREG_TR
3606 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3607 return;
3608 var->base = vmx_read_guest_seg_base(vmx, seg);
3609 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3610 return;
3611 }
3612 var->base = vmx_read_guest_seg_base(vmx, seg);
3613 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3614 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3615 ar = vmx_read_guest_seg_ar(vmx, seg);
3616 var->unusable = (ar >> 16) & 1;
3617 var->type = ar & 15;
3618 var->s = (ar >> 4) & 1;
3619 var->dpl = (ar >> 5) & 3;
3620 /*
* Some userspaces do not preserve the unusable property.  Since a usable
* segment has to be present according to the VMX spec, we can use the present
* property to amend the userspace bug by making an unusable segment always
3624 * nonpresent. vmx_segment_access_rights() already marks nonpresent
3625 * segment as unusable.
3626 */
3627 var->present = !var->unusable;
3628 var->avl = (ar >> 12) & 1;
3629 var->l = (ar >> 13) & 1;
3630 var->db = (ar >> 14) & 1;
3631 var->g = (ar >> 15) & 1;
3632 }
3633
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3635 {
3636 struct kvm_segment s;
3637
3638 if (to_vmx(vcpu)->rmode.vm86_active) {
3639 vmx_get_segment(vcpu, &s, seg);
3640 return s.base;
3641 }
3642 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3643 }
3644
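/*
 * The CPL is cached by hardware in the DPL field of the SS access rights.
 * Real mode is architecturally CPL 0, but the vm86 segments used to
 * emulate it carry DPL 3, hence the explicit override below.
 */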
static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
3646 {
3647 struct vcpu_vmx *vmx = to_vmx(vcpu);
3648 int ar;
3649
3650 if (unlikely(vmx->rmode.vm86_active))
3651 return 0;
3652
3653 if (no_cache)
3654 ar = vmcs_read32(GUEST_SS_AR_BYTES);
3655 else
3656 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3657 return VMX_AR_DPL(ar);
3658 }
3659
int vmx_get_cpl(struct kvm_vcpu *vcpu)
3661 {
3662 return __vmx_get_cpl(vcpu, false);
3663 }
3664
int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
3666 {
3667 return __vmx_get_cpl(vcpu, true);
3668 }
3669
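/*
 * Pack a kvm_segment into the VMX access-rights format: type in bits 3:0,
 * S in bit 4, DPL in bits 6:5, P in bit 7, AVL/L/D-B/G in bits 12-15 and
 * "unusable" in bit 16.  E.g. a present 64-bit kernel code segment
 * (type 0xb, S=1, DPL=0, L=1, G=1) encodes as 0xa09b.
 */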
static u32 vmx_segment_access_rights(struct kvm_segment *var)
3671 {
3672 u32 ar;
3673
3674 ar = var->type & 15;
3675 ar |= (var->s & 1) << 4;
3676 ar |= (var->dpl & 3) << 5;
3677 ar |= (var->present & 1) << 7;
3678 ar |= (var->avl & 1) << 12;
3679 ar |= (var->l & 1) << 13;
3680 ar |= (var->db & 1) << 14;
3681 ar |= (var->g & 1) << 15;
3682 ar |= (var->unusable || !var->present) << 16;
3683
3684 return ar;
3685 }
3686
void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3688 {
3689 struct vcpu_vmx *vmx = to_vmx(vcpu);
3690 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3691
3692 vmx_segment_cache_clear(vmx);
3693
3694 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3695 vmx->rmode.segs[seg] = *var;
3696 if (seg == VCPU_SREG_TR)
3697 vmcs_write16(sf->selector, var->selector);
3698 else if (var->s)
3699 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3700 return;
3701 }
3702
3703 vmcs_writel(sf->base, var->base);
3704 vmcs_write32(sf->limit, var->limit);
3705 vmcs_write16(sf->selector, var->selector);
3706
3707 /*
3708 * Fix the "Accessed" bit in AR field of segment registers for older
3709 * qemu binaries.
3710 * IA32 arch specifies that at the time of processor reset the
3711 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3712 * is setting it to 0 in the userland code. This causes invalid guest
3713 * state vmexit when "unrestricted guest" mode is turned on.
3714 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3715 * tree. Newer qemu binaries with that qemu fix would not need this
3716 * kvm hack.
3717 */
3718 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3719 var->type |= 0x1; /* Accessed */
3720
3721 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3722 }
3723
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3725 {
3726 __vmx_set_segment(vcpu, var, seg);
3727
3728 to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
3729 }
3730
void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3732 {
3733 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3734
3735 *db = (ar >> 14) & 1;
3736 *l = (ar >> 13) & 1;
3737 }
3738
void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3740 {
3741 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3742 dt->address = vmcs_readl(GUEST_IDTR_BASE);
3743 }
3744
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3746 {
3747 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3748 vmcs_writel(GUEST_IDTR_BASE, dt->address);
3749 }
3750
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3752 {
3753 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3754 dt->address = vmcs_readl(GUEST_GDTR_BASE);
3755 }
3756
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3758 {
3759 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3760 vmcs_writel(GUEST_GDTR_BASE, dt->address);
3761 }
3762
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3764 {
3765 struct kvm_segment var;
3766 u32 ar;
3767
3768 vmx_get_segment(vcpu, &var, seg);
3769 var.dpl = 0x3;
3770 if (seg == VCPU_SREG_CS)
3771 var.type = 0x3;
3772 ar = vmx_segment_access_rights(&var);
3773
3774 if (var.base != (var.selector << 4))
3775 return false;
3776 if (var.limit != 0xffff)
3777 return false;
3778 if (ar != 0xf3)
3779 return false;
3780
3781 return true;
3782 }
3783
static bool code_segment_valid(struct kvm_vcpu *vcpu)
3785 {
3786 struct kvm_segment cs;
3787 unsigned int cs_rpl;
3788
3789 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3790 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3791
3792 if (cs.unusable)
3793 return false;
3794 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3795 return false;
3796 if (!cs.s)
3797 return false;
3798 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3799 if (cs.dpl > cs_rpl)
3800 return false;
3801 } else {
3802 if (cs.dpl != cs_rpl)
3803 return false;
3804 }
3805 if (!cs.present)
3806 return false;
3807
3808 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3809 return true;
3810 }
3811
static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3813 {
3814 struct kvm_segment ss;
3815 unsigned int ss_rpl;
3816
3817 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3818 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3819
3820 if (ss.unusable)
3821 return true;
3822 if (ss.type != 3 && ss.type != 7)
3823 return false;
3824 if (!ss.s)
3825 return false;
3826 if (ss.dpl != ss_rpl) /* DPL != RPL */
3827 return false;
3828 if (!ss.present)
3829 return false;
3830
3831 return true;
3832 }
3833
static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3835 {
3836 struct kvm_segment var;
3837 unsigned int rpl;
3838
3839 vmx_get_segment(vcpu, &var, seg);
3840 rpl = var.selector & SEGMENT_RPL_MASK;
3841
3842 if (var.unusable)
3843 return true;
3844 if (!var.s)
3845 return false;
3846 if (!var.present)
3847 return false;
3848 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3849 if (var.dpl < rpl) /* DPL < RPL */
3850 return false;
3851 }
3852
3853 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3854 * rights flags
3855 */
3856 return true;
3857 }
3858
static bool tr_valid(struct kvm_vcpu *vcpu)
3860 {
3861 struct kvm_segment tr;
3862
3863 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3864
3865 if (tr.unusable)
3866 return false;
3867 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3868 return false;
3869 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3870 return false;
3871 if (!tr.present)
3872 return false;
3873
3874 return true;
3875 }
3876
static bool ldtr_valid(struct kvm_vcpu *vcpu)
3878 {
3879 struct kvm_segment ldtr;
3880
3881 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3882
3883 if (ldtr.unusable)
3884 return true;
3885 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
3886 return false;
3887 if (ldtr.type != 2)
3888 return false;
3889 if (!ldtr.present)
3890 return false;
3891
3892 return true;
3893 }
3894
static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3896 {
3897 struct kvm_segment cs, ss;
3898
3899 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3900 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3901
3902 return ((cs.selector & SEGMENT_RPL_MASK) ==
3903 (ss.selector & SEGMENT_RPL_MASK));
3904 }
3905
3906 /*
3907 * Check if guest state is valid. Returns true if valid, false if
3908 * not.
3909 * We assume that registers are always usable
3910 */
bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3912 {
3913 /* real mode guest state checks */
3914 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3915 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3916 return false;
3917 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3918 return false;
3919 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3920 return false;
3921 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3922 return false;
3923 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3924 return false;
3925 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3926 return false;
3927 } else {
3928 /* protected mode guest state checks */
3929 if (!cs_ss_rpl_check(vcpu))
3930 return false;
3931 if (!code_segment_valid(vcpu))
3932 return false;
3933 if (!stack_segment_valid(vcpu))
3934 return false;
3935 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3936 return false;
3937 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3938 return false;
3939 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3940 return false;
3941 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3942 return false;
3943 if (!tr_valid(vcpu))
3944 return false;
3945 if (!ldtr_valid(vcpu))
3946 return false;
3947 }
3948 /* TODO:
3949 * - Add checks on RIP
3950 * - Add checks on RFLAGS
3951 */
3952
3953 return true;
3954 }
3955
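/*
 * Populate the real-mode TSS: zero all three pages, point the I/O bitmap
 * base just past the interrupt redirection map, and write the trailing
 * all-ones byte that terminates the I/O permission bitmap.
 */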
static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3957 {
3958 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3959 u16 data;
3960 int i;
3961
3962 for (i = 0; i < 3; i++) {
3963 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3964 return -EFAULT;
3965 }
3966
3967 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3968 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3969 return -EFAULT;
3970
3971 data = ~0;
3972 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3973 return -EFAULT;
3974
3975 return 0;
3976 }
3977
static int init_rmode_identity_map(struct kvm *kvm)
3979 {
3980 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3981 int i, r = 0;
3982 void __user *uaddr;
3983 u32 tmp;
3984
3985 /* Protect kvm_vmx->ept_identity_pagetable_done. */
3986 mutex_lock(&kvm->slots_lock);
3987
3988 if (likely(kvm_vmx->ept_identity_pagetable_done))
3989 goto out;
3990
3991 if (!kvm_vmx->ept_identity_map_addr)
3992 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3993
3994 uaddr = __x86_set_memory_region(kvm,
3995 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3996 kvm_vmx->ept_identity_map_addr,
3997 PAGE_SIZE);
3998 if (IS_ERR(uaddr)) {
3999 r = PTR_ERR(uaddr);
4000 goto out;
4001 }
4002
4003 /* Set up identity-mapping pagetable for EPT in real mode */
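	/*
	 * Each of the 1024 PSE entries maps a 4MB page at (i << 22), so a
	 * single page of PDEs identity-maps the low 4GB.
	 */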
4004 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
4005 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4006 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4007 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
4008 r = -EFAULT;
4009 goto out;
4010 }
4011 }
4012 kvm_vmx->ept_identity_pagetable_done = true;
4013
4014 out:
4015 mutex_unlock(&kvm->slots_lock);
4016 return r;
4017 }
4018
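/*
 * Reset a segment to its real-mode defaults: base 0, 64KiB limit and
 * access rights 0x93 (present, S=1, read/write data, accessed), with the
 * code-segment type bit (0x08) added for CS to yield 0x9b.
 */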
static void seg_setup(int seg)
4020 {
4021 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4022 unsigned int ar;
4023
4024 vmcs_write16(sf->selector, 0);
4025 vmcs_writel(sf->base, 0);
4026 vmcs_write32(sf->limit, 0xffff);
4027 ar = 0x93;
4028 if (seg == VCPU_SREG_CS)
4029 ar |= 0x08; /* code segment */
4030
4031 vmcs_write32(sf->ar_bytes, ar);
4032 }
4033
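/*
 * VPID 0 is reserved for tagging the host's TLB entries, so a vCPU that
 * can't get a VPID simply runs with vpid == 0 and relies on the implicit
 * flushes performed by VM-Enter and VM-Exit.
 */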
int allocate_vpid(void)
4035 {
4036 int vpid;
4037
4038 if (!enable_vpid)
4039 return 0;
4040 spin_lock(&vmx_vpid_lock);
4041 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4042 if (vpid < VMX_NR_VPIDS)
4043 __set_bit(vpid, vmx_vpid_bitmap);
4044 else
4045 vpid = 0;
4046 spin_unlock(&vmx_vpid_lock);
4047 return vpid;
4048 }
4049
void free_vpid(int vpid)
4051 {
4052 if (!enable_vpid || vpid == 0)
4053 return;
4054 spin_lock(&vmx_vpid_lock);
4055 __clear_bit(vpid, vmx_vpid_bitmap);
4056 spin_unlock(&vmx_vpid_lock);
4057 }
4058
static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
4060 {
4061 /*
4062 * When KVM is a nested hypervisor on top of Hyper-V and uses
4063 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
4064 * bitmap has changed.
4065 */
4066 if (kvm_is_using_evmcs()) {
4067 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
4068
4069 if (evmcs->hv_enlightenments_control.msr_bitmap)
4070 evmcs->hv_clean_fields &=
4071 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
4072 }
4073
4074 vmx->nested.force_msr_bitmap_recalc = true;
4075 }
4076
void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
4078 {
4079 struct vcpu_vmx *vmx = to_vmx(vcpu);
4080 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4081
4082 if (!cpu_has_vmx_msr_bitmap())
4083 return;
4084
4085 vmx_msr_bitmap_l01_changed(vmx);
4086
4087 if (type & MSR_TYPE_R) {
4088 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
4089 vmx_clear_msr_bitmap_read(msr_bitmap, msr);
4090 else
4091 vmx_set_msr_bitmap_read(msr_bitmap, msr);
4092 }
4093
4094 if (type & MSR_TYPE_W) {
4095 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
4096 vmx_clear_msr_bitmap_write(msr_bitmap, msr);
4097 else
4098 vmx_set_msr_bitmap_write(msr_bitmap, msr);
4099 }
4100 }
4101
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4103 {
4104 /*
4105 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4106 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
4107 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4108 */
4109 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4110 const int write_idx = read_idx + (0x800 / sizeof(u64));
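	/*
	 * I.e. read_idx = 0x800 / 64 = 32, and because the write bitmap for
	 * low MSRs starts 0x800 bytes (256 u64s) into the page, write_idx = 288.
	 */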
4111 struct vcpu_vmx *vmx = to_vmx(vcpu);
4112 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
4113 u8 mode;
4114
4115 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4116 return;
4117
4118 if (cpu_has_secondary_exec_ctrls() &&
4119 (secondary_exec_controls_get(vmx) &
4120 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4121 mode = MSR_BITMAP_MODE_X2APIC;
4122 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4123 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4124 } else {
4125 mode = 0;
4126 }
4127
4128 if (mode == vmx->x2apic_msr_bitmap_mode)
4129 return;
4130
4131 vmx->x2apic_msr_bitmap_mode = mode;
4132
4133 /*
4134 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
4135 * registers (0x840 and above) intercepted, KVM doesn't support them.
4136 * Intercept all writes by default and poke holes as needed. Pass
4137 * through reads for all valid registers by default in x2APIC+APICv
4138 * mode, only the current timer count needs on-demand emulation by KVM.
4139 */
4140 if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4141 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
4142 else
4143 msr_bitmap[read_idx] = ~0ull;
4144 msr_bitmap[write_idx] = ~0ull;
4145
4146 /*
4147 * TPR reads and writes can be virtualized even if virtual interrupt
4148 * delivery is not in use.
4149 */
4150 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4151 !(mode & MSR_BITMAP_MODE_X2APIC));
4152
4153 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4154 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4155 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4156 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4157 if (enable_ipiv)
4158 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4159 }
4160 }
4161
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4163 {
4164 struct vcpu_vmx *vmx = to_vmx(vcpu);
4165 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4166 u32 i;
4167
4168 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
4169 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
4170 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
4171 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
4172 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
4173 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4174 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4175 }
4176 }
4177
static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
4179 {
4180 u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
4181 VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
4182 bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
4183 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
4184 struct vcpu_vmx *vmx = to_vmx(vcpu);
4185 bool intercept = !has_mediated_pmu;
4186 int i;
4187
4188 if (!enable_mediated_pmu)
4189 return;
4190
4191 if (!cpu_has_save_perf_global_ctrl()) {
4192 vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
4193
4194 if (has_mediated_pmu)
4195 vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
4196 else
4197 vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
4198 }
4199
4200 vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
4201 has_mediated_pmu);
4202
4203 vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu);
4204
4205 for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
4206 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i,
4207 MSR_TYPE_RW, intercept);
4208 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW,
4209 intercept || !fw_writes_is_enabled(vcpu));
4210 }
4211 for ( ; i < kvm_pmu_cap.num_counters_gp; i++) {
4212 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i,
4213 MSR_TYPE_RW, true);
4214 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i,
4215 MSR_TYPE_RW, true);
4216 }
4217
4218 for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
4219 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i,
4220 MSR_TYPE_RW, intercept);
4221 for ( ; i < kvm_pmu_cap.num_counters_fixed; i++)
4222 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i,
4223 MSR_TYPE_RW, true);
4224
4225 intercept = kvm_need_perf_global_ctrl_intercept(vcpu);
4226 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS,
4227 MSR_TYPE_RW, intercept);
4228 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4229 MSR_TYPE_RW, intercept);
4230 vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
4231 MSR_TYPE_RW, intercept);
4232 }
4233
static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
4235 {
4236 bool intercept;
4237
4238 if (!cpu_has_vmx_msr_bitmap())
4239 return;
4240
4241 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
4242 #ifdef CONFIG_X86_64
4243 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
4244 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
4245 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
4246 #endif
4247 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
4248 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
4249 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
4250 if (kvm_cstate_in_guest(vcpu->kvm)) {
4251 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
4252 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
4253 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
4254 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
4255 }
4256 if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
4257 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
4258 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
4259 }
4260
4261 /* PT MSRs can be passed through iff PT is exposed to the guest. */
4262 if (vmx_pt_mode_is_host_guest())
4263 pt_update_intercept_for_msr(vcpu);
4264
4265 if (vcpu->arch.xfd_no_write_intercept)
4266 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
4267
4268 vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
4269 !to_vmx(vcpu)->spec_ctrl);
4270
4271 if (kvm_cpu_cap_has(X86_FEATURE_XFD))
4272 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
4273 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
4274
4275 if (cpu_feature_enabled(X86_FEATURE_IBPB))
4276 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
4277 !guest_has_pred_cmd_msr(vcpu));
4278
4279 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
4280 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
4281 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
4282
4283 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
4284 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
4285
4286 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
4287 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
4288 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
4289 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
4290 }
4291
4292 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
4293 intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
4294 !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
4295
4296 vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
4297 vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
4298 }
4299
4300 vmx_recalc_pmu_msr_intercepts(vcpu);
4301
4302 /*
4303 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
4304 * filtered by userspace.
4305 */
4306 }
4307
static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
4309 {
4310 exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING,
4311 kvm_need_rdpmc_intercept(vcpu));
4312 }
4313
void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
4315 {
4316 vmx_recalc_instruction_intercepts(vcpu);
4317 vmx_recalc_msr_intercepts(vcpu);
4318 }
4319
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
					       int vector)
4322 {
4323 struct vcpu_vmx *vmx = to_vmx(vcpu);
4324
4325 /*
4326 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
4327 * and freed, and must not be accessed outside of vcpu->mutex. The
4328 * vCPU's cached PI NV is valid if and only if posted interrupts
4329 * enabled in its vmcs12, i.e. checking the vector also checks that
4330 * L1 has enabled posted interrupts for L2.
4331 */
4332 if (is_guest_mode(vcpu) &&
4333 vector == vmx->nested.posted_intr_nv) {
4334 /*
4335 * If a posted intr is not recognized by hardware,
4336 * we will accomplish it in the next vmentry.
4337 */
4338 vmx->nested.pi_pending = true;
4339 kvm_make_request(KVM_REQ_EVENT, vcpu);
4340
4341 /*
4342 * This pairs with the smp_mb_*() after setting vcpu->mode in
4343 * vcpu_enter_guest() to guarantee the vCPU sees the event
4344 * request if triggering a posted interrupt "fails" because
4345 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
* the smp_wmb() in kvm_make_request() only ensures everything
* done before making the request is visible when the request
* is visible; it doesn't ensure ordering between the store to
4349 * vcpu->requests and the load from vcpu->mode.
4350 */
4351 smp_mb__after_atomic();
4352
4353 /* the PIR and ON have been set by L1. */
4354 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4355 return 0;
4356 }
4357 return -1;
4358 }
4359 /*
4360 * Send interrupt to vcpu via posted interrupt way.
4361 * 1. If target vcpu is running(non-root mode), send posted interrupt
4362 * notification to vcpu and hardware will sync PIR to vIRR atomically.
4363 * 2. If target vcpu isn't running(root mode), kick it to pick up the
4364 * interrupt from PIR in next vmentry.
4365 */
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4367 {
4368 struct vcpu_vt *vt = to_vt(vcpu);
4369 int r;
4370
4371 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4372 if (!r)
4373 return 0;
4374
4375 /* Note, this is called iff the local APIC is in-kernel. */
4376 if (!vcpu->arch.apic->apicv_active)
4377 return -1;
4378
4379 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
4380 return 0;
4381 }
4382
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
			   int trig_mode, int vector)
4385 {
4386 struct kvm_vcpu *vcpu = apic->vcpu;
4387
4388 if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4389 kvm_lapic_set_irr(vector, apic);
4390 kvm_make_request(KVM_REQ_EVENT, vcpu);
4391 kvm_vcpu_kick(vcpu);
4392 } else {
4393 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4394 trig_mode, vector);
4395 }
4396 }
4397
4398 /*
4399 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4400 * will not change in the lifetime of the guest.
4401 * Note that host-state that does change is set elsewhere. E.g., host-state
4402 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4403 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4405 {
4406 u32 low32, high32;
4407 unsigned long tmpl;
4408 unsigned long cr0, cr3, cr4;
4409
4410 cr0 = read_cr0();
4411 WARN_ON(cr0 & X86_CR0_TS);
4412 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
4413
4414 /*
4415 * Save the most likely value for this task's CR3 in the VMCS.
4416 * We can't use __get_current_cr3_fast() because we're not atomic.
4417 */
4418 cr3 = __read_cr3();
4419 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
4420 vmx->loaded_vmcs->host_state.cr3 = cr3;
4421
4422 /* Save the most likely value for this task's CR4 in the VMCS. */
4423 cr4 = cr4_read_shadow();
4424 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
4425 vmx->loaded_vmcs->host_state.cr4 = cr4;
4426
4427 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
4428 #ifdef CONFIG_X86_64
4429 /*
4430 * Load null selectors, so we can avoid reloading them in
4431 * vmx_prepare_switch_to_host(), in case userspace uses
4432 * the null selectors too (the expected case).
4433 */
4434 vmcs_write16(HOST_DS_SELECTOR, 0);
4435 vmcs_write16(HOST_ES_SELECTOR, 0);
4436 #else
4437 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4438 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4439 #endif
4440 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4441 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4442
4443 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
4444
4445 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4446
4447 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4448 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4449
4450 /*
4451 * SYSENTER is used for 32-bit system calls on either 32-bit or
* 64-bit kernels. It is always zero if neither is allowed; otherwise
4453 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4454 * have already done so!).
4455 */
4456 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4457 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4458
4459 rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
4460 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4461
4462 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4463 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4464 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4465 }
4466
4467 if (cpu_has_load_ia32_efer())
4468 vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
4469
4470 /*
4471 * Supervisor shadow stack is not enabled on host side, i.e.,
* host IA32_S_CET.SHSTK_EN bit is guaranteed to be 0 now. Per the SDM
* description of the RDSSP instruction, SSP is not readable in CPL0,
4474 * so resetting the two registers to 0s at VM-Exit does no harm
4475 * to kernel execution. When execution flow exits to userspace,
4476 * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
4477 * 3 and 4 for details.
4478 */
4479 if (cpu_has_load_cet_ctrl()) {
4480 vmcs_writel(HOST_S_CET, kvm_host.s_cet);
4481 vmcs_writel(HOST_SSP, 0);
4482 vmcs_writel(HOST_INTR_SSP_TABLE, 0);
4483 }
4484
4485 /*
4486 * When running a guest with a mediated PMU, guest state is resident in
4487 * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host
4488 * activity doesn't bleed into the guest counters. When running with
4489 * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every
4490 * entry/exit to merge guest and host PMU usage.
4491 */
4492 if (enable_mediated_pmu)
4493 vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0);
4494 }
4495
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4497 {
4498 struct kvm_vcpu *vcpu = &vmx->vcpu;
4499
4500 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4501 ~vcpu->arch.cr4_guest_rsvd_bits;
4502 if (!enable_ept) {
4503 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4504 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4505 }
4506 if (is_guest_mode(&vmx->vcpu))
4507 vcpu->arch.cr4_guest_owned_bits &=
4508 ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4509 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4510 }
4511
static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4513 {
4514 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4515
4516 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4517 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4518
4519 if (!enable_vnmi)
4520 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4521
4522 if (!enable_preemption_timer)
4523 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4524
4525 return pin_based_exec_ctrl;
4526 }
4527
static u32 vmx_get_initial_vmentry_ctrl(void)
4529 {
4530 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4531
4532 if (vmx_pt_mode_is_system())
4533 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
4534 VM_ENTRY_LOAD_IA32_RTIT_CTL);
4535 /*
4536 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4537 */
4538 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
4539 VM_ENTRY_LOAD_IA32_EFER |
4540 VM_ENTRY_IA32E_MODE);
4541
4542 return vmentry_ctrl;
4543 }
4544
static u32 vmx_get_initial_vmexit_ctrl(void)
4546 {
4547 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4548
4549 /*
4550 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4551 * nested virtualization and thus allowed to be set in vmcs12.
4552 */
4553 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4554 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4555
4556 if (vmx_pt_mode_is_system())
4557 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
4558 VM_EXIT_CLEAR_IA32_RTIT_CTL);
4559 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
4560 return vmexit_ctrl &
4561 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER |
4562 VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL);
4563 }
4564
void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4566 {
4567 struct vcpu_vmx *vmx = to_vmx(vcpu);
4568
4569 guard(vmx_vmcs01)(vcpu);
4570
4571 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4572
4573 secondary_exec_controls_changebit(vmx,
4574 SECONDARY_EXEC_APIC_REGISTER_VIRT |
4575 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
4576 kvm_vcpu_apicv_active(vcpu));
4577 if (enable_ipiv)
4578 tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
4579 kvm_vcpu_apicv_active(vcpu));
4580
4581 vmx_update_msr_bitmap_x2apic(vcpu);
4582 }
4583
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4585 {
4586 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4587
4588 /*
4589 * Not used by KVM, but fully supported for nesting, i.e. are allowed in
4590 * vmcs12 and propagated to vmcs02 when set in vmcs12.
4591 */
4592 exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4593 CPU_BASED_USE_IO_BITMAPS |
4594 CPU_BASED_MONITOR_TRAP_FLAG |
4595 CPU_BASED_PAUSE_EXITING);
4596
4597 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4598 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4599 CPU_BASED_NMI_WINDOW_EXITING);
4600
4601 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4602 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4603
4604 if (!cpu_need_tpr_shadow(&vmx->vcpu))
4605 exec_control &= ~CPU_BASED_TPR_SHADOW;
4606
4607 #ifdef CONFIG_X86_64
4608 if (exec_control & CPU_BASED_TPR_SHADOW)
4609 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4610 CPU_BASED_CR8_STORE_EXITING);
4611 else
4612 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4613 CPU_BASED_CR8_LOAD_EXITING;
4614 #endif
/* No need to intercept CR3 access or INVLPG when using EPT. */
4616 if (enable_ept)
4617 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4618 CPU_BASED_CR3_STORE_EXITING |
4619 CPU_BASED_INVLPG_EXITING);
4620 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4621 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4622 CPU_BASED_MONITOR_EXITING);
4623 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4624 exec_control &= ~CPU_BASED_HLT_EXITING;
4625 return exec_control;
4626 }
4627
static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4629 {
4630 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4631
4632 /*
4633 * IPI virtualization relies on APICv. Disable IPI virtualization if
4634 * APICv is inhibited.
4635 */
4636 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
4637 exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4638
4639 return exec_control;
4640 }
4641
4642 /*
4643 * Adjust a single secondary execution control bit to intercept/allow an
4644 * instruction in the guest. This is usually done based on whether or not a
4645 * feature has been exposed to the guest in order to correctly emulate faults.
4646 */
static inline void
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
				  u32 control, bool enabled, bool exiting)
4650 {
4651 /*
4652 * If the control is for an opt-in feature, clear the control if the
4653 * feature is not exposed to the guest, i.e. not enabled. If the
4654 * control is opt-out, i.e. an exiting control, clear the control if
4655 * the feature _is_ exposed to the guest, i.e. exiting/interception is
4656 * disabled for the associated instruction. Note, the caller is
* responsible for presetting exec_control to set all supported bits.
4658 */
4659 if (enabled == exiting)
4660 *exec_control &= ~control;
4661
4662 /*
4663 * Update the nested MSR settings so that a nested VMM can/can't set
4664 * controls for features that are/aren't exposed to the guest.
4665 */
4666 if (nested &&
4667 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
4668 /*
4669 * All features that can be added or removed to VMX MSRs must
4670 * be supported in the first place for nested virtualization.
4671 */
4672 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4673 enabled = false;
4674
4675 if (enabled)
4676 vmx->nested.msrs.secondary_ctls_high |= control;
4677 else
4678 vmx->nested.msrs.secondary_ctls_high &= ~control;
4679 }
4680 }
4681
4682 /*
4683 * Wrapper macro for the common case of adjusting a secondary execution control
4684 * based on a single guest CPUID bit, with a dedicated feature bit. This also
4685 * verifies that the control is actually supported by KVM and hardware.
4686 */
4687 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4688 ({ \
4689 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
4690 bool __enabled; \
4691 \
4692 if (cpu_has_vmx_##name()) { \
4693 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
4694 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
4695 __enabled, exiting); \
4696 } \
4697 })
4698
4699 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4700 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4701 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4702
4703 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4704 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
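/*
 * E.g. vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES)
 * expands to a cpu_has_vmx_xsaves() check that toggles
 * SECONDARY_EXEC_ENABLE_XSAVES based on guest_cpu_cap_has(X86_FEATURE_XSAVES).
 */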
4705
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4707 {
4708 struct kvm_vcpu *vcpu = &vmx->vcpu;
4709
4710 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4711
4712 if (vmx_pt_mode_is_system())
4713 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4714 if (!cpu_need_virtualize_apic_accesses(vcpu))
4715 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4716 if (vmx->vpid == 0)
4717 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4718 if (!enable_ept) {
4719 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4720 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
4721 enable_unrestricted_guest = 0;
4722 }
4723 if (!enable_unrestricted_guest)
4724 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4725 if (kvm_pause_in_guest(vmx->vcpu.kvm))
4726 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4727 if (!kvm_vcpu_apicv_active(vcpu))
4728 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4729 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4730 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4731
4732 /*
4733 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4734 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4735 */
4736 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4737
4738 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4739 * in vmx_set_cr4. */
4740 exec_control &= ~SECONDARY_EXEC_DESC;
4741
4742 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4743 (handle_vmptrld).
We can NOT enable shadow_vmcs here because we don't yet have
a current VMCS12
4746 */
4747 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4748
4749 /*
* PML is enabled/disabled when dirty logging of memslots changes, but
4751 * it needs to be set here when dirty logging is already active, e.g.
4752 * if this vCPU was created after dirty logging was enabled.
4753 */
4754 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
4755 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4756
4757 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
4758
4759 /*
4760 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4761 * feature is exposed to the guest. This creates a virtualization hole
4762 * if both are supported in hardware but only one is exposed to the
4763 * guest, but letting the guest execute RDTSCP or RDPID when either one
4764 * is advertised is preferable to emulating the advertised instruction
4765 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4766 */
4767 if (cpu_has_vmx_rdtscp()) {
4768 bool rdpid_or_rdtscp_enabled =
4769 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
4770 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
4771
4772 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4773 SECONDARY_EXEC_ENABLE_RDTSCP,
4774 rdpid_or_rdtscp_enabled, false);
4775 }
4776
4777 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4778
4779 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4780 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4781
4782 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4783 ENABLE_USR_WAIT_PAUSE, false);
4784
4785 if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4786 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4787
4788 if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4789 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4790
4791 return exec_control;
4792 }
4793
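/*
 * The IPI-virtualization PID-pointer table needs one entry per possible
 * vCPU ID (kvm->arch.max_vcpu_ids); compute the allocation order for that
 * many entries rounded up to whole pages.
 */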
static inline int vmx_get_pid_table_order(struct kvm *kvm)
4795 {
4796 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4797 }
4798
static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4800 {
4801 struct page *pages;
4802 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4803
4804 if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4805 return 0;
4806
4807 if (kvm_vmx->pid_table)
4808 return 0;
4809
4810 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
4811 vmx_get_pid_table_order(kvm));
4812 if (!pages)
4813 return -ENOMEM;
4814
4815 kvm_vmx->pid_table = (void *)page_address(pages);
4816 return 0;
4817 }
4818
int vmx_vcpu_precreate(struct kvm *kvm)
4820 {
4821 return vmx_alloc_ipiv_pid_table(kvm);
4822 }
4823
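/* With no bits set, XSAVES/XRSTORS never trigger an XSS-exiting VM-Exit. */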
4824 #define VMX_XSS_EXIT_BITMAP 0
4825
static void init_vmcs(struct vcpu_vmx *vmx)
4827 {
4828 struct kvm *kvm = vmx->vcpu.kvm;
4829 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4830
4831 if (nested)
4832 nested_vmx_set_vmcs_shadowing_bitmap();
4833
4834 if (cpu_has_vmx_msr_bitmap())
4835 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4836
4837 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
4838
4839 /* Control */
4840 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4841
4842 exec_controls_set(vmx, vmx_exec_control(vmx));
4843
4844 if (cpu_has_secondary_exec_ctrls()) {
4845 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
4846 if (vmx->ve_info)
4847 vmcs_write64(VE_INFORMATION_ADDRESS,
4848 __pa(vmx->ve_info));
4849 }
4850
4851 if (cpu_has_tertiary_exec_ctrls())
4852 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
4853
4854 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
4855 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4856 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4857 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4858 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4859
4860 vmcs_write16(GUEST_INTR_STATUS, 0);
4861
4862 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4863 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
4864 }
4865
4866 if (vmx_can_use_ipiv(&vmx->vcpu)) {
4867 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4868 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
4869 }
4870
4871 if (!kvm_pause_in_guest(kvm)) {
4872 vmcs_write32(PLE_GAP, ple_gap);
4873 vmx->ple_window = ple_window;
4874 vmx->ple_window_dirty = true;
4875 }
4876
4877 if (kvm_notify_vmexit_enabled(kvm))
4878 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
4879
4880 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4881 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4882 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4883
4884 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4885 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
4886 vmx_set_constant_host_state(vmx);
4887 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4888 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4889
4890 if (cpu_has_vmx_vmfunc())
4891 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4892
4893 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4894 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val));
4895 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4896 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4897 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4898 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4899
4900 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4901 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4902
4903 vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
4904
4905 /* 22.2.1, 20.8.1 */
4906 vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
4907
4908 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4909 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4910
4911 set_cr4_guest_host_mask(vmx);
4912
4913 if (vmx->vpid != 0)
4914 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4915
4916 if (cpu_has_vmx_xsaves())
4917 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4918
4919 if (enable_pml) {
4920 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4921 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
4922 }
4923
4924 vmx_write_encls_bitmap(&vmx->vcpu, NULL);
4925
4926 if (vmx_pt_mode_is_host_guest()) {
4927 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
/* Bits 6:0 are forced to 1, writes are ignored. */
4929 vmx->pt_desc.guest.output_mask = 0x7F;
4930 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4931 }
4932
4933 vmcs_write32(GUEST_SYSENTER_CS, 0);
4934 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4935 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4936
4937 vmx_guest_debugctl_write(&vmx->vcpu, 0);
4938
4939 if (cpu_has_vmx_tpr_shadow()) {
4940 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4941 if (cpu_need_tpr_shadow(&vmx->vcpu))
4942 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4943 __pa(vmx->vcpu.arch.apic->regs));
4944 vmcs_write32(TPR_THRESHOLD, 0);
4945 }
4946
4947 vmx_setup_uret_msrs(vmx);
4948 }
4949
static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4951 {
4952 struct vcpu_vmx *vmx = to_vmx(vcpu);
4953
4954 init_vmcs(vmx);
4955
4956 if (nested &&
4957 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4958 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4959
4960 vcpu_setup_sgx_lepubkeyhash(vcpu);
4961
4962 vmx->nested.posted_intr_nv = -1;
4963 vmx->nested.vmxon_ptr = INVALID_GPA;
4964 vmx->nested.current_vmptr = INVALID_GPA;
4965
4966 #ifdef CONFIG_KVM_HYPERV
4967 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4968 #endif
4969
4970 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4971 vcpu->arch.microcode_version = 0x100000000ULL;
4972 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4973
4974 /*
4975 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4976 * or POSTED_INTR_WAKEUP_VECTOR.
4977 */
4978 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
4979 __pi_set_sn(&vmx->vt.pi_desc);
4980 }
4981
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4983 {
4984 struct vcpu_vmx *vmx = to_vmx(vcpu);
4985
4986 if (!init_event)
4987 __vmx_vcpu_reset(vcpu);
4988
4989 vmx->rmode.vm86_active = 0;
4990 vmx->spec_ctrl = 0;
4991
4992 vmx->msr_ia32_umwait_control = 0;
4993
4994 vmx->hv_deadline_tsc = -1;
4995 kvm_set_cr8(vcpu, 0);
4996
4997 seg_setup(VCPU_SREG_CS);
4998 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4999 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5000
5001 seg_setup(VCPU_SREG_DS);
5002 seg_setup(VCPU_SREG_ES);
5003 seg_setup(VCPU_SREG_FS);
5004 seg_setup(VCPU_SREG_GS);
5005 seg_setup(VCPU_SREG_SS);
5006
5007 vmcs_write16(GUEST_TR_SELECTOR, 0);
5008 vmcs_writel(GUEST_TR_BASE, 0);
5009 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
5010 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5011
5012 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
5013 vmcs_writel(GUEST_LDTR_BASE, 0);
5014 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
5015 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
5016
5017 vmcs_writel(GUEST_GDTR_BASE, 0);
5018 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
5019
5020 vmcs_writel(GUEST_IDTR_BASE, 0);
5021 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5022
5023 vmx_segment_cache_clear(vmx);
5024 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
5025
5026 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5027 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5028 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5029 if (kvm_mpx_supported())
5030 vmcs_write64(GUEST_BNDCFGS, 0);
5031
5032 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
5033
5034 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
5035 vmcs_writel(GUEST_SSP, 0);
5036 vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
5037 }
5038 if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
5039 kvm_cpu_cap_has(X86_FEATURE_SHSTK))
5040 vmcs_writel(GUEST_S_CET, 0);
5041
5042 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5043
5044 vpid_sync_context(vmx->vpid);
5045
5046 vmx_update_fb_clear_dis(vcpu, vmx);
5047 }
5048
5049 void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
5050 {
5051 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5052 }
5053
5054 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
5055 {
5056 if (!enable_vnmi ||
5057 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5058 vmx_enable_irq_window(vcpu);
5059 return;
5060 }
5061
5062 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5063 }
5064
5065 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
5066 {
5067 struct vcpu_vmx *vmx = to_vmx(vcpu);
5068 uint32_t intr;
5069 int irq = vcpu->arch.interrupt.nr;
5070
5071 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
5072
5073 ++vcpu->stat.irq_injections;
5074 if (vmx->rmode.vm86_active) {
5075 int inc_eip = 0;
5076 if (vcpu->arch.interrupt.soft)
5077 inc_eip = vcpu->arch.event_exit_inst_len;
5078 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
5079 return;
5080 }
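	/*
	 * Assemble the VM-entry interruption-information field: vector in
	 * bits 7:0, the event type (external vs. software interrupt), and
	 * the valid bit. Software interrupts also need the instruction
	 * length so the CPU can advance RIP on injection.
	 */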
5081 intr = irq | INTR_INFO_VALID_MASK;
5082 if (vcpu->arch.interrupt.soft) {
5083 intr |= INTR_TYPE_SOFT_INTR;
5084 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5085 vmx->vcpu.arch.event_exit_inst_len);
5086 } else
5087 intr |= INTR_TYPE_EXT_INTR;
5088 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5089
5090 vmx_clear_hlt(vcpu);
5091 }
5092
5093 void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5094 {
5095 struct vcpu_vmx *vmx = to_vmx(vcpu);
5096
5097 if (!enable_vnmi) {
5098 /*
5099 * Tracking the NMI-blocked state in software is built upon
5100 * finding the next open IRQ window. This, in turn, depends on
5101 * well-behaving guests: They have to keep IRQs disabled at
5102 * least as long as the NMI handler runs. Otherwise we may
5103 * cause NMI nesting, maybe breaking the guest. But as this is
5104 * highly unlikely, we can live with the residual risk.
5105 */
5106 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
5107 vmx->loaded_vmcs->vnmi_blocked_time = 0;
5108 }
5109
5110 ++vcpu->stat.nmi_injections;
5111 vmx->loaded_vmcs->nmi_known_unmasked = false;
5112
5113 if (vmx->rmode.vm86_active) {
5114 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
5115 return;
5116 }
5117
5118 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5119 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
5120
5121 vmx_clear_hlt(vcpu);
5122 }
5123
5124 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
5125 {
5126 struct vcpu_vmx *vmx = to_vmx(vcpu);
5127 bool masked;
5128
5129 if (!enable_vnmi)
5130 return vmx->loaded_vmcs->soft_vnmi_blocked;
5131 if (vmx->loaded_vmcs->nmi_known_unmasked)
5132 return false;
5133 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5134 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5135 return masked;
5136 }
5137
5138 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5139 {
5140 struct vcpu_vmx *vmx = to_vmx(vcpu);
5141
5142 if (!enable_vnmi) {
5143 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5144 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5145 vmx->loaded_vmcs->vnmi_blocked_time = 0;
5146 }
5147 } else {
5148 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5149 if (masked)
5150 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5151 GUEST_INTR_STATE_NMI);
5152 else
5153 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5154 GUEST_INTR_STATE_NMI);
5155 }
5156 }
5157
5158 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5159 {
5160 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5161 return false;
5162
5163 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5164 return true;
5165
5166 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5167 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5168 GUEST_INTR_STATE_NMI));
5169 }
5170
5171 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5172 {
5173 if (vcpu->arch.nested_run_pending)
5174 return -EBUSY;
5175
5176 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
5177 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5178 return -EBUSY;
5179
5180 return !vmx_nmi_blocked(vcpu);
5181 }
5182
5183 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5184 {
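	/*
	 * Interrupts are blocked if RFLAGS.IF is clear, or if an STI or
	 * MOV-SS interrupt shadow is reported in the guest
	 * interruptibility-state field.
	 */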
5185 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5186 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5187 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5188 }
5189
5190 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5191 {
5192 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5193 return false;
5194
5195 return __vmx_interrupt_blocked(vcpu);
5196 }
5197
5198 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5199 {
5200 if (vcpu->arch.nested_run_pending)
5201 return -EBUSY;
5202
5203 /*
5204 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5205 * e.g. if the IRQ arrived asynchronously after checking nested events.
5206 */
5207 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5208 return -EBUSY;
5209
5210 return !vmx_interrupt_blocked(vcpu);
5211 }
5212
5213 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5214 {
5215 void __user *ret;
5216
5217 if (enable_unrestricted_guest)
5218 return 0;
5219
5220 mutex_lock(&kvm->slots_lock);
5221 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5222 PAGE_SIZE * 3);
5223 mutex_unlock(&kvm->slots_lock);
5224
5225 if (IS_ERR(ret))
5226 return PTR_ERR(ret);
5227
5228 to_kvm_vmx(kvm)->tss_addr = addr;
5229
5230 return init_rmode_tss(kvm, ret);
5231 }
5232
5233 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5234 {
5235 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5236 return 0;
5237 }
5238
5239 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5240 {
5241 switch (vec) {
5242 case BP_VECTOR:
5243 /*
5244 * Update instruction length as we may reinject the exception
5245 * from user space while in guest debugging mode.
5246 */
5247 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5248 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5249 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5250 return false;
5251 fallthrough;
5252 case DB_VECTOR:
5253 return !(vcpu->guest_debug &
5254 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5255 case DE_VECTOR:
5256 case OF_VECTOR:
5257 case BR_VECTOR:
5258 case UD_VECTOR:
5259 case DF_VECTOR:
5260 case SS_VECTOR:
5261 case GP_VECTOR:
5262 case MF_VECTOR:
5263 return true;
5264 }
5265 return false;
5266 }
5267
5268 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5269 int vec, u32 err_code)
5270 {
5271 /*
5272 * An instruction with the address-size override prefix (opcode 0x67)
5273 * causes a #SS fault with error code 0 in VM86 mode.
5274 */
5275 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5276 if (kvm_emulate_instruction(vcpu, 0)) {
5277 if (vcpu->arch.halt_request) {
5278 vcpu->arch.halt_request = 0;
5279 return kvm_emulate_halt_noskip(vcpu);
5280 }
5281 return 1;
5282 }
5283 return 0;
5284 }
5285
5286 /*
5287 * Forward all other exceptions that are valid in real mode.
5288 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5289 * the required debugging infrastructure rework.
5290 */
5291 kvm_queue_exception(vcpu, vec);
5292 return 1;
5293 }
5294
5295 static int handle_machine_check(struct kvm_vcpu *vcpu)
5296 {
5297 /* handled by vmx_vcpu_run() */
5298 return 1;
5299 }
5300
5301 /*
5302 * If the host has split lock detection disabled, then #AC is
5303 * unconditionally injected into the guest, which is the pre split lock
5304 * detection behaviour.
5305 *
5306 * If the host has split lock detection enabled then #AC is
5307 * only injected into the guest when:
5308 * - Guest CPL == 3 (user mode)
5309 * - Guest has #AC detection enabled in CR0
5310 * - Guest EFLAGS has AC bit set
5311 */
5312 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5313 {
5314 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5315 return true;
5316
5317 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
5318 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5319 }
5320
5321 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
5322 {
5323 return vcpu->arch.guest_fpu.fpstate->xfd &&
5324 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
5325 }
5326
5327 static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code)
5328 {
5329 unsigned long cr2 = vmx_get_exit_qual(vcpu);
5330
5331 if (vcpu->arch.apf.host_apf_flags)
5332 goto handle_pf;
5333
5334 /* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */
5335 WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr);
5336
5337 /*
5338 * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX
5339 * flag set in the error code (SGX1 hardware generates #GP(0)). EPCM
5340 * violations have nothing to do with shadow paging and can never be
5341 * resolved by KVM; always reflect them into the guest.
5342 */
5343 if (error_code & PFERR_SGX_MASK) {
5344 WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) ||
5345 !cpu_feature_enabled(X86_FEATURE_SGX2));
5346
5347 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
5348 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5349 else
5350 kvm_inject_gp(vcpu, 0);
5351 return 1;
5352 }
5353
5354 /*
5355 * If EPT is enabled, fixup and inject the #PF. KVM intercepts #PFs
5356 * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due
5357 * to the GPA being legal with respect to host.MAXPHYADDR).
5358 */
5359 if (enable_ept) {
5360 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
5361 return 1;
5362 }
5363
5364 handle_pf:
5365 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5366 }
5367
5368 static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5369 {
5370 struct vcpu_vmx *vmx = to_vmx(vcpu);
5371 struct kvm_run *kvm_run = vcpu->run;
5372 u32 intr_info, ex_no, error_code;
5373 unsigned long dr6;
5374 u32 vect_info;
5375
5376 vect_info = vmx->idt_vectoring_info;
5377 intr_info = vmx_get_intr_info(vcpu);
5378
5379 /*
5380 * Machine checks are handled by handle_exception_irqoff(), or by
5381 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
5382 * vmx_vcpu_enter_exit().
5383 */
5384 if (is_machine_check(intr_info) || is_nmi(intr_info))
5385 return 1;
5386
5387 /*
5388 * Queue the exception here instead of in handle_nm_fault_irqoff().
5389 * This ensures the nested_vmx check is not skipped so vmexit can
5390 * be reflected to L1 (when it intercepts #NM) before reaching this
5391 * point.
5392 */
5393 if (is_nm_fault(intr_info)) {
5394 kvm_queue_exception_p(vcpu, NM_VECTOR,
5395 is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
5396 return 1;
5397 }
5398
5399 if (is_invalid_opcode(intr_info))
5400 return handle_ud(vcpu);
5401
5402 if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
5403 struct vmx_ve_information *ve_info = vmx->ve_info;
5404
5405 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
5406 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
5407 dump_vmcs(vcpu);
5408 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
5409 return 1;
5410 }
5411
5412 error_code = 0;
5413 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5414 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5415
5416 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5417 WARN_ON_ONCE(!enable_vmware_backdoor);
5418
5419 /*
5420 * VMware backdoor emulation on #GP interception only handles
5421 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5422 * error code on #GP.
5423 */
5424 if (error_code) {
5425 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5426 return 1;
5427 }
5428 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5429 }
5430
5431 /*
5432 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
5433 * MMIO; it is better to report an internal error.
5434 * See the comments in vmx_handle_exit.
5435 */
5436 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5437 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5438 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5439 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5440 vcpu->run->internal.ndata = 4;
5441 vcpu->run->internal.data[0] = vect_info;
5442 vcpu->run->internal.data[1] = intr_info;
5443 vcpu->run->internal.data[2] = error_code;
5444 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
5445 return 0;
5446 }
5447
5448 if (is_page_fault(intr_info))
5449 return vmx_handle_page_fault(vcpu, error_code);
5450
5451 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5452
5453 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5454 return handle_rmode_exception(vcpu, ex_no, error_code);
5455
5456 switch (ex_no) {
5457 case DB_VECTOR:
5458 dr6 = vmx_get_exit_qual(vcpu);
5459 if (!(vcpu->guest_debug &
5460 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5461 /*
5462 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
5463 * instruction. ICEBP generates a trap-like #DB, but
5464 * despite its interception control being tied to #DB,
5465 * is an instruction intercept, i.e. the VM-Exit occurs
5466 * on the ICEBP itself. Use the inner "skip" helper to
5467 * avoid single-step #DB and MTF updates, as ICEBP is
5468 * higher priority. Note, skipping ICEBP still clears
5469 * STI and MOVSS blocking.
5470 *
5471 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5472 * if single-step is enabled in RFLAGS and STI or MOVSS
5473 * blocking is active, as the CPU doesn't set the bit
5474 * on VM-Exit due to #DB interception. VM-Entry has a
5475 * consistency check that a single-step #DB is pending
5476 * in this scenario as the previous instruction cannot
5477 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5478 * don't modify RFLAGS), therefore the one instruction
5479 * delay when activating single-step breakpoints must
5480 * have already expired. Note, the CPU sets/clears BS
5481 * as appropriate for all other VM-Exits types.
5482 */
5483 if (is_icebp(intr_info))
5484 WARN_ON(!skip_emulated_instruction(vcpu));
5485 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5486 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5487 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
5488 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
5489 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
5490
5491 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
5492 return 1;
5493 }
5494 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
5495 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5496 fallthrough;
5497 case BP_VECTOR:
5498 /*
5499 * Update instruction length as we may reinject #BP from
5500 * user space while in guest debugging mode. Reading it for
5501 * #DB as well causes no harm, it is not used in that case.
5502 */
5503 vmx->vcpu.arch.event_exit_inst_len =
5504 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5505 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5506 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5507 kvm_run->debug.arch.exception = ex_no;
5508 break;
5509 case AC_VECTOR:
5510 if (vmx_guest_inject_ac(vcpu)) {
5511 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5512 return 1;
5513 }
5514
5515 /*
5516 * Handle split lock. Depending on detection mode this will
5517 * either warn and disable split lock detection for this
5518 * task or force SIGBUS on it.
5519 */
5520 if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5521 return 1;
5522 fallthrough;
5523 default:
5524 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5525 kvm_run->ex.exception = ex_no;
5526 kvm_run->ex.error_code = error_code;
5527 break;
5528 }
5529 return 0;
5530 }
5531
5532 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5533 {
5534 ++vcpu->stat.irq_exits;
5535 return 1;
5536 }
5537
5538 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5539 {
5540 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5541 vcpu->mmio_needed = 0;
5542 return 0;
5543 }
5544
5545 static int handle_io(struct kvm_vcpu *vcpu)
5546 {
5547 unsigned long exit_qualification;
5548 int size, in, string;
5549 unsigned port;
5550
5551 exit_qualification = vmx_get_exit_qual(vcpu);
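	/*
	 * Decode the I/O-instruction exit qualification: bits 2:0 hold the
	 * access size minus one, bit 3 the direction (1 = IN), bit 4 the
	 * string-instruction flag, and bits 31:16 the port number.
	 */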
5552 string = (exit_qualification & 16) != 0;
5553
5554 ++vcpu->stat.io_exits;
5555
5556 if (string)
5557 return kvm_emulate_instruction(vcpu, 0);
5558
5559 port = exit_qualification >> 16;
5560 size = (exit_qualification & 7) + 1;
5561 in = (exit_qualification & 8) != 0;
5562
5563 return kvm_fast_pio(vcpu, size, port, in);
5564 }
5565
5566 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5567 {
5568 /*
5569 * Patch in the VMCALL instruction:
5570 */
5571 hypercall[0] = 0x0f;
5572 hypercall[1] = 0x01;
5573 hypercall[2] = 0xc1;
5574 }
5575
5576 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5577 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5578 {
5579 if (is_guest_mode(vcpu)) {
5580 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5581 unsigned long orig_val = val;
5582
5583 /*
5584 * We get here when L2 changed cr0 in a way that did not change
5585 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5586 * but did change L0 shadowed bits. So we first calculate the
5587 * effective cr0 value that L1 would like to write into the
5588 * hardware. It consists of the L2-owned bits from the new
5589 * value combined with the L1-owned bits from L1's guest_cr0.
5590 */
5591 val = (val & ~vmcs12->cr0_guest_host_mask) |
5592 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5593
5594 if (kvm_set_cr0(vcpu, val))
5595 return 1;
5596 vmcs_writel(CR0_READ_SHADOW, orig_val);
5597 return 0;
5598 } else {
5599 return kvm_set_cr0(vcpu, val);
5600 }
5601 }
5602
5603 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5604 {
5605 if (is_guest_mode(vcpu)) {
5606 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5607 unsigned long orig_val = val;
5608
5609 /* analogously to handle_set_cr0 */
5610 val = (val & ~vmcs12->cr4_guest_host_mask) |
5611 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5612 if (kvm_set_cr4(vcpu, val))
5613 return 1;
5614 vmcs_writel(CR4_READ_SHADOW, orig_val);
5615 return 0;
5616 } else
5617 return kvm_set_cr4(vcpu, val);
5618 }
5619
5620 static int handle_desc(struct kvm_vcpu *vcpu)
5621 {
5622 /*
5623 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
5624 * and other code needs to be updated if UMIP can be guest owned.
5625 */
5626 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
5627
5628 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
5629 return kvm_emulate_instruction(vcpu, 0);
5630 }
5631
5632 static int handle_cr(struct kvm_vcpu *vcpu)
5633 {
5634 unsigned long exit_qualification, val;
5635 int cr;
5636 int reg;
5637 int err;
5638 int ret;
5639
5640 exit_qualification = vmx_get_exit_qual(vcpu);
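	/*
	 * Decode the CR-access exit qualification: bits 3:0 hold the control
	 * register number, bits 5:4 the access type (MOV to CR, MOV from CR,
	 * CLTS, or LMSW), bits 11:8 the GPR operand, and bits 31:16 the LMSW
	 * source data.
	 */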
5641 cr = exit_qualification & 15;
5642 reg = (exit_qualification >> 8) & 15;
5643 switch ((exit_qualification >> 4) & 3) {
5644 case 0: /* mov to cr */
5645 val = kvm_register_read(vcpu, reg);
5646 trace_kvm_cr_write(cr, val);
5647 switch (cr) {
5648 case 0:
5649 err = handle_set_cr0(vcpu, val);
5650 return kvm_complete_insn_gp(vcpu, err);
5651 case 3:
5652 WARN_ON_ONCE(enable_unrestricted_guest);
5653
5654 err = kvm_set_cr3(vcpu, val);
5655 return kvm_complete_insn_gp(vcpu, err);
5656 case 4:
5657 err = handle_set_cr4(vcpu, val);
5658 return kvm_complete_insn_gp(vcpu, err);
5659 case 8: {
5660 u8 cr8_prev = kvm_get_cr8(vcpu);
5661 u8 cr8 = (u8)val;
5662 err = kvm_set_cr8(vcpu, cr8);
5663 ret = kvm_complete_insn_gp(vcpu, err);
5664 if (lapic_in_kernel(vcpu))
5665 return ret;
5666 if (cr8_prev <= cr8)
5667 return ret;
5668 /*
5669 * TODO: we might be squashing a
5670 * KVM_GUESTDBG_SINGLESTEP-triggered
5671 * KVM_EXIT_DEBUG here.
5672 */
5673 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5674 return 0;
5675 }
5676 }
5677 break;
5678 case 2: /* clts */
5679 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
5680 return -EIO;
5681 case 1: /*mov from cr*/
5682 switch (cr) {
5683 case 3:
5684 WARN_ON_ONCE(enable_unrestricted_guest);
5685
5686 val = kvm_read_cr3(vcpu);
5687 kvm_register_write(vcpu, reg, val);
5688 trace_kvm_cr_read(cr, val);
5689 return kvm_skip_emulated_instruction(vcpu);
5690 case 8:
5691 val = kvm_get_cr8(vcpu);
5692 kvm_register_write(vcpu, reg, val);
5693 trace_kvm_cr_read(cr, val);
5694 return kvm_skip_emulated_instruction(vcpu);
5695 }
5696 break;
5697 case 3: /* lmsw */
5698 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5699 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
5700 kvm_lmsw(vcpu, val);
5701
5702 return kvm_skip_emulated_instruction(vcpu);
5703 default:
5704 break;
5705 }
5706 vcpu->run->exit_reason = 0;
5707 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5708 (int)(exit_qualification >> 4) & 3, cr);
5709 return 0;
5710 }
5711
5712 static int handle_dr(struct kvm_vcpu *vcpu)
5713 {
5714 unsigned long exit_qualification;
5715 int dr, dr7, reg;
5716 int err = 1;
5717
5718 exit_qualification = vmx_get_exit_qual(vcpu);
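	/*
	 * The DR-access exit qualification encodes the debug register number
	 * in bits 2:0, the direction in bit 4 (1 = MOV from DR), and the GPR
	 * operand in bits 11:8.
	 */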
5719 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5720
5721 /* First, if DR does not exist, trigger UD */
5722 if (!kvm_require_dr(vcpu, dr))
5723 return 1;
5724
5725 if (vmx_get_cpl(vcpu) > 0)
5726 goto out;
5727
5728 dr7 = vmcs_readl(GUEST_DR7);
5729 if (dr7 & DR7_GD) {
5730 /*
5731 * As the vm-exit takes precedence over the debug trap, we
5732 * need to emulate the latter, either for the host or the
5733 * guest debugging itself.
5734 */
5735 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5736 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
5737 vcpu->run->debug.arch.dr7 = dr7;
5738 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5739 vcpu->run->debug.arch.exception = DB_VECTOR;
5740 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5741 return 0;
5742 } else {
5743 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5744 return 1;
5745 }
5746 }
5747
5748 if (vcpu->guest_debug == 0) {
5749 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5750
5751 /*
5752 * No more DR vmexits; force a reload of the debug registers
5753 * and reenter on this instruction. The next vmexit will
5754 * retrieve the full state of the debug registers.
5755 */
5756 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5757 return 1;
5758 }
5759
5760 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5761 if (exit_qualification & TYPE_MOV_FROM_DR) {
5762 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
5763 err = 0;
5764 } else {
5765 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
5766 }
5767
5768 out:
5769 return kvm_complete_insn_gp(vcpu, err);
5770 }
5771
5772 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5773 {
5774 get_debugreg(vcpu->arch.db[0], 0);
5775 get_debugreg(vcpu->arch.db[1], 1);
5776 get_debugreg(vcpu->arch.db[2], 2);
5777 get_debugreg(vcpu->arch.db[3], 3);
5778 get_debugreg(vcpu->arch.dr6, 6);
5779 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5780
5781 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5782 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5783
5784 /*
5785 * exc_debug expects dr6 to be cleared after it runs; avoid letting it
5786 * see a stale dr6 from the guest.
5787 */
5788 set_debugreg(DR6_RESERVED, 6);
5789 }
5790
5791 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5792 {
5793 vmcs_writel(GUEST_DR7, val);
5794 }
5795
5796 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5797 {
5798 kvm_apic_update_ppr(vcpu);
5799 return 1;
5800 }
5801
5802 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5803 {
5804 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5805
5806 kvm_make_request(KVM_REQ_EVENT, vcpu);
5807
5808 ++vcpu->stat.irq_window_exits;
5809 return 1;
5810 }
5811
5812 static int handle_invlpg(struct kvm_vcpu *vcpu)
5813 {
5814 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5815
5816 kvm_mmu_invlpg(vcpu, exit_qualification);
5817 return kvm_skip_emulated_instruction(vcpu);
5818 }
5819
5820 static int handle_apic_access(struct kvm_vcpu *vcpu)
5821 {
5822 if (likely(fasteoi)) {
5823 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5824 int access_type, offset;
5825
5826 access_type = exit_qualification & APIC_ACCESS_TYPE;
5827 offset = exit_qualification & APIC_ACCESS_OFFSET;
5828 /*
5829 * A sane guest uses MOV to write the EOI register; the written
5830 * value doesn't matter. Short-circuit that case here to avoid
5831 * heavy instruction emulation.
5832 */
5833 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5834 (offset == APIC_EOI)) {
5835 kvm_lapic_set_eoi(vcpu);
5836 return kvm_skip_emulated_instruction(vcpu);
5837 }
5838 }
5839 return kvm_emulate_instruction(vcpu, 0);
5840 }
5841
5842 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5843 {
5844 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5845 int vector = exit_qualification & 0xff;
5846
5847 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5848 kvm_apic_set_eoi_accelerated(vcpu, vector);
5849 return 1;
5850 }
5851
5852 static int handle_apic_write(struct kvm_vcpu *vcpu)
5853 {
5854 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5855
5856 /*
5857 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5858 * hardware has done any necessary aliasing, offset adjustments, etc...
5859 * for the access. I.e. the correct value has already been written to
5860 * the vAPIC page for the correct 16-byte chunk. KVM needs only to
5861 * retrieve the register value and emulate the access.
5862 */
5863 u32 offset = exit_qualification & 0xff0;
5864
5865 kvm_apic_write_nodecode(vcpu, offset);
5866 return 1;
5867 }
5868
5869 static int handle_task_switch(struct kvm_vcpu *vcpu)
5870 {
5871 struct vcpu_vmx *vmx = to_vmx(vcpu);
5872 unsigned long exit_qualification;
5873 bool has_error_code = false;
5874 u32 error_code = 0;
5875 u16 tss_selector;
5876 int reason, type, idt_v, idt_index;
5877
5878 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5879 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5880 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5881
5882 exit_qualification = vmx_get_exit_qual(vcpu);
5883
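	/*
	 * Bits 31:30 of the exit qualification identify the task-switch
	 * source (CALL, IRET, JMP, or a task gate in the IDT); bits 15:0
	 * hold the selector of the new task's TSS.
	 */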
5884 reason = (u32)exit_qualification >> 30;
5885 if (reason == TASK_SWITCH_GATE && idt_v) {
5886 switch (type) {
5887 case INTR_TYPE_NMI_INTR:
5888 vcpu->arch.nmi_injected = false;
5889 vmx_set_nmi_mask(vcpu, true);
5890 break;
5891 case INTR_TYPE_EXT_INTR:
5892 case INTR_TYPE_SOFT_INTR:
5893 kvm_clear_interrupt_queue(vcpu);
5894 break;
5895 case INTR_TYPE_HARD_EXCEPTION:
5896 if (vmx->idt_vectoring_info &
5897 VECTORING_INFO_DELIVER_CODE_MASK) {
5898 has_error_code = true;
5899 error_code =
5900 vmcs_read32(IDT_VECTORING_ERROR_CODE);
5901 }
5902 fallthrough;
5903 case INTR_TYPE_SOFT_EXCEPTION:
5904 kvm_clear_exception_queue(vcpu);
5905 break;
5906 default:
5907 break;
5908 }
5909 }
5910 tss_selector = exit_qualification;
5911
5912 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5913 type != INTR_TYPE_EXT_INTR &&
5914 type != INTR_TYPE_NMI_INTR))
5915 WARN_ON(!skip_emulated_instruction(vcpu));
5916
5917 /*
5918 * TODO: What about debug traps on tss switch?
5919 * Are we supposed to inject them and update dr6?
5920 */
5921 return kvm_task_switch(vcpu, tss_selector,
5922 type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5923 reason, has_error_code, error_code);
5924 }
5925
5926 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5927 {
5928 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5929 gpa_t gpa;
5930
5931 /*
5932 * If the EPT violation happened while executing IRET from an NMI, the
5933 * "blocked by NMI" bit has to be set before the next VM entry.
5934 * There are errata that may cause this bit to not be set:
5935 * AAK134, BY25.
5936 */
5937 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5938 enable_vnmi &&
5939 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5940 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5941
5942 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5943 trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5944
5945 /*
5946 * Check that the GPA doesn't exceed physical memory limits, as that is
5947 * a guest page fault. We have to emulate the instruction here, because
5948 * if the illegal address is that of a paging structure, then
5949 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5950 * would also use advanced VM-exit information for EPT violations to
5951 * reconstruct the page fault error code.
5952 */
5953 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
5954 return kvm_emulate_instruction(vcpu, 0);
5955
5956 return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
5957 }
5958
5959 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5960 {
5961 gpa_t gpa;
5962
5963 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5964 return 1;
5965
5966 /*
5967 * A nested guest cannot optimize MMIO vmexits, because we have an
5968 * nGPA here instead of the required GPA.
5969 */
5970 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5971 if (!is_guest_mode(vcpu) &&
5972 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5973 trace_kvm_fast_mmio(gpa);
5974 return kvm_skip_emulated_instruction(vcpu);
5975 }
5976
5977 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5978 }
5979
5980 static int handle_nmi_window(struct kvm_vcpu *vcpu)
5981 {
5982 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5983 return -EIO;
5984
5985 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5986 ++vcpu->stat.nmi_window_exits;
5987 kvm_make_request(KVM_REQ_EVENT, vcpu);
5988
5989 return 1;
5990 }
5991
5992 /*
5993 * Returns true if emulation is required (due to the vCPU having invalid state
5994 * with unrestricted guest mode disabled) and KVM can't faithfully emulate the
5995 * current vCPU state.
5996 */
5997 static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
5998 {
5999 struct vcpu_vmx *vmx = to_vmx(vcpu);
6000
6001 if (!vmx->vt.emulation_required)
6002 return false;
6003
6004 /*
6005 * It is architecturally impossible for emulation to be required when a
6006 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
6007 * guest state is invalid and unrestricted guest is disabled, i.e. KVM
6008 * should synthesize VM-Fail instead of emulating L2 code. This path is
6009 * only reachable if userspace modifies L2 guest state after KVM has
6010 * performed the nested VM-Enter consistency checks.
6011 */
6012 if (vcpu->arch.nested_run_pending)
6013 return true;
6014
6015 /*
6016 * KVM only supports emulating exceptions if the vCPU is in Real Mode.
6017 * If emulation is required, KVM can't perform a successful VM-Enter to
6018 * inject the exception.
6019 */
6020 return !vmx->rmode.vm86_active &&
6021 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
6022 }
6023
6024 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6025 {
6026 struct vcpu_vmx *vmx = to_vmx(vcpu);
6027 bool intr_window_requested;
6028 unsigned count = 130;
6029
6030 intr_window_requested = exec_controls_get(vmx) &
6031 CPU_BASED_INTR_WINDOW_EXITING;
6032
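	/*
	 * Bound the number of instructions emulated per exit so that pending
	 * interrupt windows, events, and host work requests get a chance to
	 * be serviced between batches of emulation.
	 */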
6033 while (vmx->vt.emulation_required && count-- != 0) {
6034 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
6035 return handle_interrupt_window(&vmx->vcpu);
6036
6037 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
6038 return 1;
6039
6040 /*
6041 * Ensure that any updates to kvm->buses[] observed by the
6042 * previous instruction (emulated or otherwise) are also
6043 * visible to the instruction KVM is about to emulate.
6044 */
6045 smp_rmb();
6046
6047 if (!kvm_emulate_instruction(vcpu, 0))
6048 return 0;
6049
6050 if (vmx_unhandleable_emulation_required(vcpu)) {
6051 kvm_prepare_emulation_failure_exit(vcpu);
6052 return 0;
6053 }
6054
6055 if (vcpu->arch.halt_request) {
6056 vcpu->arch.halt_request = 0;
6057 return kvm_emulate_halt_noskip(vcpu);
6058 }
6059
6060 /*
6061 * Note, return 1 and not 0, vcpu_run() will invoke
6062 * xfer_to_guest_mode() which will create a proper return
6063 * code.
6064 */
6065 if (__xfer_to_guest_mode_work_pending())
6066 return 1;
6067 }
6068
6069 return 1;
6070 }
6071
6072 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
6073 {
6074 if (vmx_unhandleable_emulation_required(vcpu)) {
6075 kvm_prepare_emulation_failure_exit(vcpu);
6076 return 0;
6077 }
6078
6079 return 1;
6080 }
6081
6082 /*
6083 * Indicate a busy-waiting vCPU in a spinlock. We do not enable PAUSE
6084 * exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
6085 */
6086 static int handle_pause(struct kvm_vcpu *vcpu)
6087 {
6088 if (!kvm_pause_in_guest(vcpu->kvm))
6089 grow_ple_window(vcpu);
6090
6091 /*
6092 * The Intel SDM, Vol. 3, Section 25.1.3 says: the "PAUSE-loop exiting"
6093 * VM-execution control is ignored if CPL > 0. OTOH, KVM
6094 * never sets PAUSE_EXITING and only sets PLE if supported,
6095 * so the vCPU must be at CPL 0 if it gets a PAUSE exit.
6096 */
6097 kvm_vcpu_on_spin(vcpu, true);
6098 return kvm_skip_emulated_instruction(vcpu);
6099 }
6100
6101 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6102 {
6103 return 1;
6104 }
6105
6106 static int handle_invpcid(struct kvm_vcpu *vcpu)
6107 {
6108 u32 vmx_instruction_info;
6109 unsigned long type;
6110 gva_t gva;
6111 struct {
6112 u64 pcid;
6113 u64 gla;
6114 } operand;
6115 int gpr_index;
6116
6117 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
6118 kvm_queue_exception(vcpu, UD_VECTOR);
6119 return 1;
6120 }
6121
6122 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6123 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6124 type = kvm_register_read(vcpu, gpr_index);
6125
6126 /* According to the Intel instruction reference, the memory operand
6127 * is read even if it isn't needed (e.g., for type==all)
6128 */
6129 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6130 vmx_instruction_info, false,
6131 sizeof(operand), &gva))
6132 return 1;
6133
6134 return kvm_handle_invpcid(vcpu, type, gva);
6135 }
6136
6137 static int handle_pml_full(struct kvm_vcpu *vcpu)
6138 {
6139 unsigned long exit_qualification;
6140
6141 trace_kvm_pml_full(vcpu->vcpu_id);
6142
6143 exit_qualification = vmx_get_exit_qual(vcpu);
6144
6145 /*
6146 * If the PML-buffer-full exit happened while executing IRET from an NMI,
6147 * the "blocked by NMI" bit has to be set before the next VM entry.
6148 */
6149 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6150 enable_vnmi &&
6151 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
6152 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6153 GUEST_INTR_STATE_NMI);
6154
6155 /*
6156 * The PML buffer was already flushed at the beginning of the VM-Exit.
6157 * Nothing to do here, and there's no userspace involvement needed for PML.
6158 */
6159 return 1;
6160 }
6161
6162 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
6163 bool force_immediate_exit)
6164 {
6165 struct vcpu_vmx *vmx = to_vmx(vcpu);
6166
6167 /*
6168 * In the *extremely* unlikely scenario that this is a spurious VM-Exit
6169 * due to the timer expiring while it was "soft" disabled, just eat the
6170 * exit and re-enter the guest.
6171 */
6172 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
6173 return EXIT_FASTPATH_REENTER_GUEST;
6174
6175 /*
6176 * If the timer expired because KVM used it to force an immediate exit,
6177 * then mission accomplished.
6178 */
6179 if (force_immediate_exit)
6180 return EXIT_FASTPATH_EXIT_HANDLED;
6181
6182 /*
6183 * If L2 is active, go down the slow path as emulating the guest timer
6184 * expiration likely requires synthesizing a nested VM-Exit.
6185 */
6186 if (is_guest_mode(vcpu))
6187 return EXIT_FASTPATH_NONE;
6188
6189 kvm_lapic_expired_hv_timer(vcpu);
6190 return EXIT_FASTPATH_REENTER_GUEST;
6191 }
6192
6193 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6194 {
6195 /*
6196 * This non-fastpath handler is reached if and only if the preemption
6197 * timer was being used to emulate a guest timer while L2 is active.
6198 * All other scenarios are supposed to be handled in the fastpath.
6199 */
6200 WARN_ON_ONCE(!is_guest_mode(vcpu));
6201 kvm_lapic_expired_hv_timer(vcpu);
6202 return 1;
6203 }
6204
6205 /*
6206 * When nested=0, all VMX instruction VM Exits filter here. The handlers
6207 * are overwritten by nested_vmx_hardware_setup() when nested=1.
6208 */
6209 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6210 {
6211 kvm_queue_exception(vcpu, UD_VECTOR);
6212 return 1;
6213 }
6214
6215 static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
6216 {
6217 kvm_queue_exception(vcpu, UD_VECTOR);
6218 return 1;
6219 }
6220
6221 #ifndef CONFIG_X86_SGX_KVM
6222 static int handle_encls(struct kvm_vcpu *vcpu)
6223 {
6224 /*
6225 * SGX virtualization is disabled. There is no software enable bit for
6226 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6227 * the guest from executing ENCLS (when SGX is supported by hardware).
6228 */
6229 kvm_queue_exception(vcpu, UD_VECTOR);
6230 return 1;
6231 }
6232 #endif /* CONFIG_X86_SGX_KVM */
6233
6234 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6235 {
6236 /*
6237 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6238 * VM-Exits. Unconditionally set the flag here and leave the handling to
6239 * vmx_handle_exit().
6240 */
6241 to_vt(vcpu)->exit_reason.bus_lock_detected = true;
6242 return 1;
6243 }
6244
6245 static int handle_notify(struct kvm_vcpu *vcpu)
6246 {
6247 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6248 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6249
6250 ++vcpu->stat.notify_window_exits;
6251
6252 /*
6253 * If the notify VM exit happened while executing IRET from an NMI, the
6254 * "blocked by NMI" bit has to be set before the next VM entry.
6255 */
6256 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6257 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6258 GUEST_INTR_STATE_NMI);
6259
6260 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6261 context_invalid) {
6262 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6263 vcpu->run->notify.flags = context_invalid ?
6264 KVM_NOTIFY_CONTEXT_INVALID : 0;
6265 return 0;
6266 }
6267
6268 return 1;
6269 }
6270
6271 static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
6272 {
6273 return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
6274 }
6275
6276 static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
6277 {
6278 return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
6279 vmx_get_msr_imm_reg(vcpu));
6280 }
6281
6282 static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
6283 {
6284 return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
6285 vmx_get_msr_imm_reg(vcpu));
6286 }
6287
6288 /*
6289 * The exit handlers return 1 if the exit was handled fully and guest execution
6290 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
6291 * to be done to userspace and return 0.
6292 */
6293 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6294 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
6295 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6296 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6297 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6298 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6299 [EXIT_REASON_CR_ACCESS] = handle_cr,
6300 [EXIT_REASON_DR_ACCESS] = handle_dr,
6301 [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6302 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6303 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
6304 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
6305 [EXIT_REASON_HLT] = kvm_emulate_halt,
6306 [EXIT_REASON_INVD] = kvm_emulate_invd,
6307 [EXIT_REASON_INVLPG] = handle_invlpg,
6308 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
6309 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
6310 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6311 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6312 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6313 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6314 [EXIT_REASON_VMREAD] = handle_vmx_instruction,
6315 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6316 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6317 [EXIT_REASON_VMOFF] = handle_vmx_instruction,
6318 [EXIT_REASON_VMON] = handle_vmx_instruction,
6319 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6320 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6321 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
6322 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
6323 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
6324 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
6325 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6326 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6327 [EXIT_REASON_GDTR_IDTR] = handle_desc,
6328 [EXIT_REASON_LDTR_TR] = handle_desc,
6329 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6330 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6331 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6332 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
6333 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
6334 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
6335 [EXIT_REASON_INVEPT] = handle_vmx_instruction,
6336 [EXIT_REASON_INVVPID] = handle_vmx_instruction,
6337 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
6338 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
6339 [EXIT_REASON_PML_FULL] = handle_pml_full,
6340 [EXIT_REASON_INVPCID] = handle_invpcid,
6341 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6342 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6343 [EXIT_REASON_ENCLS] = handle_encls,
6344 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
6345 [EXIT_REASON_NOTIFY] = handle_notify,
6346 [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
6347 [EXIT_REASON_TDCALL] = handle_tdx_instruction,
6348 [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
6349 [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
6350 };
6351
6352 static const int kvm_vmx_max_exit_handlers =
6353 ARRAY_SIZE(kvm_vmx_exit_handlers);
6354
6355 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6356 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
6357 {
6358 struct vcpu_vmx *vmx = to_vmx(vcpu);
6359
6360 *reason = vmx->vt.exit_reason.full;
6361 *info1 = vmx_get_exit_qual(vcpu);
6362 if (!(vmx->vt.exit_reason.failed_vmentry)) {
6363 *info2 = vmx->idt_vectoring_info;
6364 *intr_info = vmx_get_intr_info(vcpu);
6365 if (is_exception_with_error_code(*intr_info))
6366 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6367 else
6368 *error_code = 0;
6369 } else {
6370 *info2 = 0;
6371 *intr_info = 0;
6372 *error_code = 0;
6373 }
6374 }
6375
6376 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
6377 {
6378 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
6379 if (is_exception_with_error_code(*intr_info))
6380 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
6381 else
6382 *error_code = 0;
6383 }
6384
6385 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6386 {
6387 if (vmx->pml_pg) {
6388 __free_page(vmx->pml_pg);
6389 vmx->pml_pg = NULL;
6390 }
6391 }
6392
6393 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6394 {
6395 struct vcpu_vmx *vmx = to_vmx(vcpu);
6396 u16 pml_idx, pml_tail_index;
6397 u64 *pml_buf;
6398 int i;
6399
6400 pml_idx = vmcs_read16(GUEST_PML_INDEX);
6401
6402 /* Do nothing if PML buffer is empty */
6403 if (pml_idx == PML_HEAD_INDEX)
6404 return;
6405 /*
6406 * PML index always points to the next available PML buffer entry
6407 * unless PML log has just overflowed.
6408 */
6409 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
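	/*
	 * On overflow the hardware index wraps past zero (i.e. is >= the
	 * number of entries), meaning the entire buffer is valid and the
	 * tail is entry 0; otherwise entries pml_idx + 1 through 511 hold
	 * logged GPAs.
	 */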
6410
6411 /*
6412 * The PML log is written backwards: the CPU writes entry 511 first,
6413 * then entry 510, and so on.
6414 *
6415 * Read the entries in the same order they were written, to ensure that
6416 * the dirty ring is filled in the same order the CPU wrote them.
6417 */
6418 pml_buf = page_address(vmx->pml_pg);
6419
6420 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
6421 u64 gpa;
6422
6423 gpa = pml_buf[i];
6424 WARN_ON(gpa & (PAGE_SIZE - 1));
6425 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6426 }
6427
6428 /* reset PML index */
6429 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
6430 }
6431
6432 static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
6433 {
6434 struct vcpu_vmx *vmx = to_vmx(vcpu);
6435
6436 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map);
6437 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map);
6438 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map);
6439 }
6440
6441 static void vmx_dump_sel(char *name, uint32_t sel)
6442 {
6443 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6444 name, vmcs_read16(sel),
6445 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6446 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6447 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6448 }
6449
6450 static void vmx_dump_dtsel(char *name, uint32_t limit)
6451 {
6452 pr_err("%s limit=0x%08x, base=0x%016lx\n",
6453 name, vmcs_read32(limit),
6454 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6455 }
6456
6457 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6458 {
6459 unsigned int i;
6460 struct vmx_msr_entry *e;
6461
6462 pr_err("MSR %s:\n", name);
6463 for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6464 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6465 }
6466
6467 void dump_vmcs(struct kvm_vcpu *vcpu)
6468 {
6469 struct vcpu_vmx *vmx = to_vmx(vcpu);
6470 u32 vmentry_ctl, vmexit_ctl;
6471 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6472 u64 tertiary_exec_control;
6473 unsigned long cr4;
6474 int efer_slot;
6475
6476 if (!dump_invalid_vmcs) {
6477 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6478 return;
6479 }
6480
6481 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6482 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6483 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6484 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6485 cr4 = vmcs_readl(GUEST_CR4);
6486
6487 if (cpu_has_secondary_exec_ctrls())
6488 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6489 else
6490 secondary_exec_control = 0;
6491
6492 if (cpu_has_tertiary_exec_ctrls())
6493 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6494 else
6495 tertiary_exec_control = 0;
6496
6497 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6498 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6499 pr_err("*** Guest State ***\n");
6500 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6501 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6502 vmcs_readl(CR0_GUEST_HOST_MASK));
6503 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6504 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6505 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6506 if (cpu_has_vmx_ept()) {
6507 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6508 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6509 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6510 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6511 }
6512 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6513 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6514 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6515 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6516 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6517 vmcs_readl(GUEST_SYSENTER_ESP),
6518 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6519 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6520 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6521 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6522 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6523 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6524 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6525 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6526 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6527 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6528 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
6529 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6530 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6531 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6532 else if (efer_slot >= 0)
6533 pr_err("EFER= 0x%016llx (autoload)\n",
6534 vmx->msr_autoload.guest.val[efer_slot].value);
6535 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6536 pr_err("EFER= 0x%016llx (effective)\n",
6537 vcpu->arch.efer | (EFER_LMA | EFER_LME));
6538 else
6539 pr_err("EFER= 0x%016llx (effective)\n",
6540 vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6541 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6542 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6543 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6544 vmcs_read64(GUEST_IA32_DEBUGCTL),
6545 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6546 if (cpu_has_load_perf_global_ctrl() &&
6547 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6548 pr_err("PerfGlobCtl = 0x%016llx\n",
6549 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6550 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6551 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6552 pr_err("Interruptibility = %08x ActivityState = %08x\n",
6553 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6554 vmcs_read32(GUEST_ACTIVITY_STATE));
6555 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6556 pr_err("InterruptStatus = %04x\n",
6557 vmcs_read16(GUEST_INTR_STATUS));
6558 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6559 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6560 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6561 vmx_dump_msrs("autostore", &vmx->msr_autostore);
6562
6563 if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
6564 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
6565 vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
6566 vmcs_readl(GUEST_INTR_SSP_TABLE));
6567 pr_err("*** Host State ***\n");
6568 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6569 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6570 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6571 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6572 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6573 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6574 vmcs_read16(HOST_TR_SELECTOR));
6575 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6576 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6577 vmcs_readl(HOST_TR_BASE));
6578 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6579 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6580 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6581 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6582 vmcs_readl(HOST_CR4));
6583 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6584 vmcs_readl(HOST_IA32_SYSENTER_ESP),
6585 vmcs_read32(HOST_IA32_SYSENTER_CS),
6586 vmcs_readl(HOST_IA32_SYSENTER_EIP));
6587 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6588 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6589 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6590 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6591 if (cpu_has_load_perf_global_ctrl() &&
6592 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6593 pr_err("PerfGlobCtl = 0x%016llx\n",
6594 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6595 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6596 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6597 if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
6598 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
6599 vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
6600 vmcs_readl(HOST_INTR_SSP_TABLE));
6601
6602 pr_err("*** Control State ***\n");
6603 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6604 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6605 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6606 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6607 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6608 vmcs_read32(EXCEPTION_BITMAP),
6609 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6610 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6611 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6612 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6613 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6614 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6615 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6616 vmcs_read32(VM_EXIT_INTR_INFO),
6617 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6618 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6619 pr_err(" reason=%08x qualification=%016lx\n",
6620 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6621 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6622 vmcs_read32(IDT_VECTORING_INFO_FIELD),
6623 vmcs_read32(IDT_VECTORING_ERROR_CODE));
6624 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6625 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6626 pr_err("TSC Multiplier = 0x%016llx\n",
6627 vmcs_read64(TSC_MULTIPLIER));
6628 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6629 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6630 u16 status = vmcs_read16(GUEST_INTR_STATUS);
6631 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6632 }
6633 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6634 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6635 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6636 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6637 }
6638 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6639 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6640 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6641 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6642 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6643 pr_err("PLE Gap=%08x Window=%08x\n",
6644 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6645 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6646 pr_err("Virtual processor ID = 0x%04x\n",
6647 vmcs_read16(VIRTUAL_PROCESSOR_ID));
6648 if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
6649 struct vmx_ve_information *ve_info = vmx->ve_info;
6650 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
6651
6652 /*
6653 * If KVM is dumping the VMCS, then something has gone wrong
6654 * already. Dereferencing an address from the VMCS, which could
6655 * very well be corrupted, is a terrible idea. The virtual
6656 * address is known so use it.
6657 */
6658 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
6659 ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
6660 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
6661 ve_info->exit_reason, ve_info->delivery,
6662 ve_info->exit_qualification,
6663 ve_info->guest_linear_address,
6664 ve_info->guest_physical_address, ve_info->eptp_index);
6665 }
6666 }
6667
6668 /*
6669 * The guest has exited. See if we can fix it or if we need userspace
6670 * assistance.
6671 */
6672 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6673 {
6674 struct vcpu_vmx *vmx = to_vmx(vcpu);
6675 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
6676 u32 vectoring_info = vmx->idt_vectoring_info;
6677 u16 exit_handler_index;
6678
6679 /*
6680 * Flush the PML buffer of logged GPAs so that dirty_bitmap is more
6681 * up to date. A nice side effect is that kvm_vm_ioctl_get_dirty_log
6682 * only needs to kick all vCPUs out of guest mode before querying
6683 * dirty_bitmap, as the PML buffer must already have been flushed by
6684 * the time a vCPU is back in root mode. Note, PML is never enabled
6685 * in hardware while running L2.
6686 */
6687 if (enable_pml && !is_guest_mode(vcpu))
6688 vmx_flush_pml_buffer(vcpu);
6689
6690 /*
6691 * KVM should never reach this point with a pending nested VM-Enter.
6692 * More specifically, short-circuiting VM-Entry to emulate L2 due to
6693 * invalid guest state should never happen as that means KVM knowingly
6694 * allowed a nested VM-Enter with an invalid vmcs12. More below.
6695 */
6696 if (KVM_BUG_ON(vcpu->arch.nested_run_pending, vcpu->kvm))
6697 return -EIO;
6698
6699 if (is_guest_mode(vcpu)) {
6700 /*
6701 * PML is never enabled when running L2, bail immediately if a
6702 * PML full exit occurs as something is horribly wrong.
6703 */
6704 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6705 goto unexpected_vmexit;
6706
6707 /*
6708 * The host physical addresses of some pages of guest memory
6709 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6710 * Page). The CPU may write to these pages via their host
6711 * physical address while L2 is running, bypassing any
6712 * address-translation-based dirty tracking (e.g. EPT write
6713 * protection).
6714 *
6715 * Mark them dirty on every exit from L2 to prevent them from
6716 * getting out of sync with dirty tracking.
6717 */
6718 nested_vmx_mark_all_vmcs12_pages_dirty(vcpu);
6719
6720 /*
6721 * Synthesize a triple fault if L2 state is invalid. In normal
6722 * operation, nested VM-Enter rejects any attempt to enter L2
6723 * with invalid state. However, those checks are skipped if
6724 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6725 * L2 state is invalid, it means either L1 modified SMRAM state
6726 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
6727 * doing so is architecturally allowed in the RSM case, and is
6728 * the least awful solution for the userspace case without
6729 * risking false positives.
6730 */
6731 if (vmx->vt.emulation_required) {
6732 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6733 return 1;
6734 }
6735
6736 if (nested_vmx_reflect_vmexit(vcpu))
6737 return 1;
6738 }
6739
6740 /* If guest state is invalid, start emulating. L2 is handled above. */
6741 if (vmx->vt.emulation_required)
6742 return handle_invalid_guest_state(vcpu);
6743
6744 if (exit_reason.failed_vmentry) {
6745 dump_vmcs(vcpu);
6746 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6747 vcpu->run->fail_entry.hardware_entry_failure_reason
6748 = exit_reason.full;
6749 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6750 return 0;
6751 }
6752
6753 if (unlikely(vmx->fail)) {
6754 dump_vmcs(vcpu);
6755 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6756 vcpu->run->fail_entry.hardware_entry_failure_reason
6757 = vmcs_read32(VM_INSTRUCTION_ERROR);
6758 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6759 return 0;
6760 }
6761
6762 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6763 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6764 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6765 exit_reason.basic != EXIT_REASON_PML_FULL &&
6766 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6767 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6768 exit_reason.basic != EXIT_REASON_NOTIFY &&
6769 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
6770 kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
6771 return 0;
6772 }
6773
6774 if (unlikely(!enable_vnmi &&
6775 vmx->loaded_vmcs->soft_vnmi_blocked)) {
6776 if (!vmx_interrupt_blocked(vcpu)) {
6777 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6778 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6779 vcpu->arch.nmi_pending) {
6780 /*
6781 * This CPU doesn't help us find the end of an
6782 * NMI-blocked window if the guest runs with IRQs
6783 * disabled. So we pull the trigger after 1 s of
6784 * futile waiting, but inform the user about this.
6785 */
6786 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6787 "state on VCPU %d after 1 s timeout\n",
6788 __func__, vcpu->vcpu_id);
6789 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6790 }
6791 }
6792
6793 if (exit_fastpath != EXIT_FASTPATH_NONE)
6794 return 1;
6795
6796 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6797 goto unexpected_vmexit;
6798 #ifdef CONFIG_MITIGATION_RETPOLINE
6799 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6800 return kvm_emulate_wrmsr(vcpu);
6801 else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
6802 return handle_wrmsr_imm(vcpu);
6803 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6804 return handle_preemption_timer(vcpu);
6805 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6806 return handle_interrupt_window(vcpu);
6807 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6808 return handle_external_interrupt(vcpu);
6809 else if (exit_reason.basic == EXIT_REASON_HLT)
6810 return kvm_emulate_halt(vcpu);
6811 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6812 return handle_ept_misconfig(vcpu);
6813 #endif
6814
6815 exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6816 kvm_vmx_max_exit_handlers);
6817 if (!kvm_vmx_exit_handlers[exit_handler_index])
6818 goto unexpected_vmexit;
6819
6820 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6821
6822 unexpected_vmexit:
6823 dump_vmcs(vcpu);
6824 kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
6825 return 0;
6826 }
6827
6828 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6829 {
6830 int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6831
6832 /*
6833 * Exit to user space when a bus lock is detected, in order to inform
6834 * userspace that a bus lock occurred in the guest.
6835 */
6836 if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
6837 if (ret > 0)
6838 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6839
6840 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6841 return 0;
6842 }
6843 return ret;
6844 }
6845
6846 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6847 {
6848 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6849 int tpr_threshold;
6850
6851 if (is_guest_mode(vcpu) &&
6852 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6853 return;
6854
6855 guard(vmx_vmcs01)(vcpu);
6856
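/*
 * A short note on the threshold choice below (tpr and irr appear to be
 * 4-bit priority classes here, as passed in by the common x86 code): if
 * nothing is pending (irr == -1) or the pending interrupt already
 * outranks the TPR (tpr < irr), a threshold of 0 avoids pointless
 * TPR-below-threshold exits; otherwise the threshold is set to the
 * pending priority so that a VM-Exit occurs once the guest lowers its
 * TPR far enough for the interrupt to become deliverable.
 */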
6857 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6858 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6859 }
6860
6861 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6862 {
6863 struct vcpu_vmx *vmx = to_vmx(vcpu);
6864 u32 sec_exec_control;
6865
6866 if (!lapic_in_kernel(vcpu))
6867 return;
6868
6869 if (!flexpriority_enabled &&
6870 !cpu_has_vmx_virtualize_x2apic_mode())
6871 return;
6872
6873 guard(vmx_vmcs01)(vcpu);
6874
6875 sec_exec_control = secondary_exec_controls_get(vmx);
6876 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6877 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6878
6879 switch (kvm_get_apic_mode(vcpu)) {
6880 case LAPIC_MODE_INVALID:
6881 WARN_ONCE(true, "Invalid local APIC state");
6882 break;
6883 case LAPIC_MODE_DISABLED:
6884 break;
6885 case LAPIC_MODE_XAPIC:
6886 if (flexpriority_enabled) {
6887 sec_exec_control |=
6888 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6889 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6890
6891 /*
6892 * Flush the TLB; reloading the APIC access page will
6893 * only do so if its physical address has changed, but
6894 * the guest may have inserted a non-APIC mapping into
6895 * the TLB while the APIC access page was disabled.
6896 *
6897 * If L2 is active, immediately flush L1's TLB instead
6898 * of requesting a flush of the current TLB, because
6899 * the current TLB context is L2's.
6900 */
6901 if (!is_guest_mode(vcpu))
6902 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6903 else if (!enable_ept)
6904 vpid_sync_context(vmx->vpid);
6905 else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa))
6906 vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa);
6907 }
6908 break;
6909 case LAPIC_MODE_X2APIC:
6910 if (cpu_has_vmx_virtualize_x2apic_mode())
6911 sec_exec_control |=
6912 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6913 break;
6914 }
6915 secondary_exec_controls_set(vmx, sec_exec_control);
6916
6917 vmx_update_msr_bitmap_x2apic(vcpu);
6918 }
6919
6920 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6921 {
6922 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
6923 struct kvm *kvm = vcpu->kvm;
6924 struct kvm_memslots *slots = kvm_memslots(kvm);
6925 struct kvm_memory_slot *slot;
6926 struct page *refcounted_page;
6927 unsigned long mmu_seq;
6928 kvm_pfn_t pfn;
6929 bool writable;
6930
6931 /* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */
6932 guard(vmx_vmcs01)(vcpu);
6933
6934 if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6935 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6936 return;
6937
6938 /*
6939 * Explicitly grab the memslot using KVM's internal slot ID to ensure
6940 * KVM doesn't unintentionally grab a userspace memslot. It _should_
6941 * be impossible for userspace to create a memslot for the APIC when
6942 * APICv is enabled, but paranoia won't hurt in this case.
6943 */
6944 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
6945 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
6946 return;
6947
6948 /*
6949 * Ensure that the mmu_notifier sequence count is read before KVM
6950 * retrieves the pfn from the primary MMU. Note, the memslot is
6951 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
6952 * in kvm_mmu_invalidate_end().
6953 */
6954 mmu_seq = kvm->mmu_invalidate_seq;
6955 smp_rmb();
6956
6957 /*
6958 * No need to retry if the memslot does not exist or is invalid. KVM
6959 * controls the APIC-access page memslot, and only deletes the memslot
6960 * if APICv is permanently inhibited, i.e. the memslot won't reappear.
6961 */
6962 pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
6963 if (is_error_noslot_pfn(pfn))
6964 return;
6965
6966 read_lock(&vcpu->kvm->mmu_lock);
6967 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
6968 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6969 else
6970 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
6971
6972 /*
6973 * Do not pin the APIC access page in memory so that it can be freely
6974 * migrated, the MMU notifier will call us again if it is migrated or
6975 * swapped out. KVM backs the memslot with anonymous memory, the pfn
6976 * should always point at a refcounted page (if the pfn is valid).
6977 */
6978 if (!WARN_ON_ONCE(!refcounted_page))
6979 kvm_release_page_clean(refcounted_page);
6980
6981 /*
6982 * No need for a manual TLB flush at this point, KVM has already done a
6983 * flush if there were SPTEs pointing at the previous page.
6984 */
6985 read_unlock(&vcpu->kvm->mmu_lock);
6986 }
6987
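/*
 * Note on the layout used by the two helpers below: GUEST_INTR_STATUS is
 * a 16-bit field whose low byte is RVI (requesting virtual interrupt) and
 * whose high byte is SVI (servicing virtual interrupt).
 * vmx_hwapic_isr_update() rewrites only the SVI byte, vmx_set_rvi()
 * rewrites only the RVI byte.
 */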
6988 void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6989 {
6990 u16 status;
6991 u8 old;
6992
6993 if (max_isr == -1)
6994 max_isr = 0;
6995
6996 /*
6997 * Always update SVI in vmcs01, as SVI is only relevant for L2 if and
6998 * only if Virtual Interrupt Delivery is enabled in vmcs12, and if VID
6999 * is enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC.
7000 */
7001 guard(vmx_vmcs01)(vcpu);
7002
7003 status = vmcs_read16(GUEST_INTR_STATUS);
7004 old = status >> 8;
7005 if (max_isr != old) {
7006 status &= 0xff;
7007 status |= max_isr << 8;
7008 vmcs_write16(GUEST_INTR_STATUS, status);
7009 }
7010 }
7011
7012 static void vmx_set_rvi(int vector)
7013 {
7014 u16 status;
7015 u8 old;
7016
7017 if (vector == -1)
7018 vector = 0;
7019
7020 status = vmcs_read16(GUEST_INTR_STATUS);
7021 old = (u8)status & 0xff;
7022 if ((u8)vector != old) {
7023 status &= ~0xff;
7024 status |= (u8)vector;
7025 vmcs_write16(GUEST_INTR_STATUS, status);
7026 }
7027 }
7028
7029 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
7030 {
7031 struct vcpu_vt *vt = to_vt(vcpu);
7032 int max_irr;
7033 bool got_posted_interrupt;
7034
7035 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
7036 return -EIO;
7037
7038 if (pi_test_on(&vt->pi_desc)) {
7039 pi_clear_on(&vt->pi_desc);
7040 /*
7041 * IOMMU can write to PID.ON, so the barrier matters even on UP.
7042 * But on x86 this is just a compiler barrier anyway.
7043 */
7044 smp_mb__after_atomic();
7045 got_posted_interrupt =
7046 kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
7047 } else {
7048 max_irr = kvm_lapic_find_highest_irr(vcpu);
7049 got_posted_interrupt = false;
7050 }
7051
7052 /*
7053 * Newly recognized interrupts are injected via either virtual interrupt
7054 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
7055 * disabled in two cases:
7056 *
7057 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
7058 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
7059 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
7060 * into L2, but KVM doesn't use virtual interrupt delivery to inject
7061 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
7062 *
7063 * 2) If APICv is disabled for this vCPU, assigned devices may still
7064 * attempt to post interrupts. The posted interrupt vector will cause
7065 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
7066 */
7067 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
7068 vmx_set_rvi(max_irr);
7069 else if (got_posted_interrupt)
7070 kvm_make_request(KVM_REQ_EVENT, vcpu);
7071
7072 return max_irr;
7073 }
7074
7075 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
7076 {
7077 if (!kvm_vcpu_apicv_active(vcpu))
7078 return;
7079
7080 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
7081 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
7082 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
7083 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
7084 }
7085
7086 void vmx_do_interrupt_irqoff(unsigned long entry);
7087 void vmx_do_nmi_irqoff(void);
7088
7089 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
7090 {
7091 /*
7092 * Save xfd_err to guest_fpu before interrupts are enabled, so the
7093 * MSR value is not clobbered by host activity before the guest
7094 * has a chance to consume it.
7095 *
7096 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
7097 * interception may have been caused by L1 interception. Per the SDM,
7098 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
7099 *
7100 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
7101 * unlike CR2 and DR6, the value is not a payload that is attached to
7102 * the #NM exception.
7103 */
7104 if (is_xfd_nm_fault(vcpu))
7105 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
7106 }
7107
7108 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
7109 {
7110 /* if exit due to PF check for async PF */
7111 if (is_page_fault(intr_info))
7112 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
7113 /* if exit due to NM, handle before interrupts are enabled */
7114 else if (is_nm_fault(intr_info))
7115 handle_nm_fault_irqoff(vcpu);
7116 /* Handle machine checks before interrupts are enabled */
7117 else if (is_machine_check(intr_info))
7118 kvm_machine_check();
7119 }
7120
7121 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
7122 u32 intr_info)
7123 {
7124 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
7125
7126 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
7127 "unexpected VM-Exit interrupt info: 0x%x", intr_info))
7128 return;
7129
7130 /*
7131 * Invoke the kernel's IRQ handler for the vector. Use the FRED path
7132 * when it's available even if FRED isn't fully enabled, e.g. even if
7133 * FRED isn't supported in hardware, in order to avoid the indirect
7134 * CALL in the non-FRED path.
7135 */
7136 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
7137 if (IS_ENABLED(CONFIG_X86_FRED))
7138 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
7139 else
7140 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
7141 kvm_after_interrupt(vcpu);
7142
7143 vcpu->arch.at_instruction_boundary = true;
7144 }
7145
7146 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
7147 {
7148 if (to_vt(vcpu)->emulation_required)
7149 return;
7150
7151 switch (vmx_get_exit_reason(vcpu).basic) {
7152 case EXIT_REASON_EXTERNAL_INTERRUPT:
7153 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
7154 break;
7155 case EXIT_REASON_EXCEPTION_NMI:
7156 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
7157 break;
7158 case EXIT_REASON_MCE_DURING_VMENTRY:
7159 kvm_machine_check();
7160 break;
7161 default:
7162 break;
7163 }
7164 }
7165
7166 /*
7167 * The kvm parameter can be NULL (module initialization, or invocation before
7168 * VM creation). Be sure to check the kvm parameter before using it.
7169 */
7170 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
7171 {
7172 switch (index) {
7173 case MSR_IA32_SMBASE:
7174 if (!IS_ENABLED(CONFIG_KVM_SMM))
7175 return false;
7176 /*
7177 * We cannot do SMM unless we can run the guest in big
7178 * real mode.
7179 */
7180 return enable_unrestricted_guest || emulate_invalid_guest_state;
7181 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
7182 return nested;
7183 case MSR_AMD64_VIRT_SPEC_CTRL:
7184 case MSR_AMD64_TSC_RATIO:
7185 /* This is AMD only. */
7186 return false;
7187 default:
7188 return true;
7189 }
7190 }
7191
7192 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7193 {
7194 u32 exit_intr_info;
7195 bool unblock_nmi;
7196 u8 vector;
7197 bool idtv_info_valid;
7198
7199 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7200
7201 if (enable_vnmi) {
7202 if (vmx->loaded_vmcs->nmi_known_unmasked)
7203 return;
7204
7205 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
7206 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7207 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7208 /*
7209 * SDM 3: 27.7.1.2 (September 2008)
7210 * Re-set bit "block by NMI" before VM entry if vmexit caused by
7211 * a guest IRET fault.
7212 * SDM 3: 23.2.2 (September 2008)
7213 * Bit 12 is undefined in any of the following cases:
7214 * If the VM exit sets the valid bit in the IDT-vectoring
7215 * information field.
7216 * If the VM exit is due to a double fault.
7217 */
7218 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7219 vector != DF_VECTOR && !idtv_info_valid)
7220 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7221 GUEST_INTR_STATE_NMI);
7222 else
7223 vmx->loaded_vmcs->nmi_known_unmasked =
7224 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7225 & GUEST_INTR_STATE_NMI);
7226 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7227 vmx->loaded_vmcs->vnmi_blocked_time +=
7228 ktime_to_ns(ktime_sub(ktime_get(),
7229 vmx->loaded_vmcs->entry_time));
7230 }
7231
7232 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7233 u32 idt_vectoring_info,
7234 int instr_len_field,
7235 int error_code_field)
7236 {
7237 u8 vector;
7238 int type;
7239 bool idtv_info_valid;
7240
7241 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7242
7243 vcpu->arch.nmi_injected = false;
7244 kvm_clear_exception_queue(vcpu);
7245 kvm_clear_interrupt_queue(vcpu);
7246
7247 if (!idtv_info_valid)
7248 return;
7249
7250 kvm_make_request(KVM_REQ_EVENT, vcpu);
7251
7252 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7253 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7254
7255 switch (type) {
7256 case INTR_TYPE_NMI_INTR:
7257 vcpu->arch.nmi_injected = true;
7258 /*
7259 * SDM 3: 27.7.1.2 (September 2008)
7260 * Clear bit "block by NMI" before VM entry if a NMI
7261 * delivery faulted.
7262 */
7263 vmx_set_nmi_mask(vcpu, false);
7264 break;
7265 case INTR_TYPE_SOFT_EXCEPTION:
7266 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7267 fallthrough;
7268 case INTR_TYPE_HARD_EXCEPTION: {
7269 u32 error_code = 0;
7270
7271 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
7272 error_code = vmcs_read32(error_code_field);
7273
7274 kvm_requeue_exception(vcpu, vector,
7275 idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
7276 error_code);
7277 break;
7278 }
7279 case INTR_TYPE_SOFT_INTR:
7280 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7281 fallthrough;
7282 case INTR_TYPE_EXT_INTR:
7283 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7284 break;
7285 default:
7286 break;
7287 }
7288 }
7289
7290 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7291 {
7292 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7293 VM_EXIT_INSTRUCTION_LEN,
7294 IDT_VECTORING_ERROR_CODE);
7295 }
7296
7297 void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7298 {
7299 __vmx_complete_interrupts(vcpu,
7300 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7301 VM_ENTRY_INSTRUCTION_LEN,
7302 VM_ENTRY_EXCEPTION_ERROR_CODE);
7303
7304 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7305 }
7306
7307 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7308 {
7309 int i, nr_msrs;
7310 struct perf_guest_switch_msr *msrs;
7311 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7312
7313 if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu))
7314 return;
7315
7316 pmu->host_cross_mapped_mask = 0;
7317 if (pmu->pebs_enable & pmu->global_ctrl)
7318 intel_pmu_cross_mapped_check(pmu);
7319
7320 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7321 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
7322 if (!msrs)
7323 return;
7324
7325 for (i = 0; i < nr_msrs; i++)
7326 if (msrs[i].host == msrs[i].guest)
7327 clear_atomic_switch_msr(vmx, msrs[i].msr);
7328 else
7329 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7330 msrs[i].host);
7331 }
7332
7333 static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu)
7334 {
7335 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
7336 struct vcpu_vmx *vmx = to_vmx(vcpu);
7337
7338 if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
7339 return;
7340
7341 if (!cpu_has_save_perf_global_ctrl()) {
7342 int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore,
7343 MSR_CORE_PERF_GLOBAL_CTRL);
7344
7345 if (WARN_ON_ONCE(slot < 0))
7346 return;
7347
7348 pmu->global_ctrl = vmx->msr_autostore.val[slot].value;
7349 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl);
7350 return;
7351 }
7352
7353 pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
7354 }
7355
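/*
 * Rough sketch of the math in vmx_update_hv_timer(): the VMX preemption
 * timer counts down at the TSC rate divided by 2^cpu_preemption_timer_multi
 * (the rate reported via MSR_IA32_VMX_MISC), so the remaining TSC delta is
 * shifted right by that amount to obtain the 32-bit value programmed into
 * VMX_PREEMPTION_TIMER_VALUE.
 */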
7356 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
7357 {
7358 struct vcpu_vmx *vmx = to_vmx(vcpu);
7359 u64 tscl;
7360 u32 delta_tsc;
7361
7362 if (force_immediate_exit) {
7363 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7364 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7365 } else if (vmx->hv_deadline_tsc != -1) {
7366 tscl = rdtsc();
7367 if (vmx->hv_deadline_tsc > tscl)
7368 /* set_hv_timer ensures the delta fits in 32-bits */
7369 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7370 cpu_preemption_timer_multi);
7371 else
7372 delta_tsc = 0;
7373
7374 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7375 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7376 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7377 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7378 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7379 }
7380 }
7381
7382 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7383 {
7384 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7385 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7386 vmcs_writel(HOST_RSP, host_rsp);
7387 }
7388 }
7389
7390 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7391 unsigned int flags)
7392 {
7393 u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7394
7395 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7396 return;
7397
7398 if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7399 vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
7400
7401 /*
7402 * If the guest/host SPEC_CTRL values differ, restore the host value.
7403 *
7404 * For legacy IBRS, the IBRS bit always needs to be written after
7405 * transitioning from a less privileged predictor mode, regardless of
7406 * whether the guest/host values differ.
7407 */
7408 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7409 vmx->spec_ctrl != hostval)
7410 native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
7411
7412 barrier_nospec();
7413 }
7414
7415 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
7416 bool force_immediate_exit)
7417 {
7418 /*
7419 * If L2 is active, only VMX preemption timer exits can be handled in
7420 * the fastpath; all other exits must use the slow path.
7421 */
7422 if (is_guest_mode(vcpu) &&
7423 vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
7424 return EXIT_FASTPATH_NONE;
7425
7426 switch (vmx_get_exit_reason(vcpu).basic) {
7427 case EXIT_REASON_MSR_WRITE:
7428 return handle_fastpath_wrmsr(vcpu);
7429 case EXIT_REASON_MSR_WRITE_IMM:
7430 return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
7431 vmx_get_msr_imm_reg(vcpu));
7432 case EXIT_REASON_PREEMPTION_TIMER:
7433 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
7434 case EXIT_REASON_HLT:
7435 return handle_fastpath_hlt(vcpu);
7436 case EXIT_REASON_INVD:
7437 return handle_fastpath_invd(vcpu);
7438 default:
7439 return EXIT_FASTPATH_NONE;
7440 }
7441 }
7442
7443 noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
7444 {
7445 if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
7446 !is_nmi(vmx_get_intr_info(vcpu)))
7447 return;
7448
7449 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7450 if (cpu_feature_enabled(X86_FEATURE_FRED))
7451 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
7452 else
7453 vmx_do_nmi_irqoff();
7454 kvm_after_interrupt(vcpu);
7455 }
7456
7457 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7458 unsigned int flags)
7459 {
7460 struct vcpu_vmx *vmx = to_vmx(vcpu);
7461
7462 guest_state_enter_irqoff();
7463
7464 vmx_l1d_flush(vcpu);
7465
7466 vmx_disable_fb_clear(vmx);
7467
7468 if (vcpu->arch.cr2 != native_read_cr2())
7469 native_write_cr2(vcpu->arch.cr2);
7470
7471 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7472 flags);
7473
7474 vcpu->arch.cr2 = native_read_cr2();
7475 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7476
7477 vmx->idt_vectoring_info = 0;
7478
7479 vmx_enable_fb_clear(vmx);
7480
7481 if (unlikely(vmx->fail)) {
7482 vmx->vt.exit_reason.full = 0xdead;
7483 goto out;
7484 }
7485
7486 vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7487 if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
7488 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7489
7490 vmx_handle_nmi(vcpu);
7491
7492 out:
7493 guest_state_exit_irqoff();
7494 }
7495
7496 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
7497 {
7498 bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
7499 struct vcpu_vmx *vmx = to_vmx(vcpu);
7500 unsigned long cr3, cr4;
7501
7502 /* Record the guest's net vcpu time for enforced NMI injections. */
7503 if (unlikely(!enable_vnmi &&
7504 vmx->loaded_vmcs->soft_vnmi_blocked))
7505 vmx->loaded_vmcs->entry_time = ktime_get();
7506
7507 /*
7508 * Don't enter VMX if guest state is invalid, let the exit handler
7509 * start emulation until we arrive back to a valid state. Synthesize a
7510 * consistency check VM-Exit due to invalid guest state and bail.
7511 */
7512 if (unlikely(vmx->vt.emulation_required)) {
7513 vmx->fail = 0;
7514
7515 vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
7516 vmx->vt.exit_reason.failed_vmentry = 1;
7517 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7518 vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
7519 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7520 vmx->vt.exit_intr_info = 0;
7521 return EXIT_FASTPATH_NONE;
7522 }
7523
7524 trace_kvm_entry(vcpu, force_immediate_exit);
7525
7526 if (vmx->ple_window_dirty) {
7527 vmx->ple_window_dirty = false;
7528 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7529 }
7530
7531 /*
7532 * We did this in prepare_switch_to_guest, because it needs to
7533 * be within srcu_read_lock.
7534 */
7535 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7536
7537 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7538 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7539 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7540 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7541 vcpu->arch.regs_dirty = 0;
7542
7543 if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
7544 set_debugreg(vcpu->arch.dr6, 6);
7545
7546 if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
7547 vmx_reload_guest_debugctl(vcpu);
7548
7549 /*
7550 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
7551 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7552 * it switches back to the current->mm, which can occur in KVM context
7553 * when switching to a temporary mm to patch kernel code, e.g. if KVM
7554 * toggles a static key while handling a VM-Exit.
7555 */
7556 cr3 = __get_current_cr3_fast();
7557 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7558 vmcs_writel(HOST_CR3, cr3);
7559 vmx->loaded_vmcs->host_state.cr3 = cr3;
7560 }
7561
7562 cr4 = cr4_read_shadow();
7563 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7564 vmcs_writel(HOST_CR4, cr4);
7565 vmx->loaded_vmcs->host_state.cr4 = cr4;
7566 }
7567
7568 /* When single-stepping over STI and MOV SS, we must clear the
7569 * corresponding interruptibility bits in the guest state. Otherwise
7570 * vmentry fails as it then expects bit 14 (BS) in pending debug
7571 * exceptions being set, but that's not correct for the guest debugging
7572 * case. */
7573 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7574 vmx_set_interrupt_shadow(vcpu, 0);
7575
7576 pt_guest_enter(vmx);
7577
7578 atomic_switch_perf_msrs(vmx);
7579 if (intel_pmu_lbr_is_enabled(vcpu))
7580 vmx_passthrough_lbr_msrs(vcpu);
7581
7582 if (enable_preemption_timer)
7583 vmx_update_hv_timer(vcpu, force_immediate_exit);
7584 else if (force_immediate_exit)
7585 smp_send_reschedule(vcpu->cpu);
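/*
 * Note on the reschedule IPI above: it is used when the preemption timer
 * is not enabled. With IRQs disabled the IPI stays pending across
 * VM-Enter and, because KVM always enables external-interrupt exiting,
 * it forces a VM-Exit essentially immediately after entering the guest.
 */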
7586
7587 kvm_wait_lapic_expire(vcpu);
7588
7589 /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7590 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7591
7592 /* All fields are clean at this point */
7593 if (kvm_is_using_evmcs()) {
7594 current_evmcs->hv_clean_fields |=
7595 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7596
7597 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7598 }
7599
7600 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7601 if (vcpu->arch.host_debugctl)
7602 update_debugctlmsr(vcpu->arch.host_debugctl);
7603
7604 #ifndef CONFIG_X86_64
7605 /*
7606 * The sysexit path does not restore ds/es, so we must set them to
7607 * a reasonable value ourselves.
7608 *
7609 * We can't defer this to vmx_prepare_switch_to_host() since that
7610 * function may be executed in interrupt context, which saves and
7611 * restores segments around it, nullifying its effect.
7612 */
7613 loadsegment(ds, __USER_DS);
7614 loadsegment(es, __USER_DS);
7615 #endif
7616
7617 pt_guest_exit(vmx);
7618
7619 if (is_guest_mode(vcpu)) {
7620 /*
7621 * Track VMLAUNCH/VMRESUME that have made past guest state
7622 * checking.
7623 */
7624 if (vcpu->arch.nested_run_pending &&
7625 !vmx_get_exit_reason(vcpu).failed_vmentry)
7626 ++vcpu->stat.nested_run;
7627
7628 vcpu->arch.nested_run_pending = 0;
7629 }
7630
7631 if (unlikely(vmx->fail))
7632 return EXIT_FASTPATH_NONE;
7633
7634 trace_kvm_exit(vcpu, KVM_ISA_VMX);
7635
7636 if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
7637 return EXIT_FASTPATH_NONE;
7638
7639 vmx->loaded_vmcs->launched = 1;
7640
7641 vmx_refresh_guest_perf_global_control(vcpu);
7642
7643 vmx_recover_nmi_blocking(vmx);
7644 vmx_complete_interrupts(vmx);
7645
7646 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
7647 }
7648
7649 void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7650 {
7651 struct vcpu_vmx *vmx = to_vmx(vcpu);
7652
7653 if (enable_pml)
7654 vmx_destroy_pml_buffer(vmx);
7655 free_vpid(vmx->vpid);
7656 nested_vmx_free_vcpu(vcpu);
7657 free_loaded_vmcs(vmx->loaded_vmcs);
7658 free_page((unsigned long)vmx->ve_info);
7659 }
7660
7661 int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7662 {
7663 struct vmx_uret_msr *tsx_ctrl;
7664 struct vcpu_vmx *vmx;
7665 int i, err;
7666
7667 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7668 vmx = to_vmx(vcpu);
7669
7670 INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
7671
7672 err = -ENOMEM;
7673
7674 vmx->vpid = allocate_vpid();
7675
7676 /*
7677 * If PML is turned on, failure to enable PML simply results in failure
7678 * to create the vCPU. This simplifies the PML logic, e.g. there is no
7679 * need to handle cases such as PML being enabled on only some of the
7680 * guest's vCPUs.
7681 */
7682 if (enable_pml) {
7683 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7684 if (!vmx->pml_pg)
7685 goto free_vpid;
7686 }
7687
7688 for (i = 0; i < kvm_nr_uret_msrs; ++i)
7689 vmx->guest_uret_msrs[i].mask = -1ull;
7690 if (boot_cpu_has(X86_FEATURE_RTM)) {
7691 /*
7692 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7693 * Keep the host value unchanged to avoid changing CPUID bits
7694 * under the host kernel's feet.
7695 */
7696 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7697 if (tsx_ctrl)
7698 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7699 }
7700
7701 err = alloc_loaded_vmcs(&vmx->vmcs01);
7702 if (err < 0)
7703 goto free_pml;
7704
7705 /*
7706 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7707 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7708 * feature only for vmcs01, KVM currently isn't equipped to realize any
7709 * performance benefits from enabling it for vmcs02.
7710 */
7711 if (kvm_is_using_evmcs() &&
7712 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7713 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7714
7715 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7716 }
7717
7718 vmx->loaded_vmcs = &vmx->vmcs01;
7719
7720 if (cpu_need_virtualize_apic_accesses(vcpu)) {
7721 err = kvm_alloc_apic_access_page(vcpu->kvm);
7722 if (err)
7723 goto free_vmcs;
7724 }
7725
7726 if (enable_ept && !enable_unrestricted_guest) {
7727 err = init_rmode_identity_map(vcpu->kvm);
7728 if (err)
7729 goto free_vmcs;
7730 }
7731
7732 err = -ENOMEM;
7733 if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
7734 struct page *page;
7735
7736 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
7737
7738 /* ve_info must be page aligned. */
7739 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7740 if (!page)
7741 goto free_vmcs;
7742
7743 vmx->ve_info = page_to_virt(page);
7744 }
7745
7746 if (vmx_can_use_ipiv(vcpu))
7747 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7748 __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
7749
7750 return 0;
7751
7752 free_vmcs:
7753 free_loaded_vmcs(vmx->loaded_vmcs);
7754 free_pml:
7755 vmx_destroy_pml_buffer(vmx);
7756 free_vpid:
7757 free_vpid(vmx->vpid);
7758 return err;
7759 }
7760
7761 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7762 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7763
7764 int vmx_vm_init(struct kvm *kvm)
7765 {
7766 if (!ple_gap)
7767 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
7768
7769 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7770 switch (l1tf_mitigation) {
7771 case L1TF_MITIGATION_OFF:
7772 case L1TF_MITIGATION_FLUSH_NOWARN:
7773 /* 'I explicitly don't care' is set */
7774 break;
7775 case L1TF_MITIGATION_AUTO:
7776 case L1TF_MITIGATION_FLUSH:
7777 case L1TF_MITIGATION_FLUSH_NOSMT:
7778 case L1TF_MITIGATION_FULL:
7779 /*
7780 * Warn upon starting the first VM in a potentially
7781 * insecure environment.
7782 */
7783 if (sched_smt_active())
7784 pr_warn_once(L1TF_MSG_SMT);
7785 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7786 pr_warn_once(L1TF_MSG_L1D);
7787 break;
7788 case L1TF_MITIGATION_FULL_FORCE:
7789 /* Flush is enforced */
7790 break;
7791 }
7792 }
7793
7794 if (enable_pml)
7795 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
7796 return 0;
7797 }
7798
7799 static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
7800 {
7801 /*
7802 * Non-coherent DMA devices need the guest to flush CPU caches
7803 * properly. In that case it is not possible to map all guest RAM
7804 * as WB, so always trust guest PAT.
7805 */
7806 return !kvm_arch_has_noncoherent_dma(kvm) &&
7807 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
7808 }
7809
7810 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7811 {
7812 /*
7813 * Force UC for host MMIO regions, as allowing the guest to access MMIO
7814 * with cacheable accesses will result in Machine Checks.
7815 */
7816 if (is_mmio)
7817 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7818
7819 /* Force WB if ignoring guest PAT */
7820 if (vmx_ignore_guest_pat(vcpu->kvm))
7821 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7822
7823 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
7824 }
7825
7826 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7827 {
7828 /*
7829 * These bits in the secondary execution controls field
7830 * are dynamic, the others are mostly based on the hypervisor
7831 * architecture and the guest's CPUID. Do not touch the
7832 * dynamic bits.
7833 */
7834 u32 mask =
7835 SECONDARY_EXEC_SHADOW_VMCS |
7836 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7837 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7838 SECONDARY_EXEC_DESC;
7839
7840 u32 cur_ctl = secondary_exec_controls_get(vmx);
7841
7842 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7843 }
7844
7845 /*
7846 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7847 * (indicating "allowed-1") if they are supported in the guest's CPUID.
7848 */
7849 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7850 {
7851 struct vcpu_vmx *vmx = to_vmx(vcpu);
7852 struct kvm_cpuid_entry2 *entry;
7853
7854 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7855 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7856
7857 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7858 if (entry && (entry->_reg & (_cpuid_mask))) \
7859 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7860 } while (0)
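/*
 * The helper above ORs _cr4_mask into the guest's cr4_fixed1 value
 * whenever the _cpuid_mask bit is set in register _reg of the most
 * recently looked-up CPUID entry, i.e. a CR4 bit is marked allowed-1
 * only if the guest's CPUID advertises the corresponding feature.
 */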
7861
7862 entry = kvm_find_cpuid_entry(vcpu, 0x1);
7863 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7864 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7865 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7866 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7867 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7868 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7869 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7870 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7871 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7872 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7873 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7874 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7875 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7876 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
7877
7878 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7879 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7880 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7881 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7882 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7883 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7884 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
7885 cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
7886 cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
7887
7888 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
7889 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
7890
7891 #undef cr4_fixed1_update
7892 }
7893
7894 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7895 {
7896 struct vcpu_vmx *vmx = to_vmx(vcpu);
7897 struct kvm_cpuid_entry2 *best = NULL;
7898 int i;
7899
7900 for (i = 0; i < PT_CPUID_LEAVES; i++) {
7901 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7902 if (!best)
7903 return;
7904 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7905 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7906 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7907 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7908 }
7909
7910 /* Get the number of configurable Address Ranges for filtering */
7911 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7912 PT_CAP_num_address_ranges);
7913
7914 /* Initialize and clear the no dependency bits */
7915 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7916 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7917 RTIT_CTL_BRANCH_EN);
7918
7919 /*
7920 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7921 * setting it will inject a #GP.
7922 */
7923 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7924 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7925
7926 /*
7927 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7928 * PSBFreq can be set
7929 */
7930 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7931 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7932 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7933
7934 /*
7935 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7936 */
7937 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7938 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7939 RTIT_CTL_MTC_RANGE);
7940
7941 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7942 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7943 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7944 RTIT_CTL_PTW_EN);
7945
7946 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7947 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7948 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7949
7950 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7951 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7952 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7953
7954 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7955 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7956 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7957
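/*
 * The ADDRn_CFG fields in RTIT_CTL are 4 bits each, starting at bit 32
 * (ADDR0_CFG in bits 35:32, ADDR1_CFG in bits 39:36, and so on), which
 * is why the loop below clears 0xf << (32 + i * 4) for each supported
 * address range.
 */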
7958 /* unmask address range configure area */
7959 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7960 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7961 }
7962
7963 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7964 {
7965 struct vcpu_vmx *vmx = to_vmx(vcpu);
7966
7967 /*
7968 * XSAVES is effectively enabled if and only if XSAVE is also exposed
7969 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
7970 * set if and only if XSAVE is supported.
7971 */
7972 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
7973 guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
7974
7975 vmx_setup_uret_msrs(vmx);
7976
7977 if (cpu_has_secondary_exec_ctrls())
7978 vmcs_set_secondary_exec_control(vmx,
7979 vmx_secondary_exec_control(vmx));
7980
7981 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
7982 vmx->msr_ia32_feature_control_valid_bits |=
7983 FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7984 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7985 else
7986 vmx->msr_ia32_feature_control_valid_bits &=
7987 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7988 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7989
7990 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
7991 nested_vmx_cr_fixed1_bits_update(vcpu);
7992
7993 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7994 guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
7995 update_intel_pt_cfg(vcpu);
7996
7997 if (boot_cpu_has(X86_FEATURE_RTM)) {
7998 struct vmx_uret_msr *msr;
7999 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
8000 if (msr) {
8001 bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
8002 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
8003 }
8004 }
8005
8006 set_cr4_guest_host_mask(vmx);
8007
8008 vmx_write_encls_bitmap(vcpu, NULL);
8009 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
8010 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
8011 else
8012 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
8013
8014 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
8015 vmx->msr_ia32_feature_control_valid_bits |=
8016 FEAT_CTL_SGX_LC_ENABLED;
8017 else
8018 vmx->msr_ia32_feature_control_valid_bits &=
8019 ~FEAT_CTL_SGX_LC_ENABLED;
8020
8021 /* Refresh #PF interception to account for MAXPHYADDR changes. */
8022 vmx_update_exception_bitmap(vcpu);
8023 }
8024
8025 static __init u64 vmx_get_perf_capabilities(void)
8026 {
8027 u64 perf_cap = PERF_CAP_FW_WRITES;
8028 u64 host_perf_cap = 0;
8029
8030 if (!enable_pmu)
8031 return 0;
8032
8033 if (boot_cpu_has(X86_FEATURE_PDCM))
8034 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
8035
8036 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) &&
8037 !enable_mediated_pmu) {
8038 x86_perf_get_lbr(&vmx_lbr_caps);
8039
8040 /*
8041 * KVM requires LBR callstack support, as the overhead due to
8042 * context switching LBRs without said support is too high.
8043 * See intel_pmu_create_guest_lbr_event() for more info.
8044 */
8045 if (!vmx_lbr_caps.has_callstack)
8046 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
8047 else if (vmx_lbr_caps.nr)
8048 perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
8049 }
8050
8051 if (vmx_pebs_supported()) {
8052 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
8053
8054 /*
8055 * Disallow adaptive PEBS as it is functionally broken, can be
8056 * used by the guest to read *host* LBRs, and can be used to
8057 * bypass userspace event filters. To correctly and safely
8058 * support adaptive PEBS, KVM needs to:
8059 *
8060 * 1. Account for the ADAPTIVE flag when (re)programming fixed
8061 * counters.
8062 *
8063 * 2. Gain support from perf (or take direct control of counter
8064 * programming) to support events without adaptive PEBS
8065 * enabled for the hardware counter.
8066 *
8067 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
8068 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
8069 *
8070 * 4. Document which PMU events are effectively exposed to the
8071 * guest via adaptive PEBS, and make adaptive PEBS mutually
8072 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
8073 */
8074 perf_cap &= ~PERF_CAP_PEBS_BASELINE;
8075 }
8076
8077 return perf_cap;
8078 }
8079
8080 static __init void vmx_set_cpu_caps(void)
8081 {
8082 kvm_initialize_cpu_caps();
8083
8084 /* CPUID 0x1 */
8085 if (nested)
8086 kvm_cpu_cap_set(X86_FEATURE_VMX);
8087
8088 /* CPUID 0x7 */
8089 if (kvm_mpx_supported())
8090 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
8091 if (!cpu_has_vmx_invpcid())
8092 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
8093 if (vmx_pt_mode_is_host_guest())
8094 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
8095 if (vmx_pebs_supported()) {
8096 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
8097 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
8098 }
8099
8100 if (!enable_pmu)
8101 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
8102 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
8103
8104 if (!enable_sgx) {
8105 kvm_cpu_cap_clear(X86_FEATURE_SGX);
8106 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
8107 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
8108 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
8109 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
8110 }
8111
8112 if (vmx_umip_emulated())
8113 kvm_cpu_cap_set(X86_FEATURE_UMIP);
8114
8115 /* CPUID 0xD.1 */
8116 if (!cpu_has_vmx_xsaves())
8117 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
8118
8119 /* CPUID 0x80000001 and 0x7 (RDPID) */
8120 if (!cpu_has_vmx_rdtscp()) {
8121 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
8122 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
8123 }
8124
8125 if (cpu_has_vmx_waitpkg())
8126 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
8127
8128 /*
8129 * Disable CET if unrestricted_guest is unsupported, as KVM doesn't
8130 * enforce CET hardware behaviors in the emulator. On platforms with
8131 * VMX_BASIC[bit56] == 0, injecting #CP at VM-Entry with an error code
8132 * fails, so disable CET in that case too.
8133 */
8134 if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
8135 !cpu_has_vmx_basic_no_hw_errcode_cc()) {
8136 kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
8137 kvm_cpu_cap_clear(X86_FEATURE_IBT);
8138 }
8139
8140 kvm_setup_xss_caps();
8141 kvm_finalize_cpu_caps();
8142 }
8143
8144 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
8145 struct x86_instruction_info *info,
8146 unsigned long *exit_qualification)
8147 {
8148 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8149 unsigned short port;
8150 int size;
8151 bool imm;
8152
8153 /*
8154 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
8155 * VM-exits depend on the 'unconditional IO exiting' VM-execution
8156 * control.
8157 *
8158 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
8159 */
8160 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
8161 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
8162
8163 if (info->intercept == x86_intercept_in ||
8164 info->intercept == x86_intercept_ins) {
8165 port = info->src_val;
8166 size = info->dst_bytes;
8167 imm = info->src_type == OP_IMM;
8168 } else {
8169 port = info->dst_val;
8170 size = info->src_bytes;
8171 imm = info->dst_type == OP_IMM;
8172 }
8173
8174
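/*
 * Assemble an exit qualification for the synthesized I/O VM-Exit:
 * bits 2:0 hold the access size minus one, bit 4 flags a string
 * instruction, bit 5 a REP prefix, bit 6 an immediate port operand,
 * and bits 31:16 hold the port number (mirroring the SDM's encoding
 * for I/O instruction exits).
 */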
8175 *exit_qualification = ((unsigned long)port << 16) | (size - 1);
8176
8177 if (info->intercept == x86_intercept_ins ||
8178 info->intercept == x86_intercept_outs)
8179 *exit_qualification |= BIT(4);
8180
8181 if (info->rep_prefix)
8182 *exit_qualification |= BIT(5);
8183
8184 if (imm)
8185 *exit_qualification |= BIT(6);
8186
8187 return nested_vmx_check_io_bitmaps(vcpu, port, size);
8188 }
8189
8190 int vmx_check_intercept(struct kvm_vcpu *vcpu,
8191 struct x86_instruction_info *info,
8192 enum x86_intercept_stage stage,
8193 struct x86_exception *exception)
8194 {
8195 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8196 unsigned long exit_qualification = 0;
8197 u32 vm_exit_reason;
8198 u64 exit_insn_len;
8199
8200 switch (info->intercept) {
8201 case x86_intercept_rdpid:
8202 /*
8203 * RDPID causes #UD if not enabled through secondary execution
8204 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
8205 * TSC_AUX is NOT subject to interception, i.e. checking only
8206 * the dedicated execution control is architecturally correct.
8207 */
8208 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
8209 exception->vector = UD_VECTOR;
8210 exception->error_code_valid = false;
8211 return X86EMUL_PROPAGATE_FAULT;
8212 }
8213 return X86EMUL_CONTINUE;
8214
8215 case x86_intercept_in:
8216 case x86_intercept_ins:
8217 case x86_intercept_out:
8218 case x86_intercept_outs:
8219 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
8220 return X86EMUL_CONTINUE;
8221
8222 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
8223 break;
8224
8225 case x86_intercept_lgdt:
8226 case x86_intercept_lidt:
8227 case x86_intercept_lldt:
8228 case x86_intercept_ltr:
8229 case x86_intercept_sgdt:
8230 case x86_intercept_sidt:
8231 case x86_intercept_sldt:
8232 case x86_intercept_str:
8233 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
8234 return X86EMUL_CONTINUE;
8235
8236 if (info->intercept == x86_intercept_lldt ||
8237 info->intercept == x86_intercept_ltr ||
8238 info->intercept == x86_intercept_sldt ||
8239 info->intercept == x86_intercept_str)
8240 vm_exit_reason = EXIT_REASON_LDTR_TR;
8241 else
8242 vm_exit_reason = EXIT_REASON_GDTR_IDTR;
8243 /*
8244 * FIXME: Decode the ModR/M to generate the correct exit
8245 * qualification for memory operands.
8246 */
8247 break;
8248
8249 case x86_intercept_hlt:
8250 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
8251 return X86EMUL_CONTINUE;
8252
8253 vm_exit_reason = EXIT_REASON_HLT;
8254 break;
8255
8256 case x86_intercept_pause:
8257 /*
8258 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
8259 * with vanilla NOPs in the emulator. Apply the interception
8260 * check only to actual PAUSE instructions. Don't check
8261 * PAUSE-loop-exiting, as software can't expect a given PAUSE to
8262 * exit, i.e. KVM is within its rights to allow L2 to execute
8263 * the PAUSE.
8264 */
8265 if ((info->rep_prefix != REPE_PREFIX) ||
8266 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
8267 return X86EMUL_CONTINUE;
8268
8269 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
8270 break;
8271
8272 /* TODO: check more intercepts... */
8273 default:
8274 return X86EMUL_UNHANDLEABLE;
8275 }
8276
8277 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
8278 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
8279 return X86EMUL_UNHANDLEABLE;
8280
8281 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
8282 exit_insn_len);
8283 return X86EMUL_INTERCEPTED;
8284 }
8285
8286 #ifdef CONFIG_X86_64
8287 /* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
8288 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8289 u64 divisor, u64 *result)
8290 {
8291 u64 low = a << shift, high = a >> (64 - shift);
8292
8293 /* Avoid overflow on divq: the quotient must fit in 64 bits */
8294 if (high >= divisor)
8295 return 1;
8296
8297 /* low holds the quotient, high holds the remainder, which is discarded */
8298 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8299 "rm" (divisor), "0" (low), "1" (high));
8300 *result = low;
8301
8302 return 0;
8303 }
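/*
 * Worked example of the overflow check above (numbers chosen purely for
 * illustration): with a = 1ULL << 60, shift = 16 and divisor = 1ULL << 10,
 * the 128-bit intermediate a << 16 = 2^76 is held as high = a >> 48 = 2^12
 * and low = 0.  Since high (2^12) >= divisor (2^10), the true quotient
 * (2^66) can't fit in 64 bits and the function returns 1 before issuing
 * divq.  Conversely, a = 3, shift = 48, divisor = 1ULL << 16 gives
 * high = 0 < divisor, and divq yields *result = (3ULL << 48) / 65536,
 * i.e. 3ULL << 32.
 */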
8304
8305 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8306 bool *expired)
8307 {
8308 struct vcpu_vmx *vmx;
8309 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8310 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8311
8312 vmx = to_vmx(vcpu);
8313 tscl = rdtsc();
8314 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8315 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8316 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8317 ktimer->timer_advance_ns);
8318
8319 if (delta_tsc > lapic_timer_advance_cycles)
8320 delta_tsc -= lapic_timer_advance_cycles;
8321 else
8322 delta_tsc = 0;
8323
8324 /* Convert to host delta tsc if tsc scaling is enabled */
8325 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8326 delta_tsc && u64_shl_div_u64(delta_tsc,
8327 kvm_caps.tsc_scaling_ratio_frac_bits,
8328 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8329 return -ERANGE;
8330
8331 /*
8332 * If the delta TSC doesn't fit in 32 bits after the preemption timer
8333 * rate shift, the preemption timer can't be used.
8334 * It might fit on later VM-entries, but checking on every VM-entry is
8335 * costly, so just fall back to an hrtimer.
8336 */
8337 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8338 return -ERANGE;
8339
8340 vmx->hv_deadline_tsc = tscl + delta_tsc;
8341 *expired = !delta_tsc;
8342 return 0;
8343 }
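/*
 * Rough sketch of the 32-bit fit check above, with hypothetical numbers:
 * the VMX preemption timer counts down at the TSC rate shifted right by
 * cpu_preemption_timer_multi, so delta_tsc must fit in 32 bits after that
 * shift.  E.g. with a 3 GHz TSC and a rate shift of 5, a deadline one
 * second out gives delta_tsc ~= 3e9, i.e. ~9.4e7 timer ticks, which fits;
 * a deadline more than 2^(multi + 32) TSC cycles away (~46 seconds with
 * these numbers) trips the check and falls back to the hrtimer path.
 */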
8344
8345 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8346 {
8347 to_vmx(vcpu)->hv_deadline_tsc = -1;
8348 }
8349 #endif
8350
8351 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8352 {
8353 struct vcpu_vmx *vmx = to_vmx(vcpu);
8354
8355 if (WARN_ON_ONCE(!enable_pml))
8356 return;
8357
8358 guard(vmx_vmcs01)(vcpu);
8359
8360 /*
8361 * Note, nr_memslots_dirty_logging can be changed concurrently with this
8362 * code, but in that case another update request will be made and so
8363 * the guest will never run with a stale PML value.
8364 */
8365 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8366 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8367 else
8368 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8369 }
8370
8371 void vmx_setup_mce(struct kvm_vcpu *vcpu)
8372 {
8373 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8374 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8375 FEAT_CTL_LMCE_ENABLED;
8376 else
8377 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8378 ~FEAT_CTL_LMCE_ENABLED;
8379 }
8380
8381 #ifdef CONFIG_KVM_SMM
8382 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8383 {
8384 /* We need a nested VM-exit to enter SMM; postpone if a run is pending. */
8385 if (vcpu->arch.nested_run_pending)
8386 return -EBUSY;
8387 return !is_smm(vcpu);
8388 }
8389
8390 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8391 {
8392 struct vcpu_vmx *vmx = to_vmx(vcpu);
8393
8394 /*
8395 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8396 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong, as
8397 * SMI and RSM only modify state that is saved and restored via SMRAM.
8398 * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8399 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8400 */
8401 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8402 if (vmx->nested.smm.guest_mode)
8403 nested_vmx_vmexit(vcpu, -1, 0, 0);
8404
8405 vmx->nested.smm.vmxon = vmx->nested.vmxon;
8406 vmx->nested.vmxon = false;
8407 vmx_clear_hlt(vcpu);
8408 return 0;
8409 }
8410
8411 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8412 {
8413 struct vcpu_vmx *vmx = to_vmx(vcpu);
8414 int ret;
8415
8416 if (vmx->nested.smm.vmxon) {
8417 vmx->nested.vmxon = true;
8418 vmx->nested.smm.vmxon = false;
8419 }
8420
8421 if (vmx->nested.smm.guest_mode) {
8422 /* Triple fault if the state is invalid. */
8423 if (nested_vmx_check_restored_vmcs12(vcpu) < 0)
8424 return 1;
8425
8426 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8427 if (ret != NVMX_VMENTRY_SUCCESS)
8428 return 1;
8429
8430 vcpu->arch.nested_run_pending = true;
8431 vmx->nested.smm.guest_mode = false;
8432 }
8433 return 0;
8434 }
8435
8436 void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
8437 {
8438 /* RSM will cause a vmexit anyway. */
8439 }
8440 #endif
8441
8442 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8443 {
8444 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8445 }
8446
8447 void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8448 {
8449 if (is_guest_mode(vcpu)) {
8450 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8451
8452 if (hrtimer_try_to_cancel(timer) == 1)
8453 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8454 }
8455 }
8456
8457 void vmx_hardware_unsetup(void)
8458 {
8459 kvm_set_posted_intr_wakeup_handler(NULL);
8460
8461 if (nested)
8462 nested_vmx_hardware_unsetup();
8463 }
8464
8465 void vmx_vm_destroy(struct kvm *kvm)
8466 {
8467 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8468
8469 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8470 }
8471
8472 /*
8473 * Note, the SDM states that the linear address is masked *after* the modified
8474 * canonicality check, whereas KVM masks (untags) the address and then performs
8475 * a "normal" canonicality check. Functionally, the two methods are identical,
8476 * and when the masking occurs relative to the canonicality check isn't visible
8477 * to software, i.e. KVM's behavior doesn't violate the SDM.
8478 */
8479 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
8480 {
8481 int lam_bit;
8482 unsigned long cr3_bits;
8483
8484 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
8485 return gva;
8486
8487 if (!is_64_bit_mode(vcpu))
8488 return gva;
8489
8490 /*
8491 * Bit 63 determines whether the address should be treated as a user
8492 * address or a supervisor address.
8493 */
8494 if (!(gva & BIT_ULL(63))) {
8495 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
8496 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
8497 return gva;
8498
8499 /* LAM_U48 is ignored if LAM_U57 is set. */
8500 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
8501 } else {
8502 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
8503 return gva;
8504
8505 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
8506 }
8507
8508 /*
8509 * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
8510 * Bit 63 is retained from the raw virtual address so that untagging
8511 * doesn't change a user access to a supervisor access, and vice versa.
8512 */
8513 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
8514 }
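/*
 * Illustrative LAM_U57 untagging example (the address value is made up):
 * gva = 0x7e00000012345678 is a user pointer (bit 63 clear) with metadata
 * in bits 62:57.  With lam_bit = 56, sign_extend64(gva, 56) replicates
 * bit 56 (zero here) into bits 63:57, yielding 0x0000000012345678; masking
 * out bit 63 and OR-ing back the original bit 63 (also zero) leaves the
 * untagged address 0x0000000012345678, still a user address.
 */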
8515
8516 static unsigned int vmx_handle_intel_pt_intr(void)
8517 {
8518 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8519
8520 /* '0' on failure so that the !PT case can use a RET0 static call. */
8521 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8522 return 0;
8523
8524 kvm_make_request(KVM_REQ_PMI, vcpu);
8525 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8526 (unsigned long *)&vcpu->arch.pmu.global_status);
8527 return 1;
8528 }
8529
8530 static __init void vmx_setup_user_return_msrs(void)
8531 {
8532
8533 /*
8534 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, KVM
8535 * will emulate SYSCALL in legacy mode if the vendor string in guest
8536 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!". To
8537 * support this emulation, MSR_STAR is included in the list for i386,
8538 * but is never loaded into hardware. MSR_CSTAR is also never loaded
8539 * into hardware and is here purely for emulation purposes.
8540 */
8541 const u32 vmx_uret_msrs_list[] = {
8542 #ifdef CONFIG_X86_64
8543 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8544 #endif
8545 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8546 MSR_IA32_TSX_CTRL,
8547 };
8548 int i;
8549
8550 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8551
8552 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8553 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8554 }
8555
8556 static void __init vmx_setup_me_spte_mask(void)
8557 {
8558 u64 me_mask = 0;
8559
8560 /*
8561 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
8562 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
8563 * boot_cpu_data.x86_phys_bits holds the actual number of physical
8564 * address bits w/o the KeyID bits, and kvm_host.maxphyaddr equals the
8565 * MAXPHYADDR reported by CPUID. The bits in between are the KeyID bits.
8566 */
8567 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
8568 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8569 kvm_host.maxphyaddr - 1);
8570
8571 /*
8572 * Unlike SME, the host kernel doesn't support setting up any
8573 * MKTME KeyID on Intel platforms, so no memory encryption
8574 * bits should be included in the SPTE.
8575 */
8576 kvm_mmu_set_me_spte_mask(0, me_mask);
8577 }
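/*
 * Hypothetical example of the mask computed above: on a system with
 * boot_cpu_data.x86_phys_bits = 46 and kvm_host.maxphyaddr = 52, the six
 * KeyID bits 51:46 become me_mask = rsvd_bits(46, 51) = 0x000fc00000000000,
 * i.e. those bits are treated as reserved in the SPTE reserved-bit checks.
 */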
8578
8579 __init int vmx_hardware_setup(void)
8580 {
8581 unsigned long host_bndcfgs;
8582 struct desc_ptr dt;
8583 int r;
8584
8585 store_idt(&dt);
8586 host_idt_base = dt.address;
8587
8588 vmx_setup_user_return_msrs();
8589
8590 if (boot_cpu_has(X86_FEATURE_MPX)) {
8591 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
8592 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8593 }
8594
8595 if (!cpu_has_vmx_mpx())
8596 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8597 XFEATURE_MASK_BNDCSR);
8598
8599 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8600 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8601 enable_vpid = 0;
8602
8603 if (!cpu_has_vmx_ept() ||
8604 !cpu_has_vmx_ept_4levels() ||
8605 !cpu_has_vmx_ept_mt_wb() ||
8606 !cpu_has_vmx_invept_global())
8607 enable_ept = 0;
8608
8609 /* NX support is required for shadow paging. */
8610 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8611 pr_err_ratelimited("NX (Execute Disable) not supported\n");
8612 return -EOPNOTSUPP;
8613 }
8614
8615 /*
8616 * Shadow paging doesn't have a (further) performance penalty
8617 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
8618 * by default.
8619 */
8620 if (!enable_ept)
8621 allow_smaller_maxphyaddr = true;
8622
8623 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8624 enable_ept_ad_bits = 0;
8625
8626 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8627 enable_unrestricted_guest = 0;
8628
8629 if (!cpu_has_vmx_flexpriority())
8630 flexpriority_enabled = 0;
8631
8632 if (!cpu_has_virtual_nmis())
8633 enable_vnmi = 0;
8634
8635 #ifdef CONFIG_X86_SGX_KVM
8636 if (!cpu_has_vmx_encls_vmexit())
8637 enable_sgx = false;
8638 #endif
8639
8640 /*
8641 * set_apic_access_page_addr() is used to reload the APIC access
8642 * page upon invalidation. No need to do anything if KVM is not
8643 * using the APIC_ACCESS_ADDR VMCS field.
8644 */
8645 if (!flexpriority_enabled)
8646 vt_x86_ops.set_apic_access_page_addr = NULL;
8647
8648 if (!cpu_has_vmx_tpr_shadow())
8649 vt_x86_ops.update_cr8_intercept = NULL;
8650
8651 #if IS_ENABLED(CONFIG_HYPERV)
8652 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8653 && enable_ept) {
8654 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8655 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8656 }
8657 #endif
8658
8659 if (!cpu_has_vmx_ple()) {
8660 ple_gap = 0;
8661 ple_window = 0;
8662 ple_window_grow = 0;
8663 ple_window_max = 0;
8664 ple_window_shrink = 0;
8665 }
8666
8667 if (!cpu_has_vmx_apicv())
8668 enable_apicv = 0;
8669 if (!enable_apicv)
8670 vt_x86_ops.sync_pir_to_irr = NULL;
8671
8672 if (!enable_apicv || !cpu_has_vmx_ipiv())
8673 enable_ipiv = false;
8674
8675 if (cpu_has_vmx_tsc_scaling())
8676 kvm_caps.has_tsc_control = true;
8677
8678 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8679 kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8680 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8681 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8682
8683 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8684
8685 if (enable_ept)
8686 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8687 cpu_has_vmx_ept_execute_only());
8688 else
8689 vt_x86_ops.get_mt_mask = NULL;
8690
8691 /*
8692 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8693 * bits to shadow_zero_check.
8694 */
8695 vmx_setup_me_spte_mask();
8696
8697 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
8698 ept_caps_to_lpage_level(vmx_capability.ept));
8699
8700 /*
8701 * Only enable PML when hardware supports the PML feature, and both EPT
8702 * and EPT A/D bits are enabled -- PML depends on them to work.
8703 */
8704 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8705 enable_pml = 0;
8706
8707 if (!cpu_has_vmx_preemption_timer())
8708 enable_preemption_timer = false;
8709
8710 if (enable_preemption_timer) {
8711 u64 use_timer_freq = 5000ULL * 1000 * 1000;
8712
8713 cpu_preemption_timer_multi =
8714 vmx_misc_preemption_timer_rate(vmcs_config.misc);
8715
8716 if (tsc_khz)
8717 use_timer_freq = (u64)tsc_khz * 1000;
8718 use_timer_freq >>= cpu_preemption_timer_multi;
8719
8720 /*
8721 * KVM "disables" the preemption timer by setting it to its max
8722 * value. Don't use the timer if it might cause spurious exits
8723 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8724 */
8725 if (use_timer_freq > 0xffffffffu / 10)
8726 enable_preemption_timer = false;
8727 }
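/*
 * Example with made-up numbers: tsc_khz = 2000000 (a 2 GHz TSC) and a
 * rate shift of 5 give use_timer_freq = 2e9 >> 5 = 62.5 MHz, well below
 * the 0xffffffff / 10 (~429 MHz) cutoff, so the preemption timer stays
 * enabled; a "disabled" timer programmed to the max value would then
 * expire only every ~68 seconds, comfortably under the 0.1 Hz ceiling.
 */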
8728
8729 if (!enable_preemption_timer) {
8730 vt_x86_ops.set_hv_timer = NULL;
8731 vt_x86_ops.cancel_hv_timer = NULL;
8732 }
8733
8734 kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8735 kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8736
8737 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8738 return -EINVAL;
8739 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8740 pt_mode = PT_MODE_SYSTEM;
8741 if (pt_mode == PT_MODE_HOST_GUEST)
8742 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8743 else
8744 vt_init_ops.handle_intel_pt_intr = NULL;
8745
8746 setup_default_sgx_lepubkeyhash();
8747
8748 vmx_set_cpu_caps();
8749
8750 /*
8751 * Configure nested capabilities after core CPU capabilities so that
8752 * nested support can be conditional on base support, e.g. so that KVM
8753 * can hide/show features based on kvm_cpu_cap_has().
8754 */
8755 if (nested) {
8756 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8757 if (r)
8758 return r;
8759 }
8760
8761 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8762
8763 /*
8764 * On Intel CPUs that lack the self-snoop feature, letting the guest control
8765 * memory types may result in unexpected behavior. So always ignore guest
8766 * PAT on those CPUs and map guest memory as writeback, without allowing
8767 * userspace to disable the quirk.
8768 *
8769 * On certain Intel CPUs (e.g. SPR, ICX), even though the self-snoop feature
8770 * is supported, UC is slow enough to cause issues with some older guests (e.g.
8771 * an old version of the bochs driver uses ioremap() instead of ioremap_wc() to
8772 * map the video RAM, causing the Wayland desktop to fail to start
8773 * correctly). To avoid breaking those older guests that rely on KVM to force
8774 * the memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve
8775 * the safer (for performance) default behavior.
8776 *
8777 * On top of this, non-coherent DMA devices need the guest to flush CPU
8778 * caches properly. This also requires honoring guest PAT, and is forced
8779 * independently of the quirk in vmx_ignore_guest_pat().
8780 */
8781 if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
8782 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
8783
8784 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
8785
8786 return 0;
8787 }
8788
8789 void vmx_exit(void)
8790 {
8791 allow_smaller_maxphyaddr = false;
8792
8793 vmx_cleanup_l1d_flush();
8794
8795 kvm_x86_vendor_exit();
8796 }
8797
8798 int __init vmx_init(void)
8799 {
8800 int r, cpu;
8801
8802 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
8803
8804 if (!kvm_is_vmx_supported())
8805 return -EOPNOTSUPP;
8806
8807 /*
8808 * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
8809 * i.e. there's nothing to unwind if a later step fails.
8810 */
8811 hv_init_evmcs();
8812
8813 /*
8814 * Parse the VMCS config and VMX capabilities before anything else, so
8815 * that the information is available to all setup flows.
8816 */
8817 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8818 return -EIO;
8819
8820 r = kvm_x86_vendor_init(&vt_init_ops);
8821 if (r)
8822 return r;
8823
8824 /* Must be called after common x86 init so enable_ept is set up. */
8825 r = vmx_setup_l1d_flush();
8826 if (r)
8827 goto err_l1d_flush;
8828
8829 for_each_possible_cpu(cpu) {
8830 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8831
8832 pi_init_cpu(cpu);
8833 }
8834
8835 vmx_check_vmcs12_offsets();
8836
8837 return 0;
8838
8839 err_l1d_flush:
8840 kvm_x86_vendor_exit();
8841 return r;
8842 }
8843