1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/cpu.h>
3 #include <linux/cpumask.h>
4 #include <linux/errno.h>
5 #include <linux/kvm_types.h>
6 #include <linux/list.h>
7 #include <linux/percpu.h>
8
9 #include <asm/perf_event.h>
10 #include <asm/processor.h>
11 #include <asm/virt.h>
12 #include <asm/vmx.h>
13
14 struct x86_virt_ops {
15 int feature;
16 int (*enable_virtualization_cpu)(void);
17 int (*disable_virtualization_cpu)(void);
18 void (*emergency_disable_virtualization_cpu)(void);
19 };
20 static struct x86_virt_ops virt_ops __ro_after_init;
21
22 __visible bool virt_rebooting;
23 EXPORT_SYMBOL_FOR_KVM(virt_rebooting);
24
25 static DEFINE_PER_CPU(int, virtualization_nr_users);
26
27 static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;
28
/*
 * Install @callback as the hook that is run on the emergency disable path.
 * Only one callback can be registered at a time: if one is already present,
 * warn (once) and leave the existing registration untouched.
 */
void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback)
{
	if (!WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback)))
		rcu_assign_pointer(kvm_emergency_callback, callback);
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_register_emergency_callback);
37
/*
 * Remove @callback, which must be the currently registered emergency hook.
 * A mismatch triggers a one-time warning and changes nothing.  On success the
 * pointer is cleared and an RCU grace period is waited out before returning.
 */
void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback)
{
	if (!WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback) != callback)) {
		rcu_assign_pointer(kvm_emergency_callback, NULL);
		synchronize_rcu();
	}
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_unregister_emergency_callback);
47
x86_virt_invoke_kvm_emergency_callback(void)48 static void x86_virt_invoke_kvm_emergency_callback(void)
49 {
50 cpu_emergency_virt_cb *kvm_callback;
51
52 /*
53 * RCU may not be watching the crashing CPU here, so rcu_dereference()
54 * triggers a suspicious-RCU-usage splat. In principle, a concurrent
55 * KVM module unload could race with this read; see commit 2baa33a8ddd6
56 * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown")
57 * which notes that nothing prevents module unload during panic/reboot.
58 *
59 * However, taking a lock here would be riskier than the current race:
60 * the system is going down via NMI shootdown, and any lock could be
61 * held by an already-stopped CPU. Use rcu_dereference_raw() to silence
62 * the lockdep splat and accept the comically small remaining race;
63 * panic context inherently cannot guarantee complete correctness.
64 */
65 kvm_callback = rcu_dereference_raw(kvm_emergency_callback);
66 if (kvm_callback)
67 kvm_callback();
68 }
69
70 #if IS_ENABLED(CONFIG_KVM_INTEL)
71 static DEFINE_PER_CPU(struct vmcs *, root_vmcs);
72
x86_virt_cpu_vmxon(void)73 static int x86_virt_cpu_vmxon(void)
74 {
75 u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id()));
76 u64 msr;
77
78 cr4_set_bits(X86_CR4_VMXE);
79
80 asm goto("1: vmxon %[vmxon_pointer]\n\t"
81 _ASM_EXTABLE(1b, %l[fault])
82 : : [vmxon_pointer] "m"(vmxon_pointer)
83 : : fault);
84 return 0;
85
86 fault:
87 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
88 rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
89 cr4_clear_bits(X86_CR4_VMXE);
90
91 return -EFAULT;
92 }
93
x86_vmx_enable_virtualization_cpu(void)94 static int x86_vmx_enable_virtualization_cpu(void)
95 {
96 int r;
97
98 if (cr4_read_shadow() & X86_CR4_VMXE)
99 return -EBUSY;
100
101 intel_pt_handle_vmx(1);
102
103 r = x86_virt_cpu_vmxon();
104 if (r) {
105 intel_pt_handle_vmx(0);
106 return r;
107 }
108
109 return 0;
110 }
111
112 /*
113 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
114 *
115 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
116 * atomically track post-VMXON state, e.g. this may be called in NMI context.
117 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
118 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
119 * magically in RM, VM86, compat mode, or at CPL>0.
120 */
x86_vmx_disable_virtualization_cpu(void)121 static int x86_vmx_disable_virtualization_cpu(void)
122 {
123 int r = -EIO;
124
125 asm goto("1: vmxoff\n\t"
126 _ASM_EXTABLE(1b, %l[fault])
127 ::: "cc", "memory" : fault);
128 r = 0;
129
130 fault:
131 cr4_clear_bits(X86_CR4_VMXE);
132 intel_pt_handle_vmx(0);
133 return r;
134 }
135
x86_vmx_emergency_disable_virtualization_cpu(void)136 static void x86_vmx_emergency_disable_virtualization_cpu(void)
137 {
138 virt_rebooting = true;
139
140 /*
141 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
142 * set in task context. If this races with _another_ emergency call
143 * from NMI context, VMCLEAR (in KVM) and VMXOFF may #UD, but KVM and
144 * the kernel will eat those faults due to virt_rebooting being set by
145 * the interrupting NMI callback.
146 */
147 if (!(__read_cr4() & X86_CR4_VMXE))
148 return;
149
150 x86_virt_invoke_kvm_emergency_callback();
151
152 x86_vmx_disable_virtualization_cpu();
153 }
154
x86_vmx_exit(void)155 static __init void x86_vmx_exit(void)
156 {
157 int cpu;
158
159 for_each_possible_cpu(cpu) {
160 free_page((unsigned long)per_cpu(root_vmcs, cpu));
161 per_cpu(root_vmcs, cpu) = NULL;
162 }
163 }
164
__x86_vmx_init(void)165 static __init int __x86_vmx_init(void)
166 {
167 const struct x86_virt_ops vmx_ops = {
168 .feature = X86_FEATURE_VMX,
169 .enable_virtualization_cpu = x86_vmx_enable_virtualization_cpu,
170 .disable_virtualization_cpu = x86_vmx_disable_virtualization_cpu,
171 .emergency_disable_virtualization_cpu = x86_vmx_emergency_disable_virtualization_cpu,
172 };
173
174 u64 basic_msr;
175 u32 rev_id;
176 int cpu;
177
178 if (!cpu_feature_enabled(X86_FEATURE_VMX))
179 return -EOPNOTSUPP;
180
181 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
182
183 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
184 if (WARN_ON_ONCE(vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE))
185 return -EIO;
186
187 /*
188 * Even if eVMCS is enabled (or will be enabled?), and even though not
189 * explicitly documented by TLFS, the root VMCS passed to VMXON should
190 * still be marked with the revision_id reported by the physical CPU.
191 */
192 rev_id = vmx_basic_vmcs_revision_id(basic_msr);
193
194 for_each_possible_cpu(cpu) {
195 int node = cpu_to_node(cpu);
196 struct page *page;
197 struct vmcs *vmcs;
198
199 page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
200 if (WARN_ON_ONCE(!page)) {
201 x86_vmx_exit();
202 return -ENOMEM;
203 }
204
205 vmcs = page_address(page);
206 vmcs->hdr.revision_id = rev_id;
207 per_cpu(root_vmcs, cpu) = vmcs;
208 }
209
210 memcpy(&virt_ops, &vmx_ops, sizeof(virt_ops));
211 return 0;
212 }
213
x86_vmx_init(void)214 static __init int x86_vmx_init(void)
215 {
216 int r;
217
218 r = __x86_vmx_init();
219 if (r)
220 setup_clear_cpu_cap(X86_FEATURE_VMX);
221 return r;
222 }
223 #else
x86_vmx_init(void)224 static __init int x86_vmx_init(void) { return -EOPNOTSUPP; }
x86_vmx_exit(void)225 static __init void x86_vmx_exit(void) { }
226 #endif
227
228 #if IS_ENABLED(CONFIG_KVM_AMD)
x86_svm_enable_virtualization_cpu(void)229 static int x86_svm_enable_virtualization_cpu(void)
230 {
231 u64 efer;
232
233 rdmsrq(MSR_EFER, efer);
234 if (efer & EFER_SVME)
235 return -EBUSY;
236
237 wrmsrq(MSR_EFER, efer | EFER_SVME);
238 return 0;
239 }
240
x86_svm_disable_virtualization_cpu(void)241 static int x86_svm_disable_virtualization_cpu(void)
242 {
243 int r = -EIO;
244 u64 efer;
245
246 /*
247 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
248 * NMI aren't blocked.
249 */
250 asm goto("1: stgi\n\t"
251 _ASM_EXTABLE(1b, %l[fault])
252 ::: "memory" : fault);
253 r = 0;
254
255 fault:
256 rdmsrq(MSR_EFER, efer);
257 wrmsrq(MSR_EFER, efer & ~EFER_SVME);
258 return r;
259 }
260
x86_svm_emergency_disable_virtualization_cpu(void)261 static void x86_svm_emergency_disable_virtualization_cpu(void)
262 {
263 u64 efer;
264
265 virt_rebooting = true;
266
267 rdmsrq(MSR_EFER, efer);
268 if (!(efer & EFER_SVME))
269 return;
270
271 x86_virt_invoke_kvm_emergency_callback();
272
273 x86_svm_disable_virtualization_cpu();
274 }
275
x86_svm_init(void)276 static __init int x86_svm_init(void)
277 {
278 const struct x86_virt_ops svm_ops = {
279 .feature = X86_FEATURE_SVM,
280 .enable_virtualization_cpu = x86_svm_enable_virtualization_cpu,
281 .disable_virtualization_cpu = x86_svm_disable_virtualization_cpu,
282 .emergency_disable_virtualization_cpu = x86_svm_emergency_disable_virtualization_cpu,
283 };
284
285 if (!cpu_feature_enabled(X86_FEATURE_SVM) ||
286 cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
287 return -EOPNOTSUPP;
288
289 memcpy(&virt_ops, &svm_ops, sizeof(virt_ops));
290 return 0;
291 }
292 #else
x86_svm_init(void)293 static __init int x86_svm_init(void) { return -EOPNOTSUPP; }
294 #endif
295
x86_virt_get_ref(int feat)296 int x86_virt_get_ref(int feat)
297 {
298 int r;
299
300 /* Ensure the !feature check can't get false positives. */
301 BUILD_BUG_ON(!X86_FEATURE_SVM || !X86_FEATURE_VMX);
302
303 if (!virt_ops.feature || virt_ops.feature != feat)
304 return -EOPNOTSUPP;
305
306 guard(preempt)();
307
308 if (this_cpu_inc_return(virtualization_nr_users) > 1)
309 return 0;
310
311 r = virt_ops.enable_virtualization_cpu();
312 if (r)
313 WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users));
314
315 return r;
316 }
317 EXPORT_SYMBOL_FOR_KVM(x86_virt_get_ref);
318
x86_virt_put_ref(int feat)319 void x86_virt_put_ref(int feat)
320 {
321 guard(preempt)();
322
323 if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) ||
324 this_cpu_dec_return(virtualization_nr_users))
325 return;
326
327 BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting);
328 }
329 EXPORT_SYMBOL_FOR_KVM(x86_virt_put_ref);
330
331 /*
332 * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
333 * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
334 * GIF=0, i.e. if the crash occurred between CLGI and STGI.
335 */
x86_virt_emergency_disable_virtualization_cpu(void)336 int x86_virt_emergency_disable_virtualization_cpu(void)
337 {
338 if (!virt_ops.feature)
339 return -EOPNOTSUPP;
340
341 /*
342 * IRQs must be disabled as virtualization is enabled in hardware via
343 * function call IPIs, i.e. IRQs need to be disabled to guarantee
344 * virtualization stays disabled.
345 */
346 lockdep_assert_irqs_disabled();
347
348 /*
349 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
350 * other CPUs may have virtualization enabled.
351 *
352 * TODO: Track whether or not virtualization might be enabled on other
353 * CPUs? May not be worth avoiding the NMI shootdown...
354 */
355 virt_ops.emergency_disable_virtualization_cpu();
356 return 0;
357 }
358
x86_virt_init(void)359 void __init x86_virt_init(void)
360 {
361 /*
362 * Attempt to initialize both SVM and VMX, and simply use whichever one
363 * is present. Rsefuse to enable/use SVM or VMX if both are somehow
364 * supported. No known CPU supports both SVM and VMX.
365 */
366 bool has_vmx = !x86_vmx_init();
367 bool has_svm = !x86_svm_init();
368
369 if (WARN_ON_ONCE(has_vmx && has_svm)) {
370 x86_vmx_exit();
371 memset(&virt_ops, 0, sizeof(virt_ops));
372 }
373 }
374