1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/cpu.h> 3 #include <linux/cpumask.h> 4 #include <linux/errno.h> 5 #include <linux/kvm_types.h> 6 #include <linux/list.h> 7 #include <linux/percpu.h> 8 9 #include <asm/perf_event.h> 10 #include <asm/processor.h> 11 #include <asm/virt.h> 12 #include <asm/vmx.h> 13 14 struct x86_virt_ops { 15 int feature; 16 int (*enable_virtualization_cpu)(void); 17 int (*disable_virtualization_cpu)(void); 18 void (*emergency_disable_virtualization_cpu)(void); 19 }; 20 static struct x86_virt_ops virt_ops __ro_after_init; 21 22 __visible bool virt_rebooting; 23 EXPORT_SYMBOL_FOR_KVM(virt_rebooting); 24 25 static DEFINE_PER_CPU(int, virtualization_nr_users); 26 27 static cpu_emergency_virt_cb __rcu *kvm_emergency_callback; 28 29 void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback) 30 { 31 if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback))) 32 return; 33 34 rcu_assign_pointer(kvm_emergency_callback, callback); 35 } 36 EXPORT_SYMBOL_FOR_KVM(x86_virt_register_emergency_callback); 37 38 void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback) 39 { 40 if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback) != callback)) 41 return; 42 43 rcu_assign_pointer(kvm_emergency_callback, NULL); 44 synchronize_rcu(); 45 } 46 EXPORT_SYMBOL_FOR_KVM(x86_virt_unregister_emergency_callback); 47 48 static void x86_virt_invoke_kvm_emergency_callback(void) 49 { 50 cpu_emergency_virt_cb *kvm_callback; 51 52 /* 53 * RCU may not be watching the crashing CPU here, so rcu_dereference() 54 * triggers a suspicious-RCU-usage splat. In principle, a concurrent 55 * KVM module unload could race with this read; see commit 2baa33a8ddd6 56 * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") 57 * which notes that nothing prevents module unload during panic/reboot. 58 * 59 * However, taking a lock here would be riskier than the current race: 60 * the system is going down via NMI shootdown, and any lock could be 61 * held by an already-stopped CPU. Use rcu_dereference_raw() to silence 62 * the lockdep splat and accept the comically small remaining race; 63 * panic context inherently cannot guarantee complete correctness. 64 */ 65 kvm_callback = rcu_dereference_raw(kvm_emergency_callback); 66 if (kvm_callback) 67 kvm_callback(); 68 } 69 70 #if IS_ENABLED(CONFIG_KVM_INTEL) 71 static DEFINE_PER_CPU(struct vmcs *, root_vmcs); 72 73 static int x86_virt_cpu_vmxon(void) 74 { 75 u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id())); 76 u64 msr; 77 78 cr4_set_bits(X86_CR4_VMXE); 79 80 asm goto("1: vmxon %[vmxon_pointer]\n\t" 81 _ASM_EXTABLE(1b, %l[fault]) 82 : : [vmxon_pointer] "m"(vmxon_pointer) 83 : : fault); 84 return 0; 85 86 fault: 87 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 88 rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 89 cr4_clear_bits(X86_CR4_VMXE); 90 91 return -EFAULT; 92 } 93 94 static int x86_vmx_enable_virtualization_cpu(void) 95 { 96 int r; 97 98 if (cr4_read_shadow() & X86_CR4_VMXE) 99 return -EBUSY; 100 101 intel_pt_handle_vmx(1); 102 103 r = x86_virt_cpu_vmxon(); 104 if (r) { 105 intel_pt_handle_vmx(0); 106 return r; 107 } 108 109 return 0; 110 } 111 112 /* 113 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 114 * 115 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 116 * atomically track post-VMXON state, e.g. this may be called in NMI context. 117 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 118 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 119 * magically in RM, VM86, compat mode, or at CPL>0. 120 */ 121 static int x86_vmx_disable_virtualization_cpu(void) 122 { 123 int r = -EIO; 124 125 asm goto("1: vmxoff\n\t" 126 _ASM_EXTABLE(1b, %l[fault]) 127 ::: "cc", "memory" : fault); 128 r = 0; 129 130 fault: 131 cr4_clear_bits(X86_CR4_VMXE); 132 intel_pt_handle_vmx(0); 133 return r; 134 } 135 136 static void x86_vmx_emergency_disable_virtualization_cpu(void) 137 { 138 virt_rebooting = true; 139 140 /* 141 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 142 * set in task context. If this races with _another_ emergency call 143 * from NMI context, VMCLEAR (in KVM) and VMXOFF may #UD, but KVM and 144 * the kernel will eat those faults due to virt_rebooting being set by 145 * the interrupting NMI callback. 146 */ 147 if (!(__read_cr4() & X86_CR4_VMXE)) 148 return; 149 150 x86_virt_invoke_kvm_emergency_callback(); 151 152 x86_vmx_disable_virtualization_cpu(); 153 } 154 155 static __init void x86_vmx_exit(void) 156 { 157 int cpu; 158 159 for_each_possible_cpu(cpu) { 160 free_page((unsigned long)per_cpu(root_vmcs, cpu)); 161 per_cpu(root_vmcs, cpu) = NULL; 162 } 163 } 164 165 static __init int __x86_vmx_init(void) 166 { 167 const struct x86_virt_ops vmx_ops = { 168 .feature = X86_FEATURE_VMX, 169 .enable_virtualization_cpu = x86_vmx_enable_virtualization_cpu, 170 .disable_virtualization_cpu = x86_vmx_disable_virtualization_cpu, 171 .emergency_disable_virtualization_cpu = x86_vmx_emergency_disable_virtualization_cpu, 172 }; 173 174 u64 basic_msr; 175 u32 rev_id; 176 int cpu; 177 178 if (!cpu_feature_enabled(X86_FEATURE_VMX)) 179 return -EOPNOTSUPP; 180 181 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 182 183 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 184 if (WARN_ON_ONCE(vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)) 185 return -EIO; 186 187 /* 188 * Even if eVMCS is enabled (or will be enabled?), and even though not 189 * explicitly documented by TLFS, the root VMCS passed to VMXON should 190 * still be marked with the revision_id reported by the physical CPU. 191 */ 192 rev_id = vmx_basic_vmcs_revision_id(basic_msr); 193 194 for_each_possible_cpu(cpu) { 195 int node = cpu_to_node(cpu); 196 struct page *page; 197 struct vmcs *vmcs; 198 199 page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 200 if (WARN_ON_ONCE(!page)) { 201 x86_vmx_exit(); 202 return -ENOMEM; 203 } 204 205 vmcs = page_address(page); 206 vmcs->hdr.revision_id = rev_id; 207 per_cpu(root_vmcs, cpu) = vmcs; 208 } 209 210 memcpy(&virt_ops, &vmx_ops, sizeof(virt_ops)); 211 return 0; 212 } 213 214 static __init int x86_vmx_init(void) 215 { 216 int r; 217 218 r = __x86_vmx_init(); 219 if (r) 220 setup_clear_cpu_cap(X86_FEATURE_VMX); 221 return r; 222 } 223 #else 224 static __init int x86_vmx_init(void) { return -EOPNOTSUPP; } 225 static __init void x86_vmx_exit(void) { } 226 #endif 227 228 #if IS_ENABLED(CONFIG_KVM_AMD) 229 static int x86_svm_enable_virtualization_cpu(void) 230 { 231 u64 efer; 232 233 rdmsrq(MSR_EFER, efer); 234 if (efer & EFER_SVME) 235 return -EBUSY; 236 237 wrmsrq(MSR_EFER, efer | EFER_SVME); 238 return 0; 239 } 240 241 static int x86_svm_disable_virtualization_cpu(void) 242 { 243 int r = -EIO; 244 u64 efer; 245 246 /* 247 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and 248 * NMI aren't blocked. 249 */ 250 asm goto("1: stgi\n\t" 251 _ASM_EXTABLE(1b, %l[fault]) 252 ::: "memory" : fault); 253 r = 0; 254 255 fault: 256 rdmsrq(MSR_EFER, efer); 257 wrmsrq(MSR_EFER, efer & ~EFER_SVME); 258 return r; 259 } 260 261 static void x86_svm_emergency_disable_virtualization_cpu(void) 262 { 263 u64 efer; 264 265 virt_rebooting = true; 266 267 rdmsrq(MSR_EFER, efer); 268 if (!(efer & EFER_SVME)) 269 return; 270 271 x86_virt_invoke_kvm_emergency_callback(); 272 273 x86_svm_disable_virtualization_cpu(); 274 } 275 276 static __init int x86_svm_init(void) 277 { 278 const struct x86_virt_ops svm_ops = { 279 .feature = X86_FEATURE_SVM, 280 .enable_virtualization_cpu = x86_svm_enable_virtualization_cpu, 281 .disable_virtualization_cpu = x86_svm_disable_virtualization_cpu, 282 .emergency_disable_virtualization_cpu = x86_svm_emergency_disable_virtualization_cpu, 283 }; 284 285 if (!cpu_feature_enabled(X86_FEATURE_SVM) || 286 cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) 287 return -EOPNOTSUPP; 288 289 memcpy(&virt_ops, &svm_ops, sizeof(virt_ops)); 290 return 0; 291 } 292 #else 293 static __init int x86_svm_init(void) { return -EOPNOTSUPP; } 294 #endif 295 296 int x86_virt_get_ref(int feat) 297 { 298 int r; 299 300 /* Ensure the !feature check can't get false positives. */ 301 BUILD_BUG_ON(!X86_FEATURE_SVM || !X86_FEATURE_VMX); 302 303 if (!virt_ops.feature || virt_ops.feature != feat) 304 return -EOPNOTSUPP; 305 306 guard(preempt)(); 307 308 if (this_cpu_inc_return(virtualization_nr_users) > 1) 309 return 0; 310 311 r = virt_ops.enable_virtualization_cpu(); 312 if (r) 313 WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users)); 314 315 return r; 316 } 317 EXPORT_SYMBOL_FOR_KVM(x86_virt_get_ref); 318 319 void x86_virt_put_ref(int feat) 320 { 321 guard(preempt)(); 322 323 if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) || 324 this_cpu_dec_return(virtualization_nr_users)) 325 return; 326 327 BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting); 328 } 329 EXPORT_SYMBOL_FOR_KVM(x86_virt_put_ref); 330 331 /* 332 * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during 333 * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if 334 * GIF=0, i.e. if the crash occurred between CLGI and STGI. 335 */ 336 int x86_virt_emergency_disable_virtualization_cpu(void) 337 { 338 if (!virt_ops.feature) 339 return -EOPNOTSUPP; 340 341 /* 342 * IRQs must be disabled as virtualization is enabled in hardware via 343 * function call IPIs, i.e. IRQs need to be disabled to guarantee 344 * virtualization stays disabled. 345 */ 346 lockdep_assert_irqs_disabled(); 347 348 /* 349 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as 350 * other CPUs may have virtualization enabled. 351 * 352 * TODO: Track whether or not virtualization might be enabled on other 353 * CPUs? May not be worth avoiding the NMI shootdown... 354 */ 355 virt_ops.emergency_disable_virtualization_cpu(); 356 return 0; 357 } 358 359 void __init x86_virt_init(void) 360 { 361 /* 362 * Attempt to initialize both SVM and VMX, and simply use whichever one 363 * is present. Rsefuse to enable/use SVM or VMX if both are somehow 364 * supported. No known CPU supports both SVM and VMX. 365 */ 366 bool has_vmx = !x86_vmx_init(); 367 bool has_svm = !x86_svm_init(); 368 369 if (WARN_ON_ONCE(has_vmx && has_svm)) { 370 x86_vmx_exit(); 371 memset(&virt_ops, 0, sizeof(virt_ops)); 372 } 373 } 374