1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21
22 #pragma GCC poison to_vmx
23
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27 #define pr_tdx_error(__fn, __err) \
28 pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29
30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
31 pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
32
33 #define pr_tdx_error_1(__fn, __err, __rcx) \
34 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35
36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
37 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38
39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
40 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41
42 bool enable_tdx __ro_after_init;
43 module_param_named(tdx, enable_tdx, bool, 0444);
44
45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
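/*
 * Note: the shared bit is the top bit of the guest physical address space:
 * GPA bit 51 with a 5-level Secure EPT (GPAW 52) and GPA bit 47 with a
 * 4-level Secure EPT (GPAW 48).  gpa_to_gfn() shifts by PAGE_SHIFT, so the
 * values above correspond to GFN bits 39 and 35 respectively.
 */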
47
48 static enum cpuhp_state tdx_cpuhp_state;
49
50 static const struct tdx_sys_info *tdx_sysinfo;
51
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53 {
54 KVM_BUG_ON(1, tdx->vcpu.kvm);
55 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56 }
57
void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
                      u64 val, u64 err)
60 {
61 KVM_BUG_ON(1, tdx->vcpu.kvm);
62 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63 }
64
65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68 {
69 return container_of(kvm, struct kvm_tdx, kvm);
70 }
71
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73 {
74 return container_of(vcpu, struct vcpu_tdx, vcpu);
75 }
76
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78 {
79 u64 val = KVM_SUPPORTED_TD_ATTRS;
80
81 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82 return 0;
83
84 val &= td_conf->attributes_fixed0;
85
86 return val;
87 }
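/*
 * In tdx_get_supported_attrs() above and tdx_get_supported_xfam() below, the
 * fixed masks follow the familiar VMX fixed-bit convention: fixed1 bits must
 * be 1, fixed0 bits are the only ones allowed to be 1.  If KVM can't provide
 * a fixed1 bit, nothing can be supported (return 0); everything else is
 * clamped to the fixed0 mask.
 */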
88
static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90 {
91 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92
93 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94 return 0;
95
96 val &= td_conf->xfam_fixed0;
97
98 return val;
99 }
100
static int tdx_get_guest_phys_addr_bits(const u32 eax)
102 {
103 return (eax & GENMASK(23, 16)) >> 16;
104 }
105
static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107 {
108 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109 }
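/*
 * Example: CPUID.0x80000008:EAX encodes the guest physical address width in
 * bits 23:16.  tdx_get_guest_phys_addr_bits(0x00343028) returns 0x34 (52),
 * and tdx_set_guest_phys_addr_bits(0x00003028, 52) yields 0x00343028.
 */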
110
111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112
static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114 {
115 return entry->function == 7 && entry->index == 0 &&
116 (entry->ebx & TDX_FEATURE_TSX);
117 }
118
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120 {
121 entry->ebx &= ~TDX_FEATURE_TSX;
122 }
123
static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125 {
126 return entry->function == 7 && entry->index == 0 &&
127 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128 }
129
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131 {
132 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133 }
134
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136 {
137 if (has_tsx(entry))
138 clear_tsx(entry);
139
140 if (has_waitpkg(entry))
141 clear_waitpkg(entry);
142 }
143
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145 {
146 return has_tsx(entry) || has_waitpkg(entry);
147 }
148
149 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
150
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152 {
153 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154
155 entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156 entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157 entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161
162 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163 entry->index = 0;
164
165 /*
166 * The TDX module doesn't allow configuring the guest phys addr bits
167 * (EAX[23:16]). However, KVM uses it as an interface to the userspace
168 * to configure the GPAW. Report these bits as configurable.
169 */
170 if (entry->function == 0x80000008)
171 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172
173 tdx_clear_unsupported_cpuid(entry);
174 }
175
static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
                             struct kvm_tdx_capabilities *caps)
178 {
179 int i;
180
181 caps->supported_attrs = tdx_get_supported_attrs(td_conf);
182 if (!caps->supported_attrs)
183 return -EIO;
184
185 caps->supported_xfam = tdx_get_supported_xfam(td_conf);
186 if (!caps->supported_xfam)
187 return -EIO;
188
189 caps->cpuid.nent = td_conf->num_cpuid_config;
190
191 for (i = 0; i < td_conf->num_cpuid_config; i++)
192 td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
193
194 return 0;
195 }
196
197 /*
198 * Some SEAMCALLs acquire the TDX module globally, and can fail with
199 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
200 */
201 static DEFINE_MUTEX(tdx_lock);
202
203 static atomic_t nr_configured_hkid;
204
static bool tdx_operand_busy(u64 err)
206 {
207 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
208 }
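/*
 * Masking with TDX_SEAMCALL_STATUS_MASK strips the operand-ID details, so
 * this matches TDX_OPERAND_BUSY regardless of which operand (e.g. the SEPT
 * tree or the TD epoch) was contended.
 */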
209
210
211 /*
212 * A per-CPU list of TD vCPUs associated with a given CPU.
213 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
214 * list.
215 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
216 * the old CPU during the IPI callback running on the old CPU, and then added
217 * to the per-CPU list of the new CPU.
218 * - When a TD is tearing down, all vCPUs are disassociated from their current
219 * running CPUs and removed from the per-CPU list during the IPI callback
220 * running on those CPUs.
221 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
222 * associated TD vCPUs and remove them from the per-CPU list.
223 */
224 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
225
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
227 {
228 return to_tdx(vcpu)->vp_enter_args.r10;
229 }
230
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
232 {
233 return to_tdx(vcpu)->vp_enter_args.r11;
234 }
235
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
                                                     long val)
238 {
239 to_tdx(vcpu)->vp_enter_args.r10 = val;
240 }
241
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
                                                    unsigned long val)
244 {
245 to_tdx(vcpu)->vp_enter_args.r11 = val;
246 }
247
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
249 {
250 tdx_guest_keyid_free(kvm_tdx->hkid);
251 kvm_tdx->hkid = -1;
252 atomic_dec(&nr_configured_hkid);
253 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
254 put_misc_cg(kvm_tdx->misc_cg);
255 kvm_tdx->misc_cg = NULL;
256 }
257
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
259 {
260 return kvm_tdx->hkid > 0;
261 }
262
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
264 {
265 lockdep_assert_irqs_disabled();
266
267 list_del(&to_tdx(vcpu)->cpu_list);
268
269 /*
270 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
271 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
272 * to its list before it's deleted from this CPU's list.
273 */
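        /* Pairs with the smp_rmb() in tdx_vcpu_load(). */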
274 smp_wmb();
275
276 vcpu->cpu = -1;
277 }
278
static void tdx_clear_page(struct page *page)
280 {
281 const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
282 void *dest = page_to_virt(page);
283 unsigned long i;
284
285 /*
286 * The page could have been poisoned. MOVDIR64B also clears
287 * the poison bit so the kernel can safely use the page again.
288 */
289 for (i = 0; i < PAGE_SIZE; i += 64)
290 movdir64b(dest + i, zero_page);
291 /*
292 * MOVDIR64B store uses WC buffer. Prevent following memory reads
293 * from seeing potentially poisoned cache.
294 */
295 __mb();
296 }
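/*
 * MOVDIR64B writes 64 bytes per iteration, so the loop above issues
 * PAGE_SIZE / 64 == 64 direct stores to clear one 4KiB page.
 */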
297
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
299 {
300 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
301
302 lockdep_assert_held_write(&kvm->mmu_lock);
303
304 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
305
306 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
307 }
308
static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
310 {
311 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
312
313 lockdep_assert_held_write(&kvm->mmu_lock);
314
315 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
316 }
317
318 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
320 {
321 u64 err, rcx, rdx, r8;
322
323 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
324
325 /*
326 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
327 * before the HKID is released and control pages have also been
328 * released at this point, so there is no possibility of contention.
329 */
330 if (WARN_ON_ONCE(err)) {
331 pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
332 return -EIO;
333 }
334 return 0;
335 }
336
static int tdx_reclaim_page(struct page *page)
338 {
339 int r;
340
341 r = __tdx_reclaim_page(page);
342 if (!r)
343 tdx_clear_page(page);
344 return r;
345 }
346
347
348 /*
349 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
350 * private KeyID. Assume the cache associated with the TDX private KeyID has
351 * been flushed.
352 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
354 {
355 /*
356 * Leak the page if the kernel failed to reclaim the page.
357 * The kernel cannot use it safely anymore.
358 */
359 if (tdx_reclaim_page(ctrl_page))
360 return;
361
362 __free_page(ctrl_page);
363 }
364
365 struct tdx_flush_vp_arg {
366 struct kvm_vcpu *vcpu;
367 u64 err;
368 };
369
static void tdx_flush_vp(void *_arg)
371 {
372 struct tdx_flush_vp_arg *arg = _arg;
373 struct kvm_vcpu *vcpu = arg->vcpu;
374 u64 err;
375
376 arg->err = 0;
377 lockdep_assert_irqs_disabled();
378
379 /* Task migration can race with CPU offlining. */
380 if (unlikely(vcpu->cpu != raw_smp_processor_id()))
381 return;
382
383 /*
384 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
385 * list tracking still needs to be updated so that it's correct if/when
386 * the vCPU does get initialized.
387 */
388 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
389 /*
390 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
391 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
 * VP flush function is called when destroying a vCPU/TD or during vCPU
 * migration.  No other thread uses the TDVPR in those cases.
394 */
395 err = tdh_vp_flush(&to_tdx(vcpu)->vp);
396 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
397 /*
398 * This function is called in IPI context. Do not use
399 * printk to avoid console semaphore.
400 * The caller prints out the error message, instead.
401 */
402 if (err)
403 arg->err = err;
404 }
405 }
406
407 tdx_disassociate_vp(vcpu);
408 }
409
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
411 {
412 struct tdx_flush_vp_arg arg = {
413 .vcpu = vcpu,
414 };
415 int cpu = vcpu->cpu;
416
417 if (unlikely(cpu == -1))
418 return;
419
420 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
421 if (KVM_BUG_ON(arg.err, vcpu->kvm))
422 pr_tdx_error(TDH_VP_FLUSH, arg.err);
423 }
424
void tdx_disable_virtualization_cpu(void)
426 {
427 int cpu = raw_smp_processor_id();
428 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
429 struct tdx_flush_vp_arg arg;
430 struct vcpu_tdx *tdx, *tmp;
431 unsigned long flags;
432
433 local_irq_save(flags);
434 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
435 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
436 arg.vcpu = &tdx->vcpu;
437 tdx_flush_vp(&arg);
438 }
439 local_irq_restore(flags);
440 }
441
442 #define TDX_SEAMCALL_RETRIES 10000
443
static void smp_func_do_phymem_cache_wb(void *unused)
445 {
446 u64 err = 0;
447 bool resume;
448 int i;
449
450 /*
451 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
452 * KeyID on the package or core. The TDX module may not finish the
 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead.  The
454 * kernel should retry it until it returns success w/o rescheduling.
455 */
456 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
457 resume = !!err;
458 err = tdh_phymem_cache_wb(resume);
459 switch (err) {
460 case TDX_INTERRUPTED_RESUMABLE:
461 continue;
462 case TDX_NO_HKID_READY_TO_WBCACHE:
463 err = TDX_SUCCESS; /* Already done by other thread */
464 fallthrough;
465 default:
466 goto out;
467 }
468 }
469
470 out:
471 if (WARN_ON_ONCE(err))
472 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
473 }
474
void tdx_mmu_release_hkid(struct kvm *kvm)
476 {
477 bool packages_allocated, targets_allocated;
478 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
479 cpumask_var_t packages, targets;
480 struct kvm_vcpu *vcpu;
481 unsigned long j;
482 int i;
483 u64 err;
484
485 if (!is_hkid_assigned(kvm_tdx))
486 return;
487
488 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
489 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
490 cpus_read_lock();
491
492 kvm_for_each_vcpu(j, vcpu, kvm)
493 tdx_flush_vp_on_cpu(vcpu);
494
495 /*
496 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
497 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
 * Multiple TDX guests can be destroyed simultaneously.  Take the
 * mutex to prevent TDH.PHYMEM.CACHE.WB from returning an error.
500 */
501 mutex_lock(&tdx_lock);
502
503 /*
504 * Releasing HKID is in vm_destroy().
505 * After the above flushing vps, there should be no more vCPU
506 * associations, as all vCPU fds have been released at this stage.
507 */
508 err = tdh_mng_vpflushdone(&kvm_tdx->td);
509 if (err == TDX_FLUSHVP_NOT_DONE)
510 goto out;
511 if (KVM_BUG_ON(err, kvm)) {
512 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
513 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
514 kvm_tdx->hkid);
515 goto out;
516 }
517
518 for_each_online_cpu(i) {
519 if (packages_allocated &&
520 cpumask_test_and_set_cpu(topology_physical_package_id(i),
521 packages))
522 continue;
523 if (targets_allocated)
524 cpumask_set_cpu(i, targets);
525 }
526 if (targets_allocated)
527 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
528 else
529 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
530 /*
531 * In the case of error in smp_func_do_phymem_cache_wb(), the following
532 * tdh_mng_key_freeid() will fail.
533 */
534 err = tdh_mng_key_freeid(&kvm_tdx->td);
535 if (KVM_BUG_ON(err, kvm)) {
536 pr_tdx_error(TDH_MNG_KEY_FREEID, err);
537 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
538 kvm_tdx->hkid);
539 } else {
540 tdx_hkid_free(kvm_tdx);
541 }
542
543 out:
544 mutex_unlock(&tdx_lock);
545 cpus_read_unlock();
546 free_cpumask_var(targets);
547 free_cpumask_var(packages);
548 }
549
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
551 {
552 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
553 u64 err;
554 int i;
555
556 /*
 * tdx_mmu_release_hkid() failed to reclaim the HKID.  Something went
 * seriously wrong with the TDX module.  Give up freeing the TD pages; the
 * function already warned, so don't warn again here.
560 */
561 if (is_hkid_assigned(kvm_tdx))
562 return;
563
564 if (kvm_tdx->td.tdcs_pages) {
565 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
566 if (!kvm_tdx->td.tdcs_pages[i])
567 continue;
568
569 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
570 }
571 kfree(kvm_tdx->td.tdcs_pages);
572 kvm_tdx->td.tdcs_pages = NULL;
573 }
574
575 if (!kvm_tdx->td.tdr_page)
576 return;
577
578 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
579 return;
580
581 /*
582 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
 * KeyID.  The TDX module may access the TDR while operating on the TD
 * (especially when it is reclaiming the TDCS).
585 */
586 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
587 if (KVM_BUG_ON(err, kvm)) {
588 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
589 return;
590 }
591 tdx_clear_page(kvm_tdx->td.tdr_page);
592
593 __free_page(kvm_tdx->td.tdr_page);
594 kvm_tdx->td.tdr_page = NULL;
595 }
596
void tdx_vm_destroy(struct kvm *kvm)
598 {
599 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
600
601 tdx_reclaim_td_control_pages(kvm);
602
603 kvm_tdx->state = TD_STATE_UNINITIALIZED;
604 }
605
static int tdx_do_tdh_mng_key_config(void *param)
607 {
608 struct kvm_tdx *kvm_tdx = param;
609 u64 err;
610
611 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
612 err = tdh_mng_key_config(&kvm_tdx->td);
613
614 if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
615 pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
616 return -EIO;
617 }
618
619 return 0;
620 }
621
int tdx_vm_init(struct kvm *kvm)
623 {
624 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
625
626 kvm->arch.has_protected_state = true;
627 kvm->arch.has_private_mem = true;
628 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
629
630 /*
 * Because the guest TD is protected, the VMM can't parse instructions in
 * the TD.  Instead, the guest uses the MMIO hypercall.  For unmodified
 * device drivers, a #VE needs to be injected for MMIO, and the #VE handler
 * in the TD converts the MMIO instruction into an MMIO hypercall.
 *
 * The SPTE value for MMIO needs to be set up so that #VE is injected into
 * the TD instead of triggering EPT MISCONFIG:
 * - RWX=0 so that an EPT violation is triggered.
 * - the suppress #VE bit is cleared to inject #VE.
640 */
641 kvm_mmu_set_mmio_spte_value(kvm, 0);
642
643 /*
644 * TDX has its own limit of maximum vCPUs it can support for all
645 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
646 * such limit via the MAX_VCPU_PER_TD global metadata. In
647 * practice, it reflects the number of logical CPUs that ALL
648 * platforms that the TDX module supports can possibly have.
649 *
650 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
651 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
652 * userspace would result in an unpredictable ABI.
653 */
654 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
655
656 kvm_tdx->state = TD_STATE_UNINITIALIZED;
657
658 return 0;
659 }
660
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
662 {
663 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
664 struct vcpu_tdx *tdx = to_tdx(vcpu);
665
666 if (kvm_tdx->state != TD_STATE_INITIALIZED)
667 return -EIO;
668
669 /*
670 * TDX module mandates APICv, which requires an in-kernel local APIC.
671 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
672 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
673 */
674 if (!irqchip_split(vcpu->kvm))
675 return -EINVAL;
676
677 fpstate_set_confidential(&vcpu->arch.guest_fpu);
678 vcpu->arch.apic->guest_apic_protected = true;
679 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
680
681 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
682
683 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
684 vcpu->arch.cr0_guest_owned_bits = -1ul;
685 vcpu->arch.cr4_guest_owned_bits = -1ul;
686
687 /* KVM can't change TSC offset/multiplier as TDX module manages them. */
688 vcpu->arch.guest_tsc_protected = true;
689 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
690 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
691 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
692 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
693
694 vcpu->arch.guest_state_protected =
695 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
696
697 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
698 vcpu->arch.xfd_no_write_intercept = true;
699
700 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
701 __pi_set_sn(&tdx->vt.pi_desc);
702
703 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
704
705 return 0;
706 }
707
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
709 {
710 struct vcpu_tdx *tdx = to_tdx(vcpu);
711
712 vmx_vcpu_pi_load(vcpu, cpu);
713 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
714 return;
715
716 tdx_flush_vp_on_cpu(vcpu);
717
718 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
719 local_irq_disable();
720 /*
721 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
722 * vcpu->cpu is read before tdx->cpu_list.
723 */
724 smp_rmb();
725
726 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
727 local_irq_enable();
728 }
729
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
731 {
732 /*
733 * KVM can't get the interrupt status of TDX guest and it assumes
734 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
735 * which passes the interrupt blocked flag.
736 */
737 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
738 !to_tdx(vcpu)->vp_enter_args.r12;
739 }
740
bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
742 {
743 u64 vcpu_state_details;
744
745 if (pi_has_pending_interrupt(vcpu))
746 return true;
747
748 /*
749 * Only check RVI pending for HALTED case with IRQ enabled.
750 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
751 * interrupt was pending before TD exit, then it _must_ be blocked,
752 * otherwise the interrupt would have been serviced at the instruction
753 * boundary.
754 */
755 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
756 to_tdx(vcpu)->vp_enter_args.r12)
757 return false;
758
759 vcpu_state_details =
760 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
761
762 return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
763 }
764
765 /*
766 * Compared to vmx_prepare_switch_to_guest(), there is not much to do
767 * as SEAMCALL/SEAMRET calls take care of most of save and restore.
768 */
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
770 {
771 struct vcpu_vt *vt = to_vt(vcpu);
772
773 if (vt->guest_state_loaded)
774 return;
775
776 if (likely(is_64bit_mm(current->mm)))
777 vt->msr_host_kernel_gs_base = current->thread.gsbase;
778 else
779 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
780
781 vt->host_debugctlmsr = get_debugctlmsr();
782
783 vt->guest_state_loaded = true;
784 }
785
786 struct tdx_uret_msr {
787 u32 msr;
788 unsigned int slot;
789 u64 defval;
790 };
791
792 static struct tdx_uret_msr tdx_uret_msrs[] = {
793 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
794 {.msr = MSR_STAR,},
795 {.msr = MSR_LSTAR,},
796 {.msr = MSR_TSC_AUX,},
797 };
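/*
 * These are the user-return MSRs that change across TDH.VP.ENTER; the cached
 * defval is presumably the value the TDX module leaves behind on TD-exit.
 * As a point of reference, 0x20200 for MSR_SYSCALL_MASK corresponds to
 * EFLAGS.VM | EFLAGS.IF (bits 17 and 9), i.e. SYSCALL clears VM and IF.
 */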
798
static void tdx_user_return_msr_update_cache(void)
800 {
801 int i;
802
803 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
804 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
805 tdx_uret_msrs[i].defval);
806 }
807
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
809 {
810 struct vcpu_vt *vt = to_vt(vcpu);
811 struct vcpu_tdx *tdx = to_tdx(vcpu);
812
813 if (!vt->guest_state_loaded)
814 return;
815
816 ++vcpu->stat.host_state_reload;
817 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
818
819 if (tdx->guest_entered) {
820 tdx_user_return_msr_update_cache();
821 tdx->guest_entered = false;
822 }
823
824 vt->guest_state_loaded = false;
825 }
826
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
828 {
829 vmx_vcpu_pi_put(vcpu);
830 tdx_prepare_switch_to_host(vcpu);
831 }
832
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
834 {
835 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
836 struct vcpu_tdx *tdx = to_tdx(vcpu);
837 int i;
838
839 /*
840 * It is not possible to reclaim pages while hkid is assigned. It might
841 * be assigned if:
842 * 1. the TD VM is being destroyed but freeing hkid failed, in which
843 * case the pages are leaked
844 * 2. TD VCPU creation failed and this on the error path, in which case
845 * there is nothing to do anyway
846 */
847 if (is_hkid_assigned(kvm_tdx))
848 return;
849
850 if (tdx->vp.tdcx_pages) {
851 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
852 if (tdx->vp.tdcx_pages[i])
853 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
854 }
855 kfree(tdx->vp.tdcx_pages);
856 tdx->vp.tdcx_pages = NULL;
857 }
858 if (tdx->vp.tdvpr_page) {
859 tdx_reclaim_control_page(tdx->vp.tdvpr_page);
tdx->vp.tdvpr_page = NULL;
861 }
862
863 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
864 }
865
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
867 {
868 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
869 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
870 return -EINVAL;
871
872 return 1;
873 }
874
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
876 {
877 switch (tdvmcall_leaf(vcpu)) {
878 case EXIT_REASON_CPUID:
879 case EXIT_REASON_HLT:
880 case EXIT_REASON_IO_INSTRUCTION:
881 case EXIT_REASON_MSR_READ:
882 case EXIT_REASON_MSR_WRITE:
883 return tdvmcall_leaf(vcpu);
884 case EXIT_REASON_EPT_VIOLATION:
885 return EXIT_REASON_EPT_MISCONFIG;
886 default:
887 break;
888 }
889
890 return EXIT_REASON_TDCALL;
891 }
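/*
 * Note that TDVMCALL<MMIO> reuses the EXIT_REASON_EPT_VIOLATION leaf number
 * but is funneled into KVM's EPT_MISCONFIG handling above, since that is the
 * path KVM uses for emulated MMIO.
 */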
892
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
894 {
895 struct vcpu_tdx *tdx = to_tdx(vcpu);
896 u32 exit_reason;
897
898 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
899 case TDX_SUCCESS:
900 case TDX_NON_RECOVERABLE_VCPU:
901 case TDX_NON_RECOVERABLE_TD:
902 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
903 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
904 break;
905 default:
906 return -1u;
907 }
908
909 exit_reason = tdx->vp_enter_ret;
910
911 switch (exit_reason) {
912 case EXIT_REASON_TDCALL:
913 if (tdvmcall_exit_type(vcpu))
914 return EXIT_REASON_VMCALL;
915
916 return tdcall_to_vmx_exit_reason(vcpu);
917 case EXIT_REASON_EPT_MISCONFIG:
918 /*
919 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
920 * non-instrumentable code with interrupts disabled.
921 */
922 return -1u;
923 default:
924 break;
925 }
926
927 return exit_reason;
928 }
929
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
931 {
932 struct vcpu_tdx *tdx = to_tdx(vcpu);
933 struct vcpu_vt *vt = to_vt(vcpu);
934
935 guest_state_enter_irqoff();
936
937 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
938
939 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
940
941 vt->exit_qualification = tdx->vp_enter_args.rcx;
942 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
943 tdx->exit_gpa = tdx->vp_enter_args.r8;
944 vt->exit_intr_info = tdx->vp_enter_args.r9;
945
946 vmx_handle_nmi(vcpu);
947
948 guest_state_exit_irqoff();
949 }
950
static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
952 {
953 return vmx_get_exit_reason(vcpu).failed_vmentry &&
954 vmx_get_exit_reason(vcpu).full != -1u;
955 }
956
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
958 {
959 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
960
961 /*
962 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
963 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
964 *
965 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
966 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
967 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
968 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
969 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
970 * requester may be blocked endlessly.
971 */
972 if (unlikely(tdx_operand_busy(vp_enter_ret)))
973 return EXIT_FASTPATH_EXIT_HANDLED;
974
975 return EXIT_FASTPATH_NONE;
976 }
977
978 #define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
979 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
980 BIT_ULL(VCPU_REGS_RAX) | \
981 BIT_ULL(VCPU_REGS_RBX) | \
982 BIT_ULL(VCPU_REGS_RCX) | \
983 BIT_ULL(VCPU_REGS_RDX) | \
984 BIT_ULL(VCPU_REGS_RBP) | \
985 BIT_ULL(VCPU_REGS_RSI) | \
986 BIT_ULL(VCPU_REGS_RDI) | \
987 BIT_ULL(VCPU_REGS_R8) | \
988 BIT_ULL(VCPU_REGS_R9) | \
989 BIT_ULL(VCPU_REGS_R10) | \
990 BIT_ULL(VCPU_REGS_R11) | \
991 BIT_ULL(VCPU_REGS_R12) | \
992 BIT_ULL(VCPU_REGS_R13) | \
993 BIT_ULL(VCPU_REGS_R14) | \
994 BIT_ULL(VCPU_REGS_R15))
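/*
 * VCPU_REGS_RSP and VCPU_REGS_RIP are absent from the set above: the TDX
 * module does not expose them to the VMM for a protected guest, so KVM must
 * not treat them as available after a TD-exit.
 */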
995
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
997 {
998 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
999
1000 /*
1001 * All TDX hosts support PKRU; but even if they didn't,
1002 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1003 * skipped.
1004 */
1005 if (vcpu->arch.host_pkru != 0)
1006 wrpkru(vcpu->arch.host_pkru);
1007
1008 if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1009 xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1010
1011 /*
 * Likewise, even if a TDX host didn't support XSS, both arms of
 * the comparison would be 0 and the wrmsrl would be skipped.
1014 */
1015 if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1016 wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1017 }
1018
1019 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1020 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1021 DEBUGCTLMSR_FREEZE_IN_SMM)
1022
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
1024 {
1025 struct vcpu_tdx *tdx = to_tdx(vcpu);
1026 struct vcpu_vt *vt = to_vt(vcpu);
1027
1028 /*
 * force_immediate_exit requires entering the vCPU for event injection,
 * followed by an immediate exit.  But the TDX module doesn't guarantee
 * entry; it's already possible for KVM to _think_ it completely entered
 * the guest without actually having done so.
1033 * Since KVM never needs to force an immediate exit for TDX, and can't
1034 * do direct injection, just warn on force_immediate_exit.
1035 */
1036 WARN_ON_ONCE(force_immediate_exit);
1037
1038 /*
1039 * Wait until retry of SEPT-zap-related SEAMCALL completes before
1040 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
1041 * TDCALLs.
1042 */
1043 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1044 return EXIT_FASTPATH_EXIT_HANDLED;
1045
1046 trace_kvm_entry(vcpu, force_immediate_exit);
1047
1048 if (pi_test_on(&vt->pi_desc)) {
1049 apic->send_IPI_self(POSTED_INTR_VECTOR);
1050
1051 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1052 APIC_VECTOR_MASK, &vt->pi_desc))
1053 kvm_wait_lapic_expire(vcpu);
1054 }
1055
1056 tdx_vcpu_enter_exit(vcpu);
1057
1058 if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
1059 update_debugctlmsr(vt->host_debugctlmsr);
1060
1061 tdx_load_host_xsave_state(vcpu);
1062 tdx->guest_entered = true;
1063
1064 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1065
1066 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1067 return EXIT_FASTPATH_NONE;
1068
1069 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1070 return EXIT_FASTPATH_NONE;
1071
1072 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1073 kvm_machine_check();
1074
1075 trace_kvm_exit(vcpu, KVM_ISA_VMX);
1076
1077 if (unlikely(tdx_failed_vmentry(vcpu)))
1078 return EXIT_FASTPATH_NONE;
1079
1080 return tdx_exit_handlers_fastpath(vcpu);
1081 }
1082
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1084 {
1085 ++vcpu->stat.nmi_injections;
1086 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1087 /*
1088 * From KVM's perspective, NMI injection is completed right after
1089 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1090 * the TDX module or not.
1091 */
1092 vcpu->arch.nmi_injected = false;
1093 /*
1094 * TDX doesn't support KVM to request NMI window exit. If there is
1095 * still a pending vNMI, KVM is not able to inject it along with the
1096 * one pending in TDX module in a back-to-back way. Since the previous
1097 * vNMI is still pending in TDX module, i.e. it has not been delivered
1098 * to TDX guest yet, it's OK to collapse the pending vNMI into the
1099 * previous one. The guest is expected to handle all the NMI sources
1100 * when handling the first vNMI.
1101 */
1102 vcpu->arch.nmi_pending = 0;
1103 }
1104
static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1106 {
1107 u32 intr_info = vmx_get_intr_info(vcpu);
1108
1109 /*
1110 * Machine checks are handled by handle_exception_irqoff(), or by
1111 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1112 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1113 */
1114 if (is_nmi(intr_info) || is_machine_check(intr_info))
1115 return 1;
1116
1117 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1118 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1119 vcpu->run->ex.error_code = 0;
1120
1121 return 0;
1122 }
1123
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1125 {
1126 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1127 return 1;
1128 }
1129
static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1131 {
1132 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1133 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1134 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1135 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1136 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1137
1138 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1139 }
1140
1141 /*
1142 * Split into chunks and check interrupt pending between chunks. This allows
1143 * for timely injection of interrupts to prevent issues with guest lockup
1144 * detection.
1145 */
1146 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
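/*
 * For example, a 1GiB TDG.VP.VMCALL<MapGPA> request is processed as 512
 * chunks of 2MiB, with a pending-interrupt check between chunks.
 */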
1147 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1148
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1150 {
1151 struct vcpu_tdx *tdx = to_tdx(vcpu);
1152
1153 if (vcpu->run->hypercall.ret) {
1154 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1155 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1156 return 1;
1157 }
1158
1159 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1160 if (tdx->map_gpa_next >= tdx->map_gpa_end)
1161 return 1;
1162
1163 /*
 * Stop processing the remaining part if there is a pending interrupt
 * that could qualify for delivery.  Skip checking pending RVI for
1166 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1167 */
1168 if (kvm_vcpu_has_events(vcpu)) {
1169 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1170 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1171 return 1;
1172 }
1173
1174 __tdx_map_gpa(tdx);
1175 return 0;
1176 }
1177
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1179 {
1180 u64 gpa = tdx->map_gpa_next;
1181 u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1182
1183 if (size > TDX_MAP_GPA_MAX_LEN)
1184 size = TDX_MAP_GPA_MAX_LEN;
1185
1186 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1187 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
1188 /*
1189 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1190 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM now overwrites
 * vcpu->run->hypercall.ret, ensure that it is zero to not break QEMU.
1193 */
1194 tdx->vcpu.run->hypercall.ret = 0;
1195 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1196 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1197 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1198 KVM_MAP_GPA_RANGE_ENCRYPTED :
1199 KVM_MAP_GPA_RANGE_DECRYPTED;
1200 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1201
1202 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1203 }
1204
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1206 {
1207 struct vcpu_tdx *tdx = to_tdx(vcpu);
1208 u64 gpa = tdx->vp_enter_args.r12;
1209 u64 size = tdx->vp_enter_args.r13;
1210 u64 ret;
1211
1212 /*
1213 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1214 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1215 * bit set. This is a base call so it should always be supported, but
1216 * KVM has no way to ensure that userspace implements the GHCI correctly.
1217 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1218 * to the guest.
1219 */
1220 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1221 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1222 goto error;
1223 }
1224
1225 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1226 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1227 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1228 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1229 ret = TDVMCALL_STATUS_INVALID_OPERAND;
1230 goto error;
1231 }
1232
1233 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1234 ret = TDVMCALL_STATUS_ALIGN_ERROR;
1235 goto error;
1236 }
1237
1238 tdx->map_gpa_end = gpa + size;
1239 tdx->map_gpa_next = gpa;
1240
1241 __tdx_map_gpa(tdx);
1242 return 0;
1243
1244 error:
1245 tdvmcall_set_return_code(vcpu, ret);
1246 tdx->vp_enter_args.r11 = gpa;
1247 return 1;
1248 }
1249
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1251 {
1252 struct vcpu_tdx *tdx = to_tdx(vcpu);
1253 u64 *regs = vcpu->run->system_event.data;
1254 u64 *module_regs = &tdx->vp_enter_args.r8;
1255 int index = VCPU_REGS_RAX;
1256
1257 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1258 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1259 vcpu->run->system_event.ndata = 16;
1260
1261 /* Dump 16 general-purpose registers to userspace in ascending order. */
1262 regs[index++] = tdx->vp_enter_ret;
1263 regs[index++] = tdx->vp_enter_args.rcx;
1264 regs[index++] = tdx->vp_enter_args.rdx;
1265 regs[index++] = tdx->vp_enter_args.rbx;
1266 regs[index++] = 0;
1267 regs[index++] = 0;
1268 regs[index++] = tdx->vp_enter_args.rsi;
1269 regs[index] = tdx->vp_enter_args.rdi;
1270 for (index = 0; index < 8; index++)
1271 regs[VCPU_REGS_R8 + index] = module_regs[index];
1272
1273 return 0;
1274 }
1275
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1277 {
1278 u32 eax, ebx, ecx, edx;
1279 struct vcpu_tdx *tdx = to_tdx(vcpu);
1280
1281 /* EAX and ECX for cpuid is stored in R12 and R13. */
1282 eax = tdx->vp_enter_args.r12;
1283 ecx = tdx->vp_enter_args.r13;
1284
1285 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1286
1287 tdx->vp_enter_args.r12 = eax;
1288 tdx->vp_enter_args.r13 = ebx;
1289 tdx->vp_enter_args.r14 = ecx;
1290 tdx->vp_enter_args.r15 = edx;
1291
1292 return 1;
1293 }
1294
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1296 {
1297 vcpu->arch.pio.count = 0;
1298 return 1;
1299 }
1300
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1302 {
1303 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1304 unsigned long val = 0;
1305 int ret;
1306
1307 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1308 vcpu->arch.pio.port, &val, 1);
1309
1310 WARN_ON_ONCE(!ret);
1311
1312 tdvmcall_set_return_val(vcpu, val);
1313
1314 return 1;
1315 }
1316
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1318 {
1319 struct vcpu_tdx *tdx = to_tdx(vcpu);
1320 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1321 unsigned long val = 0;
1322 unsigned int port;
1323 u64 size, write;
1324 int ret;
1325
1326 ++vcpu->stat.io_exits;
1327
1328 size = tdx->vp_enter_args.r12;
1329 write = tdx->vp_enter_args.r13;
1330 port = tdx->vp_enter_args.r14;
1331
1332 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1333 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1334 return 1;
1335 }
1336
1337 if (write) {
1338 val = tdx->vp_enter_args.r15;
1339 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1340 } else {
1341 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1342 }
1343
1344 if (!ret)
1345 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1346 tdx_complete_pio_in;
1347 else if (!write)
1348 tdvmcall_set_return_val(vcpu, val);
1349
1350 return ret;
1351 }
1352
static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1354 {
1355 unsigned long val = 0;
1356 gpa_t gpa;
1357 int size;
1358
1359 gpa = vcpu->mmio_fragments[0].gpa;
1360 size = vcpu->mmio_fragments[0].len;
1361
1362 memcpy(&val, vcpu->run->mmio.data, size);
1363 tdvmcall_set_return_val(vcpu, val);
1364 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1365 return 1;
1366 }
1367
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
                                 unsigned long val)
1370 {
1371 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1372 trace_kvm_fast_mmio(gpa);
1373 return 0;
1374 }
1375
1376 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1377 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1378 return -EOPNOTSUPP;
1379
1380 return 0;
1381 }
1382
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1384 {
1385 unsigned long val;
1386
1387 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1388 return -EOPNOTSUPP;
1389
1390 tdvmcall_set_return_val(vcpu, val);
1391 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1392 return 0;
1393 }
1394
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1396 {
1397 struct vcpu_tdx *tdx = to_tdx(vcpu);
1398 int size, write, r;
1399 unsigned long val;
1400 gpa_t gpa;
1401
1402 size = tdx->vp_enter_args.r12;
1403 write = tdx->vp_enter_args.r13;
1404 gpa = tdx->vp_enter_args.r14;
1405 val = write ? tdx->vp_enter_args.r15 : 0;
1406
1407 if (size != 1 && size != 2 && size != 4 && size != 8)
1408 goto error;
1409 if (write != 0 && write != 1)
1410 goto error;
1411
1412 /*
 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
 * do MMIO emulation for a private GPA.
1415 */
1416 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1417 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1418 goto error;
1419
1420 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1421
1422 if (write)
1423 r = tdx_mmio_write(vcpu, gpa, size, val);
1424 else
1425 r = tdx_mmio_read(vcpu, gpa, size);
1426 if (!r)
1427 /* Kernel completed device emulation. */
1428 return 1;
1429
1430 /* Request the device emulation to userspace device model. */
1431 vcpu->mmio_is_write = write;
1432 if (!write)
1433 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1434
1435 vcpu->run->mmio.phys_addr = gpa;
1436 vcpu->run->mmio.len = size;
1437 vcpu->run->mmio.is_write = write;
1438 vcpu->run->exit_reason = KVM_EXIT_MMIO;
1439
1440 if (write) {
1441 memcpy(vcpu->run->mmio.data, &val, size);
1442 } else {
1443 vcpu->mmio_fragments[0].gpa = gpa;
1444 vcpu->mmio_fragments[0].len = size;
1445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1446 }
1447 return 0;
1448
1449 error:
1450 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1451 return 1;
1452 }
1453
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1455 {
1456 struct vcpu_tdx *tdx = to_tdx(vcpu);
1457
1458 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1459
1460 /*
 * For now, KVM doesn't support any TDVMCALL beyond the GHCI base API
 * directly without help from userspace, so just set the values
 * returned from userspace.
1464 */
1465 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1466 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1467 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1468 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1469
1470 return 1;
1471 }
1472
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1474 {
1475 struct vcpu_tdx *tdx = to_tdx(vcpu);
1476
1477 switch (tdx->vp_enter_args.r12) {
1478 case 0:
1479 tdx->vp_enter_args.r11 = 0;
1480 tdx->vp_enter_args.r12 = 0;
1481 tdx->vp_enter_args.r13 = 0;
1482 tdx->vp_enter_args.r14 = 0;
1483 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1484 return 1;
1485 case 1:
1486 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1487 vcpu->run->exit_reason = KVM_EXIT_TDX;
1488 vcpu->run->tdx.flags = 0;
1489 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1490 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1491 vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1492 vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1493 vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1494 vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1495 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1496 return 0;
1497 default:
1498 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1499 return 1;
1500 }
1501 }
1502
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1504 {
1505 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1506 return 1;
1507 }
1508
static int tdx_get_quote(struct kvm_vcpu *vcpu)
1510 {
1511 struct vcpu_tdx *tdx = to_tdx(vcpu);
1512 u64 gpa = tdx->vp_enter_args.r12;
1513 u64 size = tdx->vp_enter_args.r13;
1514
1515 /* The gpa of buffer must have shared bit set. */
1516 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1517 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1518 return 1;
1519 }
1520
1521 vcpu->run->exit_reason = KVM_EXIT_TDX;
1522 vcpu->run->tdx.flags = 0;
1523 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1524 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1525 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1526 vcpu->run->tdx.get_quote.size = size;
1527
1528 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1529
1530 return 0;
1531 }
1532
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1534 {
1535 switch (tdvmcall_leaf(vcpu)) {
1536 case TDVMCALL_MAP_GPA:
1537 return tdx_map_gpa(vcpu);
1538 case TDVMCALL_REPORT_FATAL_ERROR:
1539 return tdx_report_fatal_error(vcpu);
1540 case TDVMCALL_GET_TD_VM_CALL_INFO:
1541 return tdx_get_td_vm_call_info(vcpu);
1542 case TDVMCALL_GET_QUOTE:
1543 return tdx_get_quote(vcpu);
1544 default:
1545 break;
1546 }
1547
1548 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1549 return 1;
1550 }
1551
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1553 {
1554 u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1555 TDX_SHARED_BIT_PWL_4;
1556
1557 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1558 return;
1559
1560 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1561 }
1562
static void tdx_unpin(struct kvm *kvm, struct page *page)
1564 {
1565 put_page(page);
1566 }
1567
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
                            enum pg_level level, struct page *page)
1570 {
1571 int tdx_level = pg_level_to_tdx_sept_level(level);
1572 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1573 gpa_t gpa = gfn_to_gpa(gfn);
1574 u64 entry, level_state;
1575 u64 err;
1576
1577 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1578 if (unlikely(tdx_operand_busy(err))) {
1579 tdx_unpin(kvm, page);
1580 return -EBUSY;
1581 }
1582
1583 if (KVM_BUG_ON(err, kvm)) {
1584 pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1585 tdx_unpin(kvm, page);
1586 return -EIO;
1587 }
1588
1589 return 0;
1590 }
1591
1592 /*
1593 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
 * callback tdx_gmem_post_populate() then maps pages into private memory
 * through the SEAMCALL TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
1596 * private EPT structures for the page to have been built before, which is
1597 * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1598 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1599 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1600 * are no half-initialized shared EPT pages.
1601 */
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
                                          enum pg_level level, kvm_pfn_t pfn)
1604 {
1605 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1606
1607 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1608 return -EINVAL;
1609
1610 /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1611 atomic64_inc(&kvm_tdx->nr_premapped);
1612 return 0;
1613 }
1614
int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
                              enum pg_level level, kvm_pfn_t pfn)
1617 {
1618 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1619 struct page *page = pfn_to_page(pfn);
1620
1621 /* TODO: handle large pages. */
1622 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1623 return -EINVAL;
1624
1625 /*
1626 * Because guest_memfd doesn't support page migration with
1627 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1628 * migration. Until guest_memfd supports page migration, prevent page
1629 * migration.
1630 * TODO: Once guest_memfd introduces callback on page migration,
1631 * implement it and remove get_page/put_page().
1632 */
1633 get_page(page);
1634
1635 /*
1636 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1637 * barrier in tdx_td_finalize().
1638 */
1639 smp_rmb();
1640 if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1641 return tdx_mem_page_aug(kvm, gfn, level, page);
1642
1643 return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1644 }
1645
static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
                                      enum pg_level level, struct page *page)
1648 {
1649 int tdx_level = pg_level_to_tdx_sept_level(level);
1650 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1651 gpa_t gpa = gfn_to_gpa(gfn);
1652 u64 err, entry, level_state;
1653
1654 /* TODO: handle large pages. */
1655 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1656 return -EINVAL;
1657
1658 if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1659 return -EINVAL;
1660
1661 /*
 * When zapping a private page, the write lock is held, so there is no
 * race with other vCPU SEPT operations.  Races with TDH.VP.ENTER (due to
 * 0-step mitigation) and guest TDCALLs are still possible.
1665 */
1666 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1667 &level_state);
1668
1669 if (unlikely(tdx_operand_busy(err))) {
1670 /*
1671 * The second retry is expected to succeed after kicking off all
1672 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
1673 */
1674 tdx_no_vcpus_enter_start(kvm);
1675 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1676 &level_state);
1677 tdx_no_vcpus_enter_stop(kvm);
1678 }
1679
1680 if (KVM_BUG_ON(err, kvm)) {
1681 pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1682 return -EIO;
1683 }
1684
1685 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1686
1687 if (KVM_BUG_ON(err, kvm)) {
1688 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1689 return -EIO;
1690 }
1691 tdx_clear_page(page);
1692 tdx_unpin(kvm, page);
1693 return 0;
1694 }
1695
int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
                              enum pg_level level, void *private_spt)
1698 {
1699 int tdx_level = pg_level_to_tdx_sept_level(level);
1700 gpa_t gpa = gfn_to_gpa(gfn);
1701 struct page *page = virt_to_page(private_spt);
1702 u64 err, entry, level_state;
1703
1704 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1705 &level_state);
1706 if (unlikely(tdx_operand_busy(err)))
1707 return -EBUSY;
1708
1709 if (KVM_BUG_ON(err, kvm)) {
1710 pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1711 return -EIO;
1712 }
1713
1714 return 0;
1715 }
1716
1717 /*
1718 * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
1719 * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
1720 * successfully.
1721 *
1722 * Since tdh_mem_sept_add() must have been invoked successfully before a
 * non-leaf entry is present in the mirrored page table, the SEPT-zap-related
1724 * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
1725 * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
1726 * SEPT.
1727 *
1728 * Further check if the returned entry from SEPT walking is with RWX permissions
1729 * to filter out anything unexpected.
1730 *
1731 * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1732 * level_state returned from a SEAMCALL error is the same as that passed into
1733 * the SEAMCALL.
1734 */
1735 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1736 u64 entry, int level)
1737 {
1738 if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1739 return false;
1740
1741 if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1742 return false;
1743
1744 if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1745 return false;
1746
1747 return true;
1748 }
1749
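/*
 * Returns 1 if the private SPTE was blocked and the caller should proceed
 * with TLB tracking and page removal, 0 if the page was only premapped for
 * KVM_TDX_INIT_MEM_REGION and has been handled here, or a negative errno.
 */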
1750 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1751 enum pg_level level, struct page *page)
1752 {
1753 int tdx_level = pg_level_to_tdx_sept_level(level);
1754 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1755 gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1756 u64 err, entry, level_state;
1757
1758 /* Large pages aren't supported yet. */
1759 WARN_ON_ONCE(level != PG_LEVEL_4K);
1760
1761 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1762
1763 if (unlikely(tdx_operand_busy(err))) {
1764 /* Once no vCPUs can enter the guest, the second retry is expected to succeed */
1765 tdx_no_vcpus_enter_start(kvm);
1766 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1767 tdx_no_vcpus_enter_stop(kvm);
1768 }
1769 if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1770 !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1771 atomic64_dec(&kvm_tdx->nr_premapped);
1772 tdx_unpin(kvm, page);
1773 return 0;
1774 }
1775
1776 if (KVM_BUG_ON(err, kvm)) {
1777 pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1778 return -EIO;
1779 }
1780 return 1;
1781 }
1782
1783 /*
1784 * Ensure that shared and private EPTs are flushed on all vCPUs.
1785 * tdh_mem_track() is the only caller that increases the TD epoch. An increase
1786 * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1787 * running in guest mode with the value "N - 1".
1788 *
1789 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1790 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1791 * has been increased to "N + 1".
1792 *
1793 * Kicking off all vCPUs after that further ensures that no vCPU can run in
1794 * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
1795 * (e.g. to increase the TD epoch to "N + 2").
1796 *
1797 * The TDX module will flush the EPT on the next TD enter and make vCPUs run
1798 * in guest mode with TD epoch value "N + 1".
1799 *
1800 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1801 * waiting for the empty IPI handler ack_kick().
1802 *
1803 * No action is required from the vCPUs being kicked off, since the kick is
1804 * guaranteed to occur after the TD epoch increment and before the next
1805 * tdh_mem_track().
1806 */
1807 static void tdx_track(struct kvm *kvm)
1808 {
1809 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1810 u64 err;
1811
1812 /* If the TD isn't finalized, no vCPU has run yet. */
1813 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1814 return;
1815
1816 lockdep_assert_held_write(&kvm->mmu_lock);
1817
1818 err = tdh_mem_track(&kvm_tdx->td);
1819 if (unlikely(tdx_operand_busy(err))) {
1820 /* Once no vCPUs can enter the guest, the second retry is expected to succeed */
1821 tdx_no_vcpus_enter_start(kvm);
1822 err = tdh_mem_track(&kvm_tdx->td);
1823 tdx_no_vcpus_enter_stop(kvm);
1824 }
1825
1826 if (KVM_BUG_ON(err, kvm))
1827 pr_tdx_error(TDH_MEM_TRACK, err);
1828
1829 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1830 }
1831
1832 int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1833 enum pg_level level, void *private_spt)
1834 {
1835 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1836
1837 /*
1838 * free_external_spt() is only called after the hkid has been freed,
1839 * when the TD is being torn down.
1840 * KVM doesn't (yet) zap page table pages in the mirror page table while
1841 * the TD is active, though guest pages mapped in the mirror page table
1842 * can be zapped while the TD is active, e.g. for shared <-> private
1843 * conversion and slot move/deletion.
1844 */
1845 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1846 return -EINVAL;
1847
1848 /*
1849 * The HKID assigned to this TD was already freed and cache was
1850 * already flushed. We don't have to flush again.
1851 */
1852 return tdx_reclaim_page(virt_to_page(private_spt));
1853 }
1854
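/*
 * Removing a private 4K page is a three step sequence: block the SPTE
 * (TDH.MEM.RANGE.BLOCK), perform TLB tracking (TDH.MEM.TRACK plus kicking
 * all vCPUs), then remove the page (TDH.MEM.PAGE.REMOVE).
 */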
1855 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1856 enum pg_level level, kvm_pfn_t pfn)
1857 {
1858 struct page *page = pfn_to_page(pfn);
1859 int ret;
1860
1861 /*
1862 * HKID is released after all private pages have been removed, and set
1863 * before any might be populated. Warn if zapping is attempted when
1864 * there can't be anything populated in the private EPT.
1865 */
1866 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1867 return -EINVAL;
1868
1869 ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1870 if (ret <= 0)
1871 return ret;
1872
1873 /*
1874 * TDX requires TLB tracking before dropping a private page. Do
1875 * it here, although it is also done later.
1876 */
1877 tdx_track(kvm);
1878
1879 return tdx_sept_drop_private_spte(kvm, gfn, level, page);
1880 }
1881
1882 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1883 int trig_mode, int vector)
1884 {
1885 struct kvm_vcpu *vcpu = apic->vcpu;
1886 struct vcpu_tdx *tdx = to_tdx(vcpu);
1887
1888 /* TDX supports only posted interrupt. No lapic emulation. */
1889 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1890
1891 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1892 }
1893
1894 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1895 {
1896 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1897 u64 eq = vmx_get_exit_qual(vcpu);
1898
1899 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1900 return false;
1901
1902 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1903 }
1904
1905 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1906 {
1907 unsigned long exit_qual;
1908 gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1909 bool local_retry = false;
1910 int ret;
1911
1912 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1913 if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1914 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1915 gpa, vcpu->vcpu_id);
1916 kvm_vm_dead(vcpu->kvm);
1917 return -EIO;
1918 }
1919 /*
1920 * Always treat SEPT violations as write faults. Ignore the
1921 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1922 * TD private pages are always RWX in the SEPT tables,
1923 * i.e. they're always mapped writable. Just as importantly,
1924 * treating SEPT violations as write faults is necessary to
1925 * avoid COW allocations, which will cause TDAUGPAGE failures
1926 * due to aliasing a single HPA to multiple GPAs.
1927 */
1928 exit_qual = EPT_VIOLATION_ACC_WRITE;
1929
1930 /* Only private GPA triggers zero-step mitigation */
1931 local_retry = true;
1932 } else {
1933 exit_qual = vmx_get_exit_qual(vcpu);
1934 /*
1935 * An EPT violation due to an instruction fetch should never be
1936 * triggered from shared memory in a TDX guest. If such an EPT
1937 * violation occurs, treat it as broken hardware.
1938 */
1939 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1940 return -EIO;
1941 }
1942
1943 trace_kvm_page_fault(vcpu, gpa, exit_qual);
1944
1945 /*
1946 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1947 * mapping in TDX.
1948 *
1949 * KVM may return RET_PF_RETRY for private GPA due to
1950 * - contentions when atomically updating SPTEs of the mirror page table
1951 * - in-progress GFN invalidation or memslot removal.
1952 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1953 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1954 * or certain TDCALLs.
1955 *
1956 * If TDH.VP.ENTER is invoked more times than the threshold set by the
1957 * TDX module before KVM resolves the private GPA mapping, the TDX
1958 * module will activate zero-step mitigation during TDH.VP.ENTER. This
1959 * process acquires an SEPT tree lock in the TDX module, leading to
1960 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1961 * operations on other vCPUs.
1962 *
1963 * Breaking out of local retries for kvm_vcpu_has_events() is for
1964 * interrupt injection. kvm_vcpu_has_events() should not see pending
1965 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1966 * blocked by TDs, false positives are inevitable, i.e., KVM may re-enter
1967 * the guest even if the IRQ/NMI can't be delivered.
1968 *
1969 * Note: even without breaking out of local retries, zero-step
1970 * mitigation may still occur due to
1971 * - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1972 * - a single RIP causing EPT violations for more GFNs than the
1973 * threshold count.
1974 * This is safe, as triggering zero-step mitigation only introduces
1975 * contentions to page installation SEAMCALLs on other vCPUs, which will
1976 * handle retries locally in their EPT violation handlers.
1977 */
1978 while (1) {
1979 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
1980
1981 if (ret != RET_PF_RETRY || !local_retry)
1982 break;
1983
1984 if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
1985 break;
1986
1987 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
1988 ret = -EIO;
1989 break;
1990 }
1991
1992 cond_resched();
1993 }
1994 return ret;
1995 }
1996
1997 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1998 {
1999 if (err) {
2000 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2001 return 1;
2002 }
2003
2004 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2005 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2006
2007 return 1;
2008 }
2009
2010
2011 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2012 {
2013 struct vcpu_tdx *tdx = to_tdx(vcpu);
2014 u64 vp_enter_ret = tdx->vp_enter_ret;
2015 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2016
2017 if (fastpath != EXIT_FASTPATH_NONE)
2018 return 1;
2019
2020 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2021 KVM_BUG_ON(1, vcpu->kvm);
2022 return -EIO;
2023 }
2024
2025 /*
2026 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2027 * TDX_SEAMCALL_VMFAILINVALID.
2028 */
2029 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2030 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2031 goto unhandled_exit;
2032 }
2033
2034 if (unlikely(tdx_failed_vmentry(vcpu))) {
2035 /*
2036 * If the guest state is protected, off-TD debug is not enabled,
2037 * so TDX_NON_RECOVERABLE must be set.
2038 */
2039 WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2040 !(vp_enter_ret & TDX_NON_RECOVERABLE));
2041 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2042 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2043 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2044 return 0;
2045 }
2046
2047 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2048 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2049 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2050 goto unhandled_exit;
2051 }
2052
2053 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2054 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2055
2056 switch (exit_reason.basic) {
2057 case EXIT_REASON_TRIPLE_FAULT:
2058 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2059 vcpu->mmio_needed = 0;
2060 return 0;
2061 case EXIT_REASON_EXCEPTION_NMI:
2062 return tdx_handle_exception_nmi(vcpu);
2063 case EXIT_REASON_EXTERNAL_INTERRUPT:
2064 ++vcpu->stat.irq_exits;
2065 return 1;
2066 case EXIT_REASON_CPUID:
2067 return tdx_emulate_cpuid(vcpu);
2068 case EXIT_REASON_HLT:
2069 return kvm_emulate_halt_noskip(vcpu);
2070 case EXIT_REASON_TDCALL:
2071 return handle_tdvmcall(vcpu);
2072 case EXIT_REASON_VMCALL:
2073 return tdx_emulate_vmcall(vcpu);
2074 case EXIT_REASON_IO_INSTRUCTION:
2075 return tdx_emulate_io(vcpu);
2076 case EXIT_REASON_MSR_READ:
2077 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2078 return kvm_emulate_rdmsr(vcpu);
2079 case EXIT_REASON_MSR_WRITE:
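/*
 * r12 carries the MSR index and r13 the 64-bit value; split the value
 * into EAX (low 32 bits) and EDX (high 32 bits), which is the form
 * kvm_emulate_wrmsr() consumes.
 */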
2080 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2081 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2082 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2083 return kvm_emulate_wrmsr(vcpu);
2084 case EXIT_REASON_EPT_MISCONFIG:
2085 return tdx_emulate_mmio(vcpu);
2086 case EXIT_REASON_EPT_VIOLATION:
2087 return tdx_handle_ept_violation(vcpu);
2088 case EXIT_REASON_OTHER_SMI:
2089 /*
2090 * Unlike VMX, an SMI in SEAM non-root mode (i.e. when a
2091 * TD guest vCPU is running) causes a VM exit to the TDX module,
2092 * then a SEAMRET to KVM. Once it exits to KVM, the SMI is
2093 * delivered and handled by the kernel handler right away.
2094 *
2095 * The Other SMI exit can also be caused by the SEAM non-root
2096 * machine check delivered via Machine Check System Management
2097 * Interrupt (MSMI), but it has already been handled by the
2098 * kernel machine check handler, i.e., the memory page has been
2099 * marked as poisoned and won't be returned to the free list
2100 * when the TDX guest is terminated (the TDX module marks the
2101 * guest as dead and prevents it from running further when a
2102 * machine check happens in SEAM non-root mode).
2103 *
2104 * - An MSMI will not reach here; it's handled as the
2105 * non-recoverable case above.
2106 * - If it's not an MSMI, there is nothing to do here.
2107 */
2108 return 1;
2109 default:
2110 break;
2111 }
2112
2113 unhandled_exit:
2114 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2115 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2116 vcpu->run->internal.ndata = 2;
2117 vcpu->run->internal.data[0] = vp_enter_ret;
2118 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2119 return 0;
2120 }
2121
2122 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2123 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2124 {
2125 struct vcpu_tdx *tdx = to_tdx(vcpu);
2126
2127 *reason = tdx->vt.exit_reason.full;
2128 if (*reason != -1u) {
2129 *info1 = vmx_get_exit_qual(vcpu);
2130 *info2 = tdx->ext_exit_qualification;
2131 *intr_info = vmx_get_intr_info(vcpu);
2132 } else {
2133 *info1 = 0;
2134 *info2 = 0;
2135 *intr_info = 0;
2136 }
2137
2138 *error_code = 0;
2139 }
2140
2141 bool tdx_has_emulated_msr(u32 index)
2142 {
2143 switch (index) {
2144 case MSR_IA32_UCODE_REV:
2145 case MSR_IA32_ARCH_CAPABILITIES:
2146 case MSR_IA32_POWER_CTL:
2147 case MSR_IA32_CR_PAT:
2148 case MSR_MTRRcap:
2149 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2150 case MSR_MTRRdefType:
2151 case MSR_IA32_TSC_DEADLINE:
2152 case MSR_IA32_MISC_ENABLE:
2153 case MSR_PLATFORM_INFO:
2154 case MSR_MISC_FEATURES_ENABLES:
2155 case MSR_IA32_APICBASE:
2156 case MSR_EFER:
2157 case MSR_IA32_FEAT_CTL:
2158 case MSR_IA32_MCG_CAP:
2159 case MSR_IA32_MCG_STATUS:
2160 case MSR_IA32_MCG_CTL:
2161 case MSR_IA32_MCG_EXT_CTL:
2162 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2163 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2164 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2165 case MSR_KVM_POLL_CONTROL:
2166 return true;
2167 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2168 /*
2169 * x2APIC registers that are virtualized by the CPU can't be
2170 * emulated because KVM doesn't have access to the virtual APIC page.
2171 */
2172 switch (index) {
2173 case X2APIC_MSR(APIC_TASKPRI):
2174 case X2APIC_MSR(APIC_PROCPRI):
2175 case X2APIC_MSR(APIC_EOI):
2176 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2177 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2178 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2179 return false;
2180 default:
2181 return true;
2182 }
2183 default:
2184 return false;
2185 }
2186 }
2187
2188 static bool tdx_is_read_only_msr(u32 index)
2189 {
2190 return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2191 index == MSR_IA32_FEAT_CTL;
2192 }
2193
2194 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2195 {
2196 switch (msr->index) {
2197 case MSR_IA32_FEAT_CTL:
2198 /*
2199 * MCE and MCA are advertised via CPUID. The guest kernel can
2200 * check whether LMCE is enabled.
2201 */
2202 msr->data = FEAT_CTL_LOCKED;
2203 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2204 msr->data |= FEAT_CTL_LMCE_ENABLED;
2205 return 0;
2206 case MSR_IA32_MCG_EXT_CTL:
2207 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2208 return 1;
2209 msr->data = vcpu->arch.mcg_ext_ctl;
2210 return 0;
2211 default:
2212 if (!tdx_has_emulated_msr(msr->index))
2213 return 1;
2214
2215 return kvm_get_msr_common(vcpu, msr);
2216 }
2217 }
2218
2219 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2220 {
2221 switch (msr->index) {
2222 case MSR_IA32_MCG_EXT_CTL:
2223 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2224 (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2225 return 1;
2226 vcpu->arch.mcg_ext_ctl = msr->data;
2227 return 0;
2228 default:
2229 if (tdx_is_read_only_msr(msr->index))
2230 return 1;
2231
2232 if (!tdx_has_emulated_msr(msr->index))
2233 return 1;
2234
2235 return kvm_set_msr_common(vcpu, msr);
2236 }
2237 }
2238
2239 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2240 {
2241 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2242 struct kvm_tdx_capabilities __user *user_caps;
2243 struct kvm_tdx_capabilities *caps = NULL;
2244 int ret = 0;
2245
2246 /* flags is reserved for future use */
2247 if (cmd->flags)
2248 return -EINVAL;
2249
2250 caps = kmalloc(sizeof(*caps) +
2251 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2252 GFP_KERNEL);
2253 if (!caps)
2254 return -ENOMEM;
2255
2256 user_caps = u64_to_user_ptr(cmd->data);
2257 if (copy_from_user(caps, user_caps, sizeof(*caps))) {
2258 ret = -EFAULT;
2259 goto out;
2260 }
2261
2262 if (caps->cpuid.nent < td_conf->num_cpuid_config) {
2263 ret = -E2BIG;
2264 goto out;
2265 }
2266
2267 ret = init_kvm_tdx_caps(td_conf, caps);
2268 if (ret)
2269 goto out;
2270
2271 if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2272 ret = -EFAULT;
2273 goto out;
2274 }
2275
2276 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2277 caps->cpuid.nent *
2278 sizeof(caps->cpuid.entries[0])))
2279 ret = -EFAULT;
2280
2281 out:
2282 /* kfree() accepts NULL. */
2283 kfree(caps);
2284 return ret;
2285 }
2286
2287 /*
2288 * KVM reports the guest physical address width in CPUID.0x80000008.EAX[23:16],
2289 * which is similar to TDX's GPAW. Use this field as the interface for userspace
2290 * to configure the GPAW and EPT level for TDs.
2291 *
2292 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2293 * 5, while value 48 means GPAW-48 and EPT level 4. GPAW-48 is always
2294 * supported; value 52 is only supported when the platform supports 5-level
2295 * EPT.
2296 */
2297 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2298 struct td_params *td_params)
2299 {
2300 const struct kvm_cpuid_entry2 *entry;
2301 int guest_pa;
2302
2303 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2304 if (!entry)
2305 return -EINVAL;
2306
2307 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2308
2309 if (guest_pa != 48 && guest_pa != 52)
2310 return -EINVAL;
2311
2312 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2313 return -EINVAL;
2314
2315 td_params->eptp_controls = VMX_EPTP_MT_WB;
2316 if (guest_pa == 52) {
2317 td_params->eptp_controls |= VMX_EPTP_PWL_5;
2318 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2319 } else {
2320 td_params->eptp_controls |= VMX_EPTP_PWL_4;
2321 }
2322
2323 return 0;
2324 }
2325
2326 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2327 struct td_params *td_params)
2328 {
2329 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2330 const struct kvm_cpuid_entry2 *entry;
2331 struct tdx_cpuid_value *value;
2332 int i, copy_cnt = 0;
2333
2334 /*
2335 * td_params.cpuid_values: The number and the order of cpuid_values must
2336 * match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2337 * It's assumed that td_params was zeroed.
2338 */
2339 for (i = 0; i < td_conf->num_cpuid_config; i++) {
2340 struct kvm_cpuid_entry2 tmp;
2341
2342 td_init_cpuid_entry2(&tmp, i);
2343
2344 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2345 tmp.function, tmp.index);
2346 if (!entry)
2347 continue;
2348
2349 if (tdx_unsupported_cpuid(entry))
2350 return -EINVAL;
2351
2352 copy_cnt++;
2353
2354 value = &td_params->cpuid_values[i];
2355 value->eax = entry->eax;
2356 value->ebx = entry->ebx;
2357 value->ecx = entry->ecx;
2358 value->edx = entry->edx;
2359
2360 /*
2361 * TDX module does not accept nonzero bits 16..23 for the
2362 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2363 */
2364 if (tmp.function == 0x80000008)
2365 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2366 }
2367
2368 /*
2369 * Rely on the TDX module to reject invalid configuration, but it can't
2370 * check leaves that don't have a proper slot in td_params->cpuid_values
2371 * to land in. So fail if there were entries that didn't get copied to
2372 * td_params.
2373 */
2374 if (copy_cnt != cpuid->nent)
2375 return -EINVAL;
2376
2377 return 0;
2378 }
2379
2380 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2381 struct kvm_tdx_init_vm *init_vm)
2382 {
2383 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2384 struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2385 int ret;
2386
2387 if (kvm->created_vcpus)
2388 return -EBUSY;
2389
2390 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2391 return -EINVAL;
2392
2393 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2394 return -EINVAL;
2395
2396 td_params->max_vcpus = kvm->max_vcpus;
2397 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2398 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2399
2400 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2401 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2402
2403 ret = setup_tdparams_eptp_controls(cpuid, td_params);
2404 if (ret)
2405 return ret;
2406
2407 ret = setup_tdparams_cpuids(cpuid, td_params);
2408 if (ret)
2409 return ret;
2410
2411 #define MEMCPY_SAME_SIZE(dst, src) \
2412 do { \
2413 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2414 memcpy((dst), (src), sizeof(dst)); \
2415 } while (0)
2416
2417 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2418 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2419 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2420
2421 return 0;
2422 }
2423
2424 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2425 u64 *seamcall_err)
2426 {
2427 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2428 cpumask_var_t packages;
2429 struct page **tdcs_pages = NULL;
2430 struct page *tdr_page;
2431 int ret, i;
2432 u64 err, rcx;
2433
2434 *seamcall_err = 0;
2435 ret = tdx_guest_keyid_alloc();
2436 if (ret < 0)
2437 return ret;
2438 kvm_tdx->hkid = ret;
2439 kvm_tdx->misc_cg = get_current_misc_cg();
2440 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2441 if (ret)
2442 goto free_hkid;
2443
2444 ret = -ENOMEM;
2445
2446 atomic_inc(&nr_configured_hkid);
2447
2448 tdr_page = alloc_page(GFP_KERNEL);
2449 if (!tdr_page)
2450 goto free_hkid;
2451
2452 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2453 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2454 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2455 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2456 GFP_KERNEL | __GFP_ZERO);
2457 if (!tdcs_pages)
2458 goto free_tdr;
2459
2460 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2461 tdcs_pages[i] = alloc_page(GFP_KERNEL);
2462 if (!tdcs_pages[i])
2463 goto free_tdcs;
2464 }
2465
2466 if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2467 goto free_tdcs;
2468
2469 cpus_read_lock();
2470
2471 /*
2472 * At least one CPU of each package must be online in order to
2473 * program all packages for the host key ID. Check it.
2474 */
2475 for_each_present_cpu(i)
2476 cpumask_set_cpu(topology_physical_package_id(i), packages);
2477 for_each_online_cpu(i)
2478 cpumask_clear_cpu(topology_physical_package_id(i), packages);
2479 if (!cpumask_empty(packages)) {
2480 ret = -EIO;
2481 /*
2482 * Because it's hard for a human operator to figure out the
2483 * reason, print a warning.
2484 */
2485 #define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2486 pr_warn_ratelimited(MSG_ALLPKG);
2487 goto free_packages;
2488 }
2489
2490 /*
2491 * TDH.MNG.CREATE tries to grab the global TDX module lock and
2492 * fails with TDX_OPERAND_BUSY when it can't. Take the global
2493 * lock to prevent that failure.
2494 */
2495 mutex_lock(&tdx_lock);
2496 kvm_tdx->td.tdr_page = tdr_page;
2497 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2498 mutex_unlock(&tdx_lock);
2499
2500 if (err == TDX_RND_NO_ENTROPY) {
2501 ret = -EAGAIN;
2502 goto free_packages;
2503 }
2504
2505 if (WARN_ON_ONCE(err)) {
2506 pr_tdx_error(TDH_MNG_CREATE, err);
2507 ret = -EIO;
2508 goto free_packages;
2509 }
2510
2511 for_each_online_cpu(i) {
2512 int pkg = topology_physical_package_id(i);
2513
2514 if (cpumask_test_and_set_cpu(pkg, packages))
2515 continue;
2516
2517 /*
2518 * Program the memory controller in the package with an
2519 * encryption key associated with the TDX private host key ID
2520 * assigned to this TDR. Concurrent operations on the same memory
2521 * controller result in TDX_OPERAND_BUSY. No locking is needed
2522 * beyond the cpus_read_lock() above, as it serializes against
2523 * hotplug and the first online CPU of the package is always
2524 * used. We never have two CPUs in the same socket trying to
2525 * program the key.
2526 */
2527 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2528 kvm_tdx, true);
2529 if (ret)
2530 break;
2531 }
2532 cpus_read_unlock();
2533 free_cpumask_var(packages);
2534 if (ret) {
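/*
 * Key configuration failed before any TDCS page was added via
 * TDH.MNG.ADDCX; reset 'i' so the teardown path frees them all.
 */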
2535 i = 0;
2536 goto teardown;
2537 }
2538
2539 kvm_tdx->td.tdcs_pages = tdcs_pages;
2540 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2541 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2542 if (err == TDX_RND_NO_ENTROPY) {
2543 /* Here it's hard to allow userspace to retry. */
2544 ret = -EAGAIN;
2545 goto teardown;
2546 }
2547 if (WARN_ON_ONCE(err)) {
2548 pr_tdx_error(TDH_MNG_ADDCX, err);
2549 ret = -EIO;
2550 goto teardown;
2551 }
2552 }
2553
2554 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2555 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2556 /*
2557 * Because the user supplies the operands, don't warn.
2558 * Return a hint to the user because it's sometimes hard for the
2559 * user to figure out which operand is invalid. The SEAMCALL status
2560 * code includes which operand caused the invalid-operand error.
2561 */
2562 *seamcall_err = err;
2563 ret = -EINVAL;
2564 goto teardown;
2565 } else if (WARN_ON_ONCE(err)) {
2566 pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2567 ret = -EIO;
2568 goto teardown;
2569 }
2570
2571 return 0;
2572
2573 /*
2574 * The sequence for freeing resources from a partially initialized TD
2575 * varies based on where in the initialization flow failure occurred.
2576 * Simply use the full teardown and destroy path, which naturally plays nice
2577 * with partial initialization.
2578 */
2579 teardown:
2580 /* Only free pages not yet added, so start at 'i' */
2581 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2582 if (tdcs_pages[i]) {
2583 __free_page(tdcs_pages[i]);
2584 tdcs_pages[i] = NULL;
2585 }
2586 }
2587 if (!kvm_tdx->td.tdcs_pages)
2588 kfree(tdcs_pages);
2589
2590 tdx_mmu_release_hkid(kvm);
2591 tdx_reclaim_td_control_pages(kvm);
2592
2593 return ret;
2594
2595 free_packages:
2596 cpus_read_unlock();
2597 free_cpumask_var(packages);
2598
2599 free_tdcs:
2600 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2601 if (tdcs_pages[i])
2602 __free_page(tdcs_pages[i]);
2603 }
2604 kfree(tdcs_pages);
2605 kvm_tdx->td.tdcs_pages = NULL;
2606
2607 free_tdr:
2608 if (tdr_page)
2609 __free_page(tdr_page);
2610 kvm_tdx->td.tdr_page = NULL;
2611
2612 free_hkid:
2613 tdx_hkid_free(kvm_tdx);
2614
2615 return ret;
2616 }
2617
2618 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2619 u64 *data)
2620 {
2621 u64 err;
2622
2623 err = tdh_mng_rd(&tdx->td, field_id, data);
2624
2625 return err;
2626 }
2627
2628 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2629 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2630
2631 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2632 bool sub_leaf_set, int *entry_index,
2633 struct kvm_cpuid_entry2 *out)
2634 {
2635 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2636 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2637 u64 ebx_eax, edx_ecx;
2638 u64 err = 0;
2639
2640 if (sub_leaf > 0b1111111)
2641 return -EINVAL;
2642
2643 if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2644 return -EINVAL;
2645
2646 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2647 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2648 return -EINVAL;
2649
2650 /*
2651 * bit 23:17, RESERVED: reserved, must be 0;
2652 * bit 16, LEAF_31: leaf number bit 31;
2653 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2654 * implicitly 0;
2655 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2656 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2657 * the SUBLEAF_6_0 is all-1.
2658 * sub-leaf bits 31:7 are implicitly 0;
2659 * bit 0, ELEMENT_I: Element index within field;
2660 */
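/*
 * For example, leaf 0x80000008 with no sub-leaf sets LEAF_31 (bit 16),
 * LEAF_6_0 = 0x08 (bits 15:9), and SUBLEAF_NA plus SUBLEAF_6_0 all ones
 * (bits 8:1), i.e. the 0x1fe OR'ed in below.
 */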
2661 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2662 field_id |= (leaf & 0x7f) << 9;
2663 if (sub_leaf_set)
2664 field_id |= (sub_leaf & 0x7f) << 1;
2665 else
2666 field_id |= 0x1fe;
2667
2668 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2669 if (err) //TODO check for specific errors
2670 goto err_out;
2671
2672 out->eax = (u32) ebx_eax;
2673 out->ebx = (u32) (ebx_eax >> 32);
2674
2675 field_id++;
2676 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2677 /*
2678 * It's weird that reading edx_ecx fails while reading ebx_eax
2679 * succeeded.
2680 */
2681 if (WARN_ON_ONCE(err))
2682 goto err_out;
2683
2684 out->ecx = (u32) edx_ecx;
2685 out->edx = (u32) (edx_ecx >> 32);
2686
2687 out->function = leaf;
2688 out->index = sub_leaf;
2689 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2690
2691 /*
2692 * Work around missing support on old TDX modules, fetch
2693 * guest maxpa from gfn_direct_bits.
2694 */
2695 if (leaf == 0x80000008) {
2696 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2697 unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2698
2699 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2700 }
2701
2702 (*entry_index)++;
2703
2704 return 0;
2705
2706 err_out:
2707 out->eax = 0;
2708 out->ebx = 0;
2709 out->ecx = 0;
2710 out->edx = 0;
2711
2712 return -EIO;
2713 }
2714
2715 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2716 {
2717 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2718 struct kvm_tdx_init_vm *init_vm;
2719 struct td_params *td_params = NULL;
2720 int ret;
2721
2722 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2723 BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2724
2725 if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2726 return -EINVAL;
2727
2728 if (cmd->flags)
2729 return -EINVAL;
2730
2731 init_vm = kmalloc(sizeof(*init_vm) +
2732 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2733 GFP_KERNEL);
2734 if (!init_vm)
2735 return -ENOMEM;
2736
2737 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2738 ret = -EFAULT;
2739 goto out;
2740 }
2741
2742 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2743 ret = -E2BIG;
2744 goto out;
2745 }
2746
2747 if (copy_from_user(init_vm->cpuid.entries,
2748 u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2749 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2750 ret = -EFAULT;
2751 goto out;
2752 }
2753
2754 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2755 ret = -EINVAL;
2756 goto out;
2757 }
2758
2759 if (init_vm->cpuid.padding) {
2760 ret = -EINVAL;
2761 goto out;
2762 }
2763
2764 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2765 if (!td_params) {
2766 ret = -ENOMEM;
2767 goto out;
2768 }
2769
2770 ret = setup_tdparams(kvm, td_params, init_vm);
2771 if (ret)
2772 goto out;
2773
2774 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2775 if (ret)
2776 goto out;
2777
2778 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2779 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2780 kvm_tdx->attributes = td_params->attributes;
2781 kvm_tdx->xfam = td_params->xfam;
2782
2783 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2784 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2785 else
2786 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2787
2788 kvm_tdx->state = TD_STATE_INITIALIZED;
2789 out:
2790 /* kfree() accepts NULL. */
2791 kfree(init_vm);
2792 kfree(td_params);
2793
2794 return ret;
2795 }
2796
2797 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2798 {
2799 /*
2800 * flush_tlb_current() is invoked the first time the vCPU runs or when
2801 * the root of the shared EPT is invalidated.
2802 * KVM only needs to flush the shared EPT because the TDX module handles
2803 * TLB invalidation for the private EPT in tdh_vp_enter().
2804 *
2805 * A single context invalidation for shared EPT can be performed here.
2806 * However, this single context invalidation requires the private EPTP
2807 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
2808 * private EPTP as its ASID for TLB invalidation.
2809 *
2810 * To avoid reading back private EPTP, perform a global invalidation for
2811 * shared EPT instead to keep this function simple.
2812 */
2813 ept_sync_global();
2814 }
2815
2816 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2817 {
2818 /*
2819 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2820 * ensure that private EPT will be flushed on the next TD enter. No need
2821 * to call tdx_track() here again even when this callback is a result of
2822 * zapping private EPT.
2823 *
2824 * Due to the lack of the context to determine which EPT has been
2825 * affected by zapping, invoke invept() directly here for both shared
2826 * EPT and private EPT for simplicity, though it's not necessary for
2827 * private EPT.
2828 */
2829 ept_sync_global();
2830 }
2831
2832 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2833 {
2834 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2835
2836 guard(mutex)(&kvm->slots_lock);
2837
2838 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2839 return -EINVAL;
2840 /*
2841 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2842 * TDH.MEM.PAGE.ADD().
2843 */
2844 if (atomic64_read(&kvm_tdx->nr_premapped))
2845 return -EINVAL;
2846
2847 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2848 if (tdx_operand_busy(cmd->hw_error))
2849 return -EBUSY;
2850 if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2851 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2852 return -EIO;
2853 }
2854
2855 kvm_tdx->state = TD_STATE_RUNNABLE;
2856 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2857 smp_wmb();
2858 kvm->arch.pre_fault_allowed = true;
2859 return 0;
2860 }
2861
2862 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2863 {
2864 struct kvm_tdx_cmd tdx_cmd;
2865 int r;
2866
2867 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2868 return -EFAULT;
2869
2870 /*
2871 * Userspace should never set hw_error. It is used by the kernel to
2872 * report a hardware-defined error.
2873 */
2874 if (tdx_cmd.hw_error)
2875 return -EINVAL;
2876
2877 mutex_lock(&kvm->lock);
2878
2879 switch (tdx_cmd.id) {
2880 case KVM_TDX_CAPABILITIES:
2881 r = tdx_get_capabilities(&tdx_cmd);
2882 break;
2883 case KVM_TDX_INIT_VM:
2884 r = tdx_td_init(kvm, &tdx_cmd);
2885 break;
2886 case KVM_TDX_FINALIZE_VM:
2887 r = tdx_td_finalize(kvm, &tdx_cmd);
2888 break;
2889 default:
2890 r = -EINVAL;
2891 goto out;
2892 }
2893
2894 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2895 r = -EFAULT;
2896
2897 out:
2898 mutex_unlock(&kvm->lock);
2899 return r;
2900 }
2901
2902 /* The VMM can pass 64 bits of auxiliary data to the vCPU via RCX for the guest BIOS. */
2903 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2904 {
2905 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2906 struct vcpu_tdx *tdx = to_tdx(vcpu);
2907 struct page *page;
2908 int ret, i;
2909 u64 err;
2910
2911 page = alloc_page(GFP_KERNEL);
2912 if (!page)
2913 return -ENOMEM;
2914 tdx->vp.tdvpr_page = page;
2915
2916 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2917 GFP_KERNEL);
2918 if (!tdx->vp.tdcx_pages) {
2919 ret = -ENOMEM;
2920 goto free_tdvpr;
2921 }
2922
2923 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2924 page = alloc_page(GFP_KERNEL);
2925 if (!page) {
2926 ret = -ENOMEM;
2927 goto free_tdcx;
2928 }
2929 tdx->vp.tdcx_pages[i] = page;
2930 }
2931
2932 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2933 if (KVM_BUG_ON(err, vcpu->kvm)) {
2934 ret = -EIO;
2935 pr_tdx_error(TDH_VP_CREATE, err);
2936 goto free_tdcx;
2937 }
2938
2939 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2940 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2941 if (KVM_BUG_ON(err, vcpu->kvm)) {
2942 pr_tdx_error(TDH_VP_ADDCX, err);
2943 /*
2944 * Pages already added are reclaimed by the vcpu_free
2945 * method, but the rest are freed here.
2946 */
2947 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2948 __free_page(tdx->vp.tdcx_pages[i]);
2949 tdx->vp.tdcx_pages[i] = NULL;
2950 }
2951 return -EIO;
2952 }
2953 }
2954
2955 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2956 if (KVM_BUG_ON(err, vcpu->kvm)) {
2957 pr_tdx_error(TDH_VP_INIT, err);
2958 return -EIO;
2959 }
2960
2961 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2962
2963 return 0;
2964
2965 free_tdcx:
2966 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2967 if (tdx->vp.tdcx_pages[i])
2968 __free_page(tdx->vp.tdcx_pages[i]);
2969 tdx->vp.tdcx_pages[i] = NULL;
2970 }
2971 kfree(tdx->vp.tdcx_pages);
2972 tdx->vp.tdcx_pages = NULL;
2973
2974 free_tdvpr:
2975 if (tdx->vp.tdvpr_page)
2976 __free_page(tdx->vp.tdvpr_page);
2977 tdx->vp.tdvpr_page = NULL;
2978
2979 return ret;
2980 }
2981
2982 /* Sometimes reads multiple sub-leaves. Returns non-zero on failure; @entry_index is advanced by the number of entries written. */
2983 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2984 struct kvm_cpuid_entry2 *output_e)
2985 {
2986 int sub_leaf = 0;
2987 int ret;
2988
2989 /* First try without a subleaf */
2990 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2991
2992 /* If success, or invalid leaf, just give up */
2993 if (ret != -EIO)
2994 return ret;
2995
2996 /*
2997 * If the try without a subleaf failed, try reading subleafs until
2998 * failure. The TDX module only supports 6 bits of subleaf index.
2999 */
3000 while (1) {
3001 /* Keep reading subleafs until there is a failure. */
3002 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3003 return !sub_leaf;
3004
3005 sub_leaf++;
3006 output_e++;
3007 }
3008
3009 return 0;
3010 }
3011
3012 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3013 {
3014 struct kvm_cpuid2 __user *output, *td_cpuid;
3015 int r = 0, i = 0, leaf;
3016 u32 level;
3017
3018 output = u64_to_user_ptr(cmd->data);
3019 td_cpuid = kzalloc(sizeof(*td_cpuid) +
3020 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3021 GFP_KERNEL);
3022 if (!td_cpuid)
3023 return -ENOMEM;
3024
3025 if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3026 r = -EFAULT;
3027 goto out;
3028 }
3029
3030 /* Read max CPUID for normal range */
3031 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3032 r = -EIO;
3033 goto out;
3034 }
3035 level = td_cpuid->entries[0].eax;
3036
3037 for (leaf = 1; leaf <= level; leaf++)
3038 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3039
3040 /* Read max CPUID for extended range */
3041 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3042 r = -EIO;
3043 goto out;
3044 }
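/* entries[i - 1] is the 0x80000000 entry just read; its EAX gives the max extended leaf. */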
3045 level = td_cpuid->entries[i - 1].eax;
3046
3047 for (leaf = 0x80000001; leaf <= level; leaf++)
3048 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3049
3050 if (td_cpuid->nent < i)
3051 r = -E2BIG;
3052 td_cpuid->nent = i;
3053
3054 if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3055 r = -EFAULT;
3056 goto out;
3057 }
3058
3059 if (r == -E2BIG)
3060 goto out;
3061
3062 if (copy_to_user(output->entries, td_cpuid->entries,
3063 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3064 r = -EFAULT;
3065
3066 out:
3067 kfree(td_cpuid);
3068
3069 return r;
3070 }
3071
3072 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3073 {
3074 u64 apic_base;
3075 struct vcpu_tdx *tdx = to_tdx(vcpu);
3076 int ret;
3077
3078 if (cmd->flags)
3079 return -EINVAL;
3080
3081 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3082 return -EINVAL;
3083
3084 /*
3085 * TDX requires x2APIC; userspace is responsible for configuring guest
3086 * CPUID accordingly.
3087 */
3088 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3089 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3090 if (kvm_apic_set_base(vcpu, apic_base, true))
3091 return -EINVAL;
3092
3093 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3094 if (ret)
3095 return ret;
3096
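/*
 * Wire up posted interrupts for this vCPU: program the notification
 * vector and the PI descriptor address into the TD VMCS, and enable the
 * posted-interrupt pin-based execution control.
 */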
3097 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3098 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3099 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3100
3101 tdx->state = VCPU_TD_STATE_INITIALIZED;
3102
3103 return 0;
3104 }
3105
3106 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3107 {
3108 /*
3109 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3110 * INIT events.
3111 *
3112 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3113 * userspace needs to define the vCPU model before KVM can initialize
3114 * vCPU state, e.g. to enable x2APIC.
3115 */
3116 WARN_ON_ONCE(init_event);
3117 }
3118
3119 struct tdx_gmem_post_populate_arg {
3120 struct kvm_vcpu *vcpu;
3121 __u32 flags;
3122 };
3123
3124 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3125 void __user *src, int order, void *_arg)
3126 {
3127 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3128 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3129 struct tdx_gmem_post_populate_arg *arg = _arg;
3130 struct kvm_vcpu *vcpu = arg->vcpu;
3131 gpa_t gpa = gfn_to_gpa(gfn);
3132 u8 level = PG_LEVEL_4K;
3133 struct page *src_page;
3134 int ret, i;
3135 u64 err, entry, level_state;
3136
3137 /*
3138 * Get the source page if it has been faulted in. Return failure if the
3139 * source page has been swapped out or unmapped in primary memory.
3140 */
3141 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3142 if (ret < 0)
3143 return ret;
3144 if (ret != 1)
3145 return -ENOMEM;
3146
3147 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3148 if (ret < 0)
3149 goto out;
3150
3151 /*
3152 * The private mem cannot be zapped after kvm_tdp_map_page()
3153 * because all paths are covered by slots_lock and the
3154 * filemap invalidate lock. Check that they are indeed enough.
3155 */
3156 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3157 scoped_guard(read_lock, &kvm->mmu_lock) {
3158 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3159 ret = -EIO;
3160 goto out;
3161 }
3162 }
3163 }
3164
3165 ret = 0;
3166 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3167 src_page, &entry, &level_state);
3168 if (err) {
3169 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3170 goto out;
3171 }
3172
3173 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3174 atomic64_dec(&kvm_tdx->nr_premapped);
3175
3176 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
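/*
 * Extend the TD measurement over the page contents,
 * TDX_EXTENDMR_CHUNKSIZE bytes at a time, so that the added page is
 * reflected in the TD's initial measurement.
 */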
3177 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3178 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3179 &level_state);
3180 if (err) {
3181 ret = -EIO;
3182 break;
3183 }
3184 }
3185 }
3186
3187 out:
3188 put_page(src_page);
3189 return ret;
3190 }
3191
3192 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3193 {
3194 struct vcpu_tdx *tdx = to_tdx(vcpu);
3195 struct kvm *kvm = vcpu->kvm;
3196 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3197 struct kvm_tdx_init_mem_region region;
3198 struct tdx_gmem_post_populate_arg arg;
3199 long gmem_ret;
3200 int ret;
3201
3202 if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3203 return -EINVAL;
3204
3205 guard(mutex)(&kvm->slots_lock);
3206
3207 /* Once TD is finalized, the initial guest memory is fixed. */
3208 if (kvm_tdx->state == TD_STATE_RUNNABLE)
3209 return -EINVAL;
3210
3211 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3212 return -EINVAL;
3213
3214 if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3215 return -EFAULT;
3216
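/*
 * Sanity check the region: page aligned, non-empty, no GPA wrap-around,
 * and entirely within the private GPA range.
 */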
3217 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3218 !region.nr_pages ||
3219 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3220 !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3221 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3222 return -EINVAL;
3223
3224 kvm_mmu_reload(vcpu);
3225 ret = 0;
3226 while (region.nr_pages) {
3227 if (signal_pending(current)) {
3228 ret = -EINTR;
3229 break;
3230 }
3231
3232 arg = (struct tdx_gmem_post_populate_arg) {
3233 .vcpu = vcpu,
3234 .flags = cmd->flags,
3235 };
3236 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3237 u64_to_user_ptr(region.source_addr),
3238 1, tdx_gmem_post_populate, &arg);
3239 if (gmem_ret < 0) {
3240 ret = gmem_ret;
3241 break;
3242 }
3243
3244 if (gmem_ret != 1) {
3245 ret = -EIO;
3246 break;
3247 }
3248
3249 region.source_addr += PAGE_SIZE;
3250 region.gpa += PAGE_SIZE;
3251 region.nr_pages--;
3252
3253 cond_resched();
3254 }
3255
3256 if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3257 ret = -EFAULT;
3258 return ret;
3259 }
3260
3261 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3262 {
3263 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3264 struct kvm_tdx_cmd cmd;
3265 int ret;
3266
3267 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3268 return -EINVAL;
3269
3270 if (copy_from_user(&cmd, argp, sizeof(cmd)))
3271 return -EFAULT;
3272
3273 if (cmd.hw_error)
3274 return -EINVAL;
3275
3276 switch (cmd.id) {
3277 case KVM_TDX_INIT_VCPU:
3278 ret = tdx_vcpu_init(vcpu, &cmd);
3279 break;
3280 case KVM_TDX_INIT_MEM_REGION:
3281 ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3282 break;
3283 case KVM_TDX_GET_CPUID:
3284 ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3285 break;
3286 default:
3287 ret = -EINVAL;
3288 break;
3289 }
3290
3291 return ret;
3292 }
3293
3294 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
3295 {
3296 return PG_LEVEL_4K;
3297 }
3298
3299 static int tdx_online_cpu(unsigned int cpu)
3300 {
3301 unsigned long flags;
3302 int r;
3303
3304 /* Sanity check CPU is already in post-VMXON */
3305 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3306
3307 local_irq_save(flags);
3308 r = tdx_cpu_enable();
3309 local_irq_restore(flags);
3310
3311 return r;
3312 }
3313
3314 static int tdx_offline_cpu(unsigned int cpu)
3315 {
3316 int i;
3317
3318 /* No TD is running. Allow any cpu to be offline. */
3319 if (!atomic_read(&nr_configured_hkid))
3320 return 0;
3321
3322 /*
3323 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD), KVM
3324 * needs to call TDH.PHYMEM.PAGE.WBINVD on all packages to program all
3325 * memory controllers with pconfig. If there are active TDX HKIDs, refuse
3326 * to offline the last online CPU of a package.
3327 */
3328 for_each_online_cpu(i) {
3329 /*
3330 * Found another online CPU on the same package.
3331 * Allow this one to go offline.
3332 */
3333 if (i != cpu && topology_physical_package_id(i) ==
3334 topology_physical_package_id(cpu))
3335 return 0;
3336 }
3337
3338 /*
3339 * This is the last online CPU of this package. Don't offline it.
3340 *
3341 * Because it's hard for a human operator to understand the
3342 * reason, print a warning.
3343 */
3344 #define MSG_ALLPKG_ONLINE \
3345 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3346 pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3347 return -EBUSY;
3348 }
3349
3350 static void __do_tdx_cleanup(void)
3351 {
3352 /*
3353 * Once the TDX module is initialized, it cannot be disabled and
3354 * re-initialized without a runtime update (which isn't
3355 * supported by the kernel). Only the cpuhp state needs to be
3356 * removed here. The TDX host core code tracks the TDX status and
3357 * can handle the 'multiple enabling' scenario.
3358 */
3359 WARN_ON_ONCE(!tdx_cpuhp_state);
3360 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3361 tdx_cpuhp_state = 0;
3362 }
3363
3364 static void __tdx_cleanup(void)
3365 {
3366 cpus_read_lock();
3367 __do_tdx_cleanup();
3368 cpus_read_unlock();
3369 }
3370
3371 static int __init __do_tdx_bringup(void)
3372 {
3373 int r;
3374
3375 /*
3376 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3377 * online CPUs before calling tdx_enable(), and on any CPU that is
3378 * going online, to make sure it is ready to run TDX guests.
3379 */
3380 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3381 "kvm/cpu/tdx:online",
3382 tdx_online_cpu, tdx_offline_cpu);
3383 if (r < 0)
3384 return r;
3385
3386 tdx_cpuhp_state = r;
3387
3388 r = tdx_enable();
3389 if (r)
3390 __do_tdx_cleanup();
3391
3392 return r;
3393 }
3394
3395 static int __init __tdx_bringup(void)
3396 {
3397 const struct tdx_sys_info_td_conf *td_conf;
3398 int r, i;
3399
3400 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3401 /*
3402 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3403 * before returning to user space.
3404 *
3405 * this_cpu_ptr(user_return_msrs)->registered isn't checked
3406 * because the registration is done at vcpu runtime by
3407 * tdx_user_return_msr_update_cache().
3408 */
3409 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3410 if (tdx_uret_msrs[i].slot == -1) {
3411 /* If any MSR isn't supported, it is a KVM bug */
3412 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3413 tdx_uret_msrs[i].msr);
3414 return -EIO;
3415 }
3416 }
3417
3418 /*
3419 * Enabling TDX requires enabling hardware virtualization first,
3420 * as making SEAMCALLs requires CPU being in post-VMXON state.
3421 */
3422 r = kvm_enable_virtualization();
3423 if (r)
3424 return r;
3425
3426 cpus_read_lock();
3427 r = __do_tdx_bringup();
3428 cpus_read_unlock();
3429
3430 if (r)
3431 goto tdx_bringup_err;
3432
3433 /* Get TDX global information for later use */
3434 tdx_sysinfo = tdx_get_sysinfo();
3435 if (WARN_ON_ONCE(!tdx_sysinfo)) {
3436 r = -EINVAL;
3437 goto get_sysinfo_err;
3438 }
3439
3440 /* Check TDX module and KVM capabilities */
3441 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3442 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3443 goto get_sysinfo_err;
3444
3445 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3446 goto get_sysinfo_err;
3447
3448 /*
3449 * TDX has its own limit of maximum vCPUs it can support for all
3450 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
3451 * query a TDX guest's maximum vCPUs by checking the KVM_CAP_MAX_VCPUS
3452 * extension on a per-VM basis.
3453 *
3454 * TDX module reports such limit via the MAX_VCPU_PER_TD global
3455 * metadata. Different modules may report different values.
3456 * Some old module may also not support this metadata (in which
3457 * case this limit is U16_MAX).
3458 *
3459 * In practice, the reported value reflects the maximum logical
3460 * CPUs that ALL the platforms that the module supports can
3461 * possibly have.
3462 *
3463 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3464 * result in an unpredictable ABI. KVM instead always advertises
3465 * the number of logical CPUs the platform has as the maximum
3466 * vCPUs for TDX guests.
3467 *
3468 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3469 * smaller than the number of logical CPUs, otherwise KVM will
3470 * report an unsupported value to userspace.
3471 *
3472 * Note, a platform with TDX enabled in the BIOS cannot support
3473 * physical CPU hotplug, and TDX requires the BIOS has marked
3474 * all logical CPUs in MADT table as enabled. Just use
3475 * num_present_cpus() for the number of logical CPUs.
3476 */
3477 td_conf = &tdx_sysinfo->td_conf;
3478 if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3479 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3480 td_conf->max_vcpus_per_td, num_present_cpus());
3481 r = -EINVAL;
3482 goto get_sysinfo_err;
3483 }
3484
3485 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
3486 r = -EINVAL;
3487 goto get_sysinfo_err;
3488 }
3489
3490 /*
3491 * Leave hardware virtualization enabled after TDX is enabled
3492 * successfully. TDX CPU hotplug depends on this.
3493 */
3494 return 0;
3495
3496 get_sysinfo_err:
3497 __tdx_cleanup();
3498 tdx_bringup_err:
3499 kvm_disable_virtualization();
3500 return r;
3501 }
3502
3503 void tdx_cleanup(void)
3504 {
3505 if (enable_tdx) {
3506 misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3507 __tdx_cleanup();
3508 kvm_disable_virtualization();
3509 }
3510 }
3511
3512 int __init tdx_bringup(void)
3513 {
3514 int r, i;
3515
3516 /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3517 for_each_possible_cpu(i)
3518 INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3519
3520 if (!enable_tdx)
3521 return 0;
3522
3523 if (!enable_ept) {
3524 pr_err("EPT is required for TDX\n");
3525 goto success_disable_tdx;
3526 }
3527
3528 if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3529 pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3530 goto success_disable_tdx;
3531 }
3532
3533 if (!enable_apicv) {
3534 pr_err("APICv is required for TDX\n");
3535 goto success_disable_tdx;
3536 }
3537
3538 if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3539 pr_err("tdx: OSXSAVE is required for TDX\n");
3540 goto success_disable_tdx;
3541 }
3542
3543 if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
3544 pr_err("tdx: MOVDIR64B is required for TDX\n");
3545 goto success_disable_tdx;
3546 }
3547
3548 if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
3549 pr_err("Self-snoop is required for TDX\n");
3550 goto success_disable_tdx;
3551 }
3552
3553 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3554 pr_err("tdx: no TDX private KeyIDs available\n");
3555 goto success_disable_tdx;
3556 }
3557
3558 if (!enable_virt_at_load) {
3559 pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
3560 goto success_disable_tdx;
3561 }
3562
3563 /*
3564 * Ideally KVM should probe whether the TDX module has been loaded
3565 * first and then try to bring it up. But TDX needs to use SEAMCALL
3566 * to probe whether the module is loaded (there is no CPUID or MSR
3567 * for that), and making a SEAMCALL requires enabling virtualization
3568 * first, just like the remaining steps of bringing up the TDX module.
3569 *
3570 * So, for simplicity do everything in __tdx_bringup(); the first
3571 * SEAMCALL will return -ENODEV when the module is not loaded. The
3572 * only complication is having to make sure that initialization
3573 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3574 * cases.
3575 */
3576 r = __tdx_bringup();
3577 if (r) {
3578 /*
3579 * Only disable TDX; don't fail loading the KVM module if
3580 * the TDX module could not be loaded. No need to print a
3581 * message saying "module is not loaded" because it was
3582 * printed when the first SEAMCALL failed.
3583 */
3584 if (r == -ENODEV)
3585 goto success_disable_tdx;
3586
3587 enable_tdx = 0;
3588 }
3589
3590 return r;
3591
3592 success_disable_tdx:
3593 enable_tdx = 0;
3594 return 0;
3595 }
3596