1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21
22 #pragma GCC poison to_vmx
23
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27 #define pr_tdx_error(__fn, __err) \
28 pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29
30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
31 pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
32
33 #define pr_tdx_error_1(__fn, __err, __rcx) \
34 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35
36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
37 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38
39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
40 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41
42 bool enable_tdx __ro_after_init;
43 module_param_named(tdx, enable_tdx, bool, 0444);
44
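/*
 * Shared-GPA bit expressed as a GFN mask: per the defines below, the TDX
 * shared bit is GPA bit 51 with 5-level paging and GPA bit 47 with 4-level
 * paging, converted to a GFN via gpa_to_gfn().
 */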
45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
47
48 static enum cpuhp_state tdx_cpuhp_state;
49
50 static const struct tdx_sys_info *tdx_sysinfo;
51
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53 {
54 KVM_BUG_ON(1, tdx->vcpu.kvm);
55 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56 }
57
void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
		      u64 val, u64 err)
60 {
61 KVM_BUG_ON(1, tdx->vcpu.kvm);
62 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63 }
64
65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68 {
69 return container_of(kvm, struct kvm_tdx, kvm);
70 }
71
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73 {
74 return container_of(vcpu, struct vcpu_tdx, vcpu);
75 }
76
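/*
 * Editor's note on the fixed0/fixed1 handling below: a bit set in *_fixed1
 * must be 1 and a bit cleared in *_fixed0 must be 0, so KVM bails out
 * (returns 0) if it can't satisfy fixed1 and then masks its supported set
 * with fixed0.
 */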
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78 {
79 u64 val = KVM_SUPPORTED_TD_ATTRS;
80
81 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82 return 0;
83
84 val &= td_conf->attributes_fixed0;
85
86 return val;
87 }
88
static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90 {
91 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92
93 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94 return 0;
95
96 val &= td_conf->xfam_fixed0;
97
98 return val;
99 }
100
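/*
 * CPUID.0x80000008:EAX[23:16] holds the guest physical address bits; the two
 * helpers below extract and overwrite that bit-field.
 */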
static int tdx_get_guest_phys_addr_bits(const u32 eax)
102 {
103 return (eax & GENMASK(23, 16)) >> 16;
104 }
105
static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107 {
108 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109 }
110
111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112
static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114 {
115 return entry->function == 7 && entry->index == 0 &&
116 (entry->ebx & TDX_FEATURE_TSX);
117 }
118
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120 {
121 entry->ebx &= ~TDX_FEATURE_TSX;
122 }
123
static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125 {
126 return entry->function == 7 && entry->index == 0 &&
127 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128 }
129
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131 {
132 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133 }
134
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136 {
137 if (has_tsx(entry))
138 clear_tsx(entry);
139
140 if (has_waitpkg(entry))
141 clear_waitpkg(entry);
142 }
143
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145 {
146 return has_tsx(entry) || has_waitpkg(entry);
147 }
148
149 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
150
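/*
 * Editor's note: each cpuid_config_leaves[] entry packs the leaf in bits 31:0
 * and the subleaf in bits 63:32, while cpuid_config_values[][0] packs EAX/EBX
 * and cpuid_config_values[][1] packs ECX/EDX, which is where the shifts below
 * come from.
 */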
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152 {
153 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154
155 entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156 entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157 entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161
162 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163 entry->index = 0;
164
165 /*
166 * The TDX module doesn't allow configuring the guest phys addr bits
167 * (EAX[23:16]). However, KVM uses it as an interface to the userspace
168 * to configure the GPAW. Report these bits as configurable.
169 */
170 if (entry->function == 0x80000008)
171 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172
173 tdx_clear_unsupported_cpuid(entry);
174 }
175
176 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
177
static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
			     struct kvm_tdx_capabilities *caps)
180 {
181 int i;
182
183 caps->supported_attrs = tdx_get_supported_attrs(td_conf);
184 if (!caps->supported_attrs)
185 return -EIO;
186
187 caps->supported_xfam = tdx_get_supported_xfam(td_conf);
188 if (!caps->supported_xfam)
189 return -EIO;
190
191 caps->cpuid.nent = td_conf->num_cpuid_config;
192
193 caps->user_tdvmcallinfo_1_r11 =
194 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
195
196 for (i = 0; i < td_conf->num_cpuid_config; i++)
197 td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
198
199 return 0;
200 }
201
202 /*
203 * Some SEAMCALLs acquire the TDX module globally, and can fail with
204 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
205 */
206 static DEFINE_MUTEX(tdx_lock);
207
208 static atomic_t nr_configured_hkid;
209
static bool tdx_operand_busy(u64 err)
211 {
212 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
213 }
214
215
216 /*
217 * A per-CPU list of TD vCPUs associated with a given CPU.
218 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
219 * list.
220 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
221 * the old CPU during the IPI callback running on the old CPU, and then added
222 * to the per-CPU list of the new CPU.
223 * - When a TD is tearing down, all vCPUs are disassociated from their current
224 * running CPUs and removed from the per-CPU list during the IPI callback
225 * running on those CPUs.
226 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
227 * associated TD vCPUs and remove them from the per-CPU list.
228 */
229 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
230
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
232 {
233 return to_tdx(vcpu)->vp_enter_args.r10;
234 }
235
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
237 {
238 return to_tdx(vcpu)->vp_enter_args.r11;
239 }
240
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
						     long val)
243 {
244 to_tdx(vcpu)->vp_enter_args.r10 = val;
245 }
246
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
						    unsigned long val)
249 {
250 to_tdx(vcpu)->vp_enter_args.r11 = val;
251 }
252
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
254 {
255 tdx_guest_keyid_free(kvm_tdx->hkid);
256 kvm_tdx->hkid = -1;
257 atomic_dec(&nr_configured_hkid);
258 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
259 put_misc_cg(kvm_tdx->misc_cg);
260 kvm_tdx->misc_cg = NULL;
261 }
262
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
264 {
265 return kvm_tdx->hkid > 0;
266 }
267
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
269 {
270 lockdep_assert_irqs_disabled();
271
272 list_del(&to_tdx(vcpu)->cpu_list);
273
274 /*
275 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
276 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
277 * to its list before it's deleted from this CPU's list.
278 */
279 smp_wmb();
280
281 vcpu->cpu = -1;
282 }
283
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
285 {
286 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
287
288 lockdep_assert_held_write(&kvm->mmu_lock);
289
290 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
291
292 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
293 }
294
static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
296 {
297 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
298
299 lockdep_assert_held_write(&kvm->mmu_lock);
300
301 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
302 }
303
304 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
306 {
307 u64 err, rcx, rdx, r8;
308
309 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
310
311 /*
312 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
313 * before the HKID is released and control pages have also been
314 * released at this point, so there is no possibility of contention.
315 */
316 if (WARN_ON_ONCE(err)) {
317 pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
318 return -EIO;
319 }
320 return 0;
321 }
322
static int tdx_reclaim_page(struct page *page)
324 {
325 int r;
326
327 r = __tdx_reclaim_page(page);
328 if (!r)
329 tdx_quirk_reset_page(page);
330 return r;
331 }
332
333
334 /*
335 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
336 * private KeyID. Assume the cache associated with the TDX private KeyID has
337 * been flushed.
338 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
340 {
341 /*
342 * Leak the page if the kernel failed to reclaim the page.
343 * The kernel cannot use it safely anymore.
344 */
345 if (tdx_reclaim_page(ctrl_page))
346 return;
347
348 __free_page(ctrl_page);
349 }
350
351 struct tdx_flush_vp_arg {
352 struct kvm_vcpu *vcpu;
353 u64 err;
354 };
355
static void tdx_flush_vp(void *_arg)
357 {
358 struct tdx_flush_vp_arg *arg = _arg;
359 struct kvm_vcpu *vcpu = arg->vcpu;
360 u64 err;
361
362 arg->err = 0;
363 lockdep_assert_irqs_disabled();
364
365 /* Task migration can race with CPU offlining. */
366 if (unlikely(vcpu->cpu != raw_smp_processor_id()))
367 return;
368
369 /*
370 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
371 * list tracking still needs to be updated so that it's correct if/when
372 * the vCPU does get initialized.
373 */
374 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
		/*
		 * No need to retry.  The TDX resources needed for TDH.VP.FLUSH
		 * are: TDVPR as exclusive, TDR as shared, and TDCS as shared.
		 * This flush function is only called when destroying a vCPU/TD
		 * or during vCPU migration, and no other thread uses TDVPR in
		 * those cases.
		 */
381 err = tdh_vp_flush(&to_tdx(vcpu)->vp);
382 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
383 /*
384 * This function is called in IPI context. Do not use
385 * printk to avoid console semaphore.
386 * The caller prints out the error message, instead.
387 */
388 if (err)
389 arg->err = err;
390 }
391 }
392
393 tdx_disassociate_vp(vcpu);
394 }
395
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
397 {
398 struct tdx_flush_vp_arg arg = {
399 .vcpu = vcpu,
400 };
401 int cpu = vcpu->cpu;
402
403 if (unlikely(cpu == -1))
404 return;
405
406 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
407 if (KVM_BUG_ON(arg.err, vcpu->kvm))
408 pr_tdx_error(TDH_VP_FLUSH, arg.err);
409 }
410
void tdx_disable_virtualization_cpu(void)
412 {
413 int cpu = raw_smp_processor_id();
414 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
415 struct tdx_flush_vp_arg arg;
416 struct vcpu_tdx *tdx, *tmp;
417 unsigned long flags;
418
419 local_irq_save(flags);
420 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
421 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
422 arg.vcpu = &tdx->vcpu;
423 tdx_flush_vp(&arg);
424 }
425 local_irq_restore(flags);
426
427 /*
428 * Flush cache now if kexec is possible: this is necessary to avoid
429 * having dirty private memory cachelines when the new kernel boots,
430 * but WBINVD is a relatively expensive operation and doing it during
431 * kexec can exacerbate races in native_stop_other_cpus(). Do it
432 * now, since this is a safe moment and there is going to be no more
433 * TDX activity on this CPU from this point on.
434 */
435 tdx_cpu_flush_cache_for_kexec();
436 }
437
438 #define TDX_SEAMCALL_RETRIES 10000
439
static void smp_func_do_phymem_cache_wb(void *unused)
441 {
442 u64 err = 0;
443 bool resume;
444 int i;
445
446 /*
447 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
448 * KeyID on the package or core. The TDX module may not finish the
449 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The
450 * kernel should retry it until it returns success w/o rescheduling.
451 */
452 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
453 resume = !!err;
454 err = tdh_phymem_cache_wb(resume);
455 switch (err) {
456 case TDX_INTERRUPTED_RESUMABLE:
457 continue;
458 case TDX_NO_HKID_READY_TO_WBCACHE:
459 err = TDX_SUCCESS; /* Already done by other thread */
460 fallthrough;
461 default:
462 goto out;
463 }
464 }
465
466 out:
467 if (WARN_ON_ONCE(err))
468 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
469 }
470
void tdx_mmu_release_hkid(struct kvm *kvm)
472 {
473 bool packages_allocated, targets_allocated;
474 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
475 cpumask_var_t packages, targets;
476 struct kvm_vcpu *vcpu;
477 unsigned long j;
478 int i;
479 u64 err;
480
481 if (!is_hkid_assigned(kvm_tdx))
482 return;
483
484 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
485 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
486 cpus_read_lock();
487
488 kvm_for_each_vcpu(j, vcpu, kvm)
489 tdx_flush_vp_on_cpu(vcpu);
490
	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
	 * Multiple TDX guests can be destroyed simultaneously.  Take the
	 * mutex so that concurrent teardowns don't trip over that error.
	 */
497 mutex_lock(&tdx_lock);
498
499 /*
500 * Releasing HKID is in vm_destroy().
501 * After the above flushing vps, there should be no more vCPU
502 * associations, as all vCPU fds have been released at this stage.
503 */
504 err = tdh_mng_vpflushdone(&kvm_tdx->td);
505 if (err == TDX_FLUSHVP_NOT_DONE)
506 goto out;
507 if (KVM_BUG_ON(err, kvm)) {
508 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
509 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
510 kvm_tdx->hkid);
511 goto out;
512 }
513
514 for_each_online_cpu(i) {
515 if (packages_allocated &&
516 cpumask_test_and_set_cpu(topology_physical_package_id(i),
517 packages))
518 continue;
519 if (targets_allocated)
520 cpumask_set_cpu(i, targets);
521 }
522 if (targets_allocated)
523 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
524 else
525 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
526 /*
527 * In the case of error in smp_func_do_phymem_cache_wb(), the following
528 * tdh_mng_key_freeid() will fail.
529 */
530 err = tdh_mng_key_freeid(&kvm_tdx->td);
531 if (KVM_BUG_ON(err, kvm)) {
532 pr_tdx_error(TDH_MNG_KEY_FREEID, err);
533 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
534 kvm_tdx->hkid);
535 } else {
536 tdx_hkid_free(kvm_tdx);
537 }
538
539 out:
540 mutex_unlock(&tdx_lock);
541 cpus_read_unlock();
542 free_cpumask_var(targets);
543 free_cpumask_var(packages);
544 }
545
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
547 {
548 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
549 u64 err;
550 int i;
551
	/*
	 * tdx_mmu_release_hkid() failed to reclaim the HKID, i.e. something
	 * went badly wrong with the TDX module.  Give up freeing the TD pages.
	 * That function already warned, so don't warn again here.
	 */
557 if (is_hkid_assigned(kvm_tdx))
558 return;
559
560 if (kvm_tdx->td.tdcs_pages) {
561 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
562 if (!kvm_tdx->td.tdcs_pages[i])
563 continue;
564
565 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
566 }
567 kfree(kvm_tdx->td.tdcs_pages);
568 kvm_tdx->td.tdcs_pages = NULL;
569 }
570
571 if (!kvm_tdx->td.tdr_page)
572 return;
573
574 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
575 return;
576
577 /*
578 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
579 * KeyID. TDX module may access TDR while operating on TD (Especially
580 * when it is reclaiming TDCS).
581 */
582 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
583 if (KVM_BUG_ON(err, kvm)) {
584 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
585 return;
586 }
587 tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
588
589 __free_page(kvm_tdx->td.tdr_page);
590 kvm_tdx->td.tdr_page = NULL;
591 }
592
void tdx_vm_destroy(struct kvm *kvm)
594 {
595 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
596
597 tdx_reclaim_td_control_pages(kvm);
598
599 kvm_tdx->state = TD_STATE_UNINITIALIZED;
600 }
601
static int tdx_do_tdh_mng_key_config(void *param)
603 {
604 struct kvm_tdx *kvm_tdx = param;
605 u64 err;
606
607 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
608 err = tdh_mng_key_config(&kvm_tdx->td);
609
610 if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
611 pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
612 return -EIO;
613 }
614
615 return 0;
616 }
617
int tdx_vm_init(struct kvm *kvm)
619 {
620 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
621
622 kvm->arch.has_protected_state = true;
623 /*
624 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
625 * i.e. all EOIs are accelerated and never trigger exits.
626 */
627 kvm->arch.has_protected_eoi = true;
628 kvm->arch.has_private_mem = true;
629 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
630
	/*
	 * Because the guest TD is protected, the VMM can't parse instructions
	 * executed in the TD.  Instead, the guest uses the MMIO hypercall: for
	 * unmodified device drivers, a #VE needs to be injected for MMIO and
	 * the #VE handler in the TD converts the MMIO instruction into an MMIO
	 * hypercall.
	 *
	 * The SPTE value for MMIO needs to be set up so that #VE is injected
	 * into the TD instead of triggering EPT MISCONFIG:
	 * - RWX=0 so that an EPT violation is triggered.
	 * - the suppress-#VE bit is cleared to inject #VE.
	 */
642 kvm_mmu_set_mmio_spte_value(kvm, 0);
643
644 /*
645 * TDX has its own limit of maximum vCPUs it can support for all
646 * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
647 * such limit via the MAX_VCPU_PER_TD global metadata. In
648 * practice, it reflects the number of logical CPUs that ALL
649 * platforms that the TDX module supports can possibly have.
650 *
651 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
652 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
653 * userspace would result in an unpredictable ABI.
654 */
655 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
656
657 kvm_tdx->state = TD_STATE_UNINITIALIZED;
658
659 return 0;
660 }
661
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
663 {
664 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
665 struct vcpu_tdx *tdx = to_tdx(vcpu);
666
667 if (kvm_tdx->state != TD_STATE_INITIALIZED)
668 return -EIO;
669
670 /*
671 * TDX module mandates APICv, which requires an in-kernel local APIC.
672 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
673 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
674 */
675 if (!irqchip_split(vcpu->kvm))
676 return -EINVAL;
677
678 fpstate_set_confidential(&vcpu->arch.guest_fpu);
679 vcpu->arch.apic->guest_apic_protected = true;
680 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
681
682 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
683
684 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
685 vcpu->arch.cr0_guest_owned_bits = -1ul;
686 vcpu->arch.cr4_guest_owned_bits = -1ul;
687
688 /* KVM can't change TSC offset/multiplier as TDX module manages them. */
689 vcpu->arch.guest_tsc_protected = true;
690 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
691 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
692 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
693 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
694
695 vcpu->arch.guest_state_protected =
696 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
697
698 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
699 vcpu->arch.xfd_no_write_intercept = true;
700
701 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
702 __pi_set_sn(&tdx->vt.pi_desc);
703
704 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
705
706 return 0;
707 }
708
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
710 {
711 struct vcpu_tdx *tdx = to_tdx(vcpu);
712
713 vmx_vcpu_pi_load(vcpu, cpu);
714 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
715 return;
716
717 tdx_flush_vp_on_cpu(vcpu);
718
719 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
720 local_irq_disable();
721 /*
722 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
723 * vcpu->cpu is read before tdx->cpu_list.
724 */
725 smp_rmb();
726
727 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
728 local_irq_enable();
729 }
730
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
732 {
733 /*
734 * KVM can't get the interrupt status of TDX guest and it assumes
735 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
736 * which passes the interrupt blocked flag.
737 */
738 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
739 !to_tdx(vcpu)->vp_enter_args.r12;
740 }
741
static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
743 {
744 u64 vcpu_state_details;
745
746 if (pi_has_pending_interrupt(vcpu))
747 return true;
748
749 /*
750 * Only check RVI pending for HALTED case with IRQ enabled.
751 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
752 * interrupt was pending before TD exit, then it _must_ be blocked,
753 * otherwise the interrupt would have been serviced at the instruction
754 * boundary.
755 */
756 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
757 to_tdx(vcpu)->vp_enter_args.r12)
758 return false;
759
760 vcpu_state_details =
761 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
762
763 return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
764 }
765
766 /*
767 * Compared to vmx_prepare_switch_to_guest(), there is not much to do
768 * as SEAMCALL/SEAMRET calls take care of most of save and restore.
769 */
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
771 {
772 struct vcpu_vt *vt = to_vt(vcpu);
773
774 if (vt->guest_state_loaded)
775 return;
776
777 if (likely(is_64bit_mm(current->mm)))
778 vt->msr_host_kernel_gs_base = current->thread.gsbase;
779 else
780 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
781
782 vt->guest_state_loaded = true;
783 }
784
785 struct tdx_uret_msr {
786 u32 msr;
787 unsigned int slot;
788 u64 defval;
789 };
790
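/*
 * User-return MSRs that are assumed to be clobbered to the default values
 * below by TDH.VP.ENTER; tdx_prepare_switch_to_host() refreshes KVM's
 * user-return cache accordingly, but only after the vCPU has actually
 * entered the TD.
 */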
791 static struct tdx_uret_msr tdx_uret_msrs[] = {
792 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
793 {.msr = MSR_STAR,},
794 {.msr = MSR_LSTAR,},
795 {.msr = MSR_TSC_AUX,},
796 };
797
static void tdx_user_return_msr_update_cache(void)
799 {
800 int i;
801
802 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
803 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
804 tdx_uret_msrs[i].defval);
805 }
806
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
808 {
809 struct vcpu_vt *vt = to_vt(vcpu);
810 struct vcpu_tdx *tdx = to_tdx(vcpu);
811
812 if (!vt->guest_state_loaded)
813 return;
814
815 ++vcpu->stat.host_state_reload;
816 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
817
818 if (tdx->guest_entered) {
819 tdx_user_return_msr_update_cache();
820 tdx->guest_entered = false;
821 }
822
823 vt->guest_state_loaded = false;
824 }
825
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
827 {
828 vmx_vcpu_pi_put(vcpu);
829 tdx_prepare_switch_to_host(vcpu);
830 }
831
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
833 {
834 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
835 struct vcpu_tdx *tdx = to_tdx(vcpu);
836 int i;
837
838 /*
839 * It is not possible to reclaim pages while hkid is assigned. It might
840 * be assigned if:
841 * 1. the TD VM is being destroyed but freeing hkid failed, in which
842 * case the pages are leaked
843 * 2. TD VCPU creation failed and this on the error path, in which case
844 * there is nothing to do anyway
845 */
846 if (is_hkid_assigned(kvm_tdx))
847 return;
848
849 if (tdx->vp.tdcx_pages) {
850 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
851 if (tdx->vp.tdcx_pages[i])
852 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
853 }
854 kfree(tdx->vp.tdcx_pages);
855 tdx->vp.tdcx_pages = NULL;
856 }
857 if (tdx->vp.tdvpr_page) {
858 tdx_reclaim_control_page(tdx->vp.tdvpr_page);
859 tdx->vp.tdvpr_page = 0;
860 tdx->vp.tdvpr_pa = 0;
861 }
862
863 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
864 }
865
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
867 {
868 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
869 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
870 return -EINVAL;
871
872 return 1;
873 }
874
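/*
 * Map a TDVMCALL leaf to a synthetic VMX exit reason: leaves that mirror a
 * VMX exit are forwarded as-is, the MMIO-request leaf (EPT_VIOLATION) is
 * reported as EPT_MISCONFIG, and everything else is handled as a plain
 * TDCALL exit.
 */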
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
876 {
877 switch (tdvmcall_leaf(vcpu)) {
878 case EXIT_REASON_CPUID:
879 case EXIT_REASON_HLT:
880 case EXIT_REASON_IO_INSTRUCTION:
881 case EXIT_REASON_MSR_READ:
882 case EXIT_REASON_MSR_WRITE:
883 return tdvmcall_leaf(vcpu);
884 case EXIT_REASON_EPT_VIOLATION:
885 return EXIT_REASON_EPT_MISCONFIG;
886 default:
887 break;
888 }
889
890 return EXIT_REASON_TDCALL;
891 }
892
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
894 {
895 struct vcpu_tdx *tdx = to_tdx(vcpu);
896 u32 exit_reason;
897
898 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
899 case TDX_SUCCESS:
900 case TDX_NON_RECOVERABLE_VCPU:
901 case TDX_NON_RECOVERABLE_TD:
902 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
903 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
904 break;
905 default:
906 return -1u;
907 }
908
909 exit_reason = tdx->vp_enter_ret;
910
911 switch (exit_reason) {
912 case EXIT_REASON_TDCALL:
913 if (tdvmcall_exit_type(vcpu))
914 return EXIT_REASON_VMCALL;
915
916 return tdcall_to_vmx_exit_reason(vcpu);
917 case EXIT_REASON_EPT_MISCONFIG:
918 /*
919 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
920 * non-instrumentable code with interrupts disabled.
921 */
922 return -1u;
923 default:
924 break;
925 }
926
927 return exit_reason;
928 }
929
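/*
 * TDH.VP.ENTER returns its status in vp_enter_ret and reflects the exit
 * details back through the vp_enter_args registers: RCX holds the exit
 * qualification, RDX the extended exit qualification, R8 the exit GPA and
 * R9 the interrupt info, as consumed below.
 */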
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
931 {
932 struct vcpu_tdx *tdx = to_tdx(vcpu);
933 struct vcpu_vt *vt = to_vt(vcpu);
934
935 guest_state_enter_irqoff();
936
937 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
938
939 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
940
941 vt->exit_qualification = tdx->vp_enter_args.rcx;
942 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
943 tdx->exit_gpa = tdx->vp_enter_args.r8;
944 vt->exit_intr_info = tdx->vp_enter_args.r9;
945
946 vmx_handle_nmi(vcpu);
947
948 guest_state_exit_irqoff();
949 }
950
static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
952 {
953 return vmx_get_exit_reason(vcpu).failed_vmentry &&
954 vmx_get_exit_reason(vcpu).full != -1u;
955 }
956
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
958 {
959 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
960
961 /*
962 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
963 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
964 *
965 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
966 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
967 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
968 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
969 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
970 * requester may be blocked endlessly.
971 */
972 if (unlikely(tdx_operand_busy(vp_enter_ret)))
973 return EXIT_FASTPATH_EXIT_HANDLED;
974
975 return EXIT_FASTPATH_NONE;
976 }
977
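/*
 * Registers that remain marked available after a TD entry: the exit-info
 * pseudo-registers plus every GPR except RSP, matching what TDH.VP.ENTER
 * hands back to KVM.
 */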
978 #define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
979 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
980 BIT_ULL(VCPU_REGS_RAX) | \
981 BIT_ULL(VCPU_REGS_RBX) | \
982 BIT_ULL(VCPU_REGS_RCX) | \
983 BIT_ULL(VCPU_REGS_RDX) | \
984 BIT_ULL(VCPU_REGS_RBP) | \
985 BIT_ULL(VCPU_REGS_RSI) | \
986 BIT_ULL(VCPU_REGS_RDI) | \
987 BIT_ULL(VCPU_REGS_R8) | \
988 BIT_ULL(VCPU_REGS_R9) | \
989 BIT_ULL(VCPU_REGS_R10) | \
990 BIT_ULL(VCPU_REGS_R11) | \
991 BIT_ULL(VCPU_REGS_R12) | \
992 BIT_ULL(VCPU_REGS_R13) | \
993 BIT_ULL(VCPU_REGS_R14) | \
994 BIT_ULL(VCPU_REGS_R15))
995
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
997 {
998 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
999
1000 /*
1001 * All TDX hosts support PKRU; but even if they didn't,
1002 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1003 * skipped.
1004 */
1005 if (vcpu->arch.host_pkru != 0)
1006 wrpkru(vcpu->arch.host_pkru);
1007
1008 if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1009 xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1010
	/*
	 * Likewise, even if a TDX host didn't support XSS, both arms of the
	 * comparison would be 0 and the wrmsrl would be skipped.
	 */
1015 if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1016 wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1017 }
1018
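/*
 * DEBUGCTL bits assumed here to survive TDH.VP.ENTER unmodified; the host
 * value is rewritten after TD-exit only if it contains bits outside this set.
 */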
1019 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1020 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1021 DEBUGCTLMSR_FREEZE_IN_SMM)
1022
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
1024 {
1025 struct vcpu_tdx *tdx = to_tdx(vcpu);
1026 struct vcpu_vt *vt = to_vt(vcpu);
1027
	/*
	 * WARN if KVM wants to force an immediate exit, as the TDX module does
	 * not guarantee entry into the guest, i.e. it's possible for KVM to
	 * _think_ it completed entry to the guest and forced an immediate exit
	 * without actually having done so.  Luckily, KVM never needs to force
	 * an immediate exit for TDX (KVM can't do direct event injection), so
	 * just WARN and continue on.
	 */
1036 WARN_ON_ONCE(run_flags);
1037
1038 /*
1039 * Wait until retry of SEPT-zap-related SEAMCALL completes before
1040 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
1041 * TDCALLs.
1042 */
1043 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1044 return EXIT_FASTPATH_EXIT_HANDLED;
1045
1046 trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
1047
1048 if (pi_test_on(&vt->pi_desc)) {
1049 apic->send_IPI_self(POSTED_INTR_VECTOR);
1050
1051 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1052 APIC_VECTOR_MASK, &vt->pi_desc))
1053 kvm_wait_lapic_expire(vcpu);
1054 }
1055
1056 tdx_vcpu_enter_exit(vcpu);
1057
1058 if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
1059 update_debugctlmsr(vcpu->arch.host_debugctl);
1060
1061 tdx_load_host_xsave_state(vcpu);
1062 tdx->guest_entered = true;
1063
1064 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1065
1066 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1067 return EXIT_FASTPATH_NONE;
1068
1069 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1070 return EXIT_FASTPATH_NONE;
1071
1072 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1073 kvm_machine_check();
1074
1075 trace_kvm_exit(vcpu, KVM_ISA_VMX);
1076
1077 if (unlikely(tdx_failed_vmentry(vcpu)))
1078 return EXIT_FASTPATH_NONE;
1079
1080 return tdx_exit_handlers_fastpath(vcpu);
1081 }
1082
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1084 {
1085 ++vcpu->stat.nmi_injections;
1086 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1087 /*
1088 * From KVM's perspective, NMI injection is completed right after
1089 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1090 * the TDX module or not.
1091 */
1092 vcpu->arch.nmi_injected = false;
	/*
	 * TDX doesn't allow KVM to request an NMI-window exit.  If there is
	 * still a pending vNMI, KVM is not able to inject it back-to-back with
	 * the one already pending in the TDX module.  Since that previous vNMI
	 * has not yet been delivered to the TDX guest, it's OK to collapse the
	 * new pending vNMI into the previous one.  The guest is expected to
	 * handle all NMI sources when handling the first vNMI.
	 */
1102 vcpu->arch.nmi_pending = 0;
1103 }
1104
static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1106 {
1107 u32 intr_info = vmx_get_intr_info(vcpu);
1108
1109 /*
1110 * Machine checks are handled by handle_exception_irqoff(), or by
1111 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1112 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1113 */
1114 if (is_nmi(intr_info) || is_machine_check(intr_info))
1115 return 1;
1116
1117 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1118 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1119 vcpu->run->ex.error_code = 0;
1120
1121 return 0;
1122 }
1123
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1125 {
1126 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1127 return 1;
1128 }
1129
static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1131 {
1132 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1133 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1134 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1135 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1136 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1137
1138 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1139 }
1140
1141 /*
1142 * Split into chunks and check interrupt pending between chunks. This allows
1143 * for timely injection of interrupts to prevent issues with guest lockup
1144 * detection.
1145 */
1146 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1147 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1148
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1150 {
1151 struct vcpu_tdx *tdx = to_tdx(vcpu);
1152
1153 if (vcpu->run->hypercall.ret) {
1154 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1155 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1156 return 1;
1157 }
1158
1159 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1160 if (tdx->map_gpa_next >= tdx->map_gpa_end)
1161 return 1;
1162
1163 /*
1164 * Stop processing the remaining part if there is a pending interrupt,
1165 * which could be qualified to deliver. Skip checking pending RVI for
1166 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1167 */
1168 if (kvm_vcpu_has_events(vcpu)) {
1169 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1170 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1171 return 1;
1172 }
1173
1174 __tdx_map_gpa(tdx);
1175 return 0;
1176 }
1177
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1179 {
1180 u64 gpa = tdx->map_gpa_next;
1181 u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1182
1183 if (size > TDX_MAP_GPA_MAX_LEN)
1184 size = TDX_MAP_GPA_MAX_LEN;
1185
1186 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1187 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
	/*
	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
	 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
	 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
	 */
1194 tdx->vcpu.run->hypercall.ret = 0;
1195 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1196 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1197 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1198 KVM_MAP_GPA_RANGE_ENCRYPTED :
1199 KVM_MAP_GPA_RANGE_DECRYPTED;
1200 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1201
1202 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1203 }
1204
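/*
 * TDG.VP.VMCALL<MapGPA> register ABI as consumed below: R12 holds the start
 * GPA and R13 the size in bytes; on failure or retry, R11 is loaded with the
 * GPA at which processing stopped.
 */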
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1206 {
1207 struct vcpu_tdx *tdx = to_tdx(vcpu);
1208 u64 gpa = tdx->vp_enter_args.r12;
1209 u64 size = tdx->vp_enter_args.r13;
1210 u64 ret;
1211
1212 /*
1213 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1214 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1215 * bit set. This is a base call so it should always be supported, but
1216 * KVM has no way to ensure that userspace implements the GHCI correctly.
1217 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1218 * to the guest.
1219 */
1220 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1221 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1222 goto error;
1223 }
1224
1225 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1226 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1227 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1228 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1229 ret = TDVMCALL_STATUS_INVALID_OPERAND;
1230 goto error;
1231 }
1232
1233 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1234 ret = TDVMCALL_STATUS_ALIGN_ERROR;
1235 goto error;
1236 }
1237
1238 tdx->map_gpa_end = gpa + size;
1239 tdx->map_gpa_next = gpa;
1240
1241 __tdx_map_gpa(tdx);
1242 return 0;
1243
1244 error:
1245 tdvmcall_set_return_code(vcpu, ret);
1246 tdx->vp_enter_args.r11 = gpa;
1247 return 1;
1248 }
1249
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1251 {
1252 struct vcpu_tdx *tdx = to_tdx(vcpu);
1253 u64 *regs = vcpu->run->system_event.data;
1254 u64 *module_regs = &tdx->vp_enter_args.r8;
1255 int index = VCPU_REGS_RAX;
1256
1257 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1258 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1259 vcpu->run->system_event.ndata = 16;
1260
1261 /* Dump 16 general-purpose registers to userspace in ascending order. */
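	/* The two zeroed slots are RSP and RBP, which the TDVMCALL does not provide. */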
1262 regs[index++] = tdx->vp_enter_ret;
1263 regs[index++] = tdx->vp_enter_args.rcx;
1264 regs[index++] = tdx->vp_enter_args.rdx;
1265 regs[index++] = tdx->vp_enter_args.rbx;
1266 regs[index++] = 0;
1267 regs[index++] = 0;
1268 regs[index++] = tdx->vp_enter_args.rsi;
1269 regs[index] = tdx->vp_enter_args.rdi;
1270 for (index = 0; index < 8; index++)
1271 regs[VCPU_REGS_R8 + index] = module_regs[index];
1272
1273 return 0;
1274 }
1275
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1277 {
1278 u32 eax, ebx, ecx, edx;
1279 struct vcpu_tdx *tdx = to_tdx(vcpu);
1280
1281 /* EAX and ECX for cpuid is stored in R12 and R13. */
1282 eax = tdx->vp_enter_args.r12;
1283 ecx = tdx->vp_enter_args.r13;
1284
1285 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1286
1287 tdx->vp_enter_args.r12 = eax;
1288 tdx->vp_enter_args.r13 = ebx;
1289 tdx->vp_enter_args.r14 = ecx;
1290 tdx->vp_enter_args.r15 = edx;
1291
1292 return 1;
1293 }
1294
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1296 {
1297 vcpu->arch.pio.count = 0;
1298 return 1;
1299 }
1300
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1302 {
1303 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1304 unsigned long val = 0;
1305 int ret;
1306
1307 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1308 vcpu->arch.pio.port, &val, 1);
1309
1310 WARN_ON_ONCE(!ret);
1311
1312 tdvmcall_set_return_val(vcpu, val);
1313
1314 return 1;
1315 }
1316
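/*
 * TDG.VP.VMCALL<Instruction.IO> register ABI as consumed below: R12 is the
 * access size, R13 the direction (0 = read, 1 = write), R14 the port and R15
 * the value for writes; the result of a read is returned in R11.
 */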
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1318 {
1319 struct vcpu_tdx *tdx = to_tdx(vcpu);
1320 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1321 unsigned long val = 0;
1322 unsigned int port;
1323 u64 size, write;
1324 int ret;
1325
1326 ++vcpu->stat.io_exits;
1327
1328 size = tdx->vp_enter_args.r12;
1329 write = tdx->vp_enter_args.r13;
1330 port = tdx->vp_enter_args.r14;
1331
1332 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1333 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1334 return 1;
1335 }
1336
1337 if (write) {
1338 val = tdx->vp_enter_args.r15;
1339 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1340 } else {
1341 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1342 }
1343
1344 if (!ret)
1345 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1346 tdx_complete_pio_in;
1347 else if (!write)
1348 tdvmcall_set_return_val(vcpu, val);
1349
1350 return ret;
1351 }
1352
static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1354 {
1355 unsigned long val = 0;
1356 gpa_t gpa;
1357 int size;
1358
1359 gpa = vcpu->mmio_fragments[0].gpa;
1360 size = vcpu->mmio_fragments[0].len;
1361
1362 memcpy(&val, vcpu->run->mmio.data, size);
1363 tdvmcall_set_return_val(vcpu, val);
1364 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1365 return 1;
1366 }
1367
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
				 unsigned long val)
1370 {
1371 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1372 trace_kvm_fast_mmio(gpa);
1373 return 0;
1374 }
1375
1376 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1377 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1378 return -EOPNOTSUPP;
1379
1380 return 0;
1381 }
1382
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1384 {
1385 unsigned long val;
1386
1387 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1388 return -EOPNOTSUPP;
1389
1390 tdvmcall_set_return_val(vcpu, val);
1391 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1392 return 0;
1393 }
1394
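/*
 * TDG.VP.VMCALL<#VE.RequestMMIO> register ABI as consumed below: R12 is the
 * access size, R13 the direction (0 = read, 1 = write), R14 the shared GPA
 * and R15 the value for writes; the value read is returned in R11.
 */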
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1396 {
1397 struct vcpu_tdx *tdx = to_tdx(vcpu);
1398 int size, write, r;
1399 unsigned long val;
1400 gpa_t gpa;
1401
1402 size = tdx->vp_enter_args.r12;
1403 write = tdx->vp_enter_args.r13;
1404 gpa = tdx->vp_enter_args.r14;
1405 val = write ? tdx->vp_enter_args.r15 : 0;
1406
1407 if (size != 1 && size != 2 && size != 4 && size != 8)
1408 goto error;
1409 if (write != 0 && write != 1)
1410 goto error;
1411
1412 /*
1413 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
1414 * do MMIO emulation for private GPA.
1415 */
1416 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1417 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1418 goto error;
1419
1420 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1421
1422 if (write)
1423 r = tdx_mmio_write(vcpu, gpa, size, val);
1424 else
1425 r = tdx_mmio_read(vcpu, gpa, size);
1426 if (!r)
1427 /* Kernel completed device emulation. */
1428 return 1;
1429
1430 /* Request the device emulation to userspace device model. */
1431 vcpu->mmio_is_write = write;
1432 if (!write)
1433 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1434
1435 vcpu->run->mmio.phys_addr = gpa;
1436 vcpu->run->mmio.len = size;
1437 vcpu->run->mmio.is_write = write;
1438 vcpu->run->exit_reason = KVM_EXIT_MMIO;
1439
1440 if (write) {
1441 memcpy(vcpu->run->mmio.data, &val, size);
1442 } else {
1443 vcpu->mmio_fragments[0].gpa = gpa;
1444 vcpu->mmio_fragments[0].len = size;
1445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1446 }
1447 return 0;
1448
1449 error:
1450 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1451 return 1;
1452 }
1453
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1455 {
1456 struct vcpu_tdx *tdx = to_tdx(vcpu);
1457
1458 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1459
	/*
	 * For now, KVM doesn't directly support any TDVMCALL beyond the GHCI
	 * base API without help from userspace, so just forward the values
	 * returned by userspace.
	 */
1465 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1466 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1467 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1468 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1469
1470 return 1;
1471 }
1472
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1474 {
1475 struct vcpu_tdx *tdx = to_tdx(vcpu);
1476
1477 switch (tdx->vp_enter_args.r12) {
1478 case 0:
1479 tdx->vp_enter_args.r11 = 0;
1480 tdx->vp_enter_args.r12 = 0;
1481 tdx->vp_enter_args.r13 = 0;
1482 tdx->vp_enter_args.r14 = 0;
1483 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1484 return 1;
1485 case 1:
1486 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1487 vcpu->run->exit_reason = KVM_EXIT_TDX;
1488 vcpu->run->tdx.flags = 0;
1489 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1490 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1491 vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1492 vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1493 vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1494 vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1495 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1496 return 0;
1497 default:
1498 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1499 return 1;
1500 }
1501 }
1502
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1504 {
1505 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1506 return 1;
1507 }
1508
static int tdx_get_quote(struct kvm_vcpu *vcpu)
1510 {
1511 struct vcpu_tdx *tdx = to_tdx(vcpu);
1512 u64 gpa = tdx->vp_enter_args.r12;
1513 u64 size = tdx->vp_enter_args.r13;
1514
1515 /* The gpa of buffer must have shared bit set. */
1516 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1517 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1518 return 1;
1519 }
1520
1521 vcpu->run->exit_reason = KVM_EXIT_TDX;
1522 vcpu->run->tdx.flags = 0;
1523 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1524 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1525 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1526 vcpu->run->tdx.get_quote.size = size;
1527
1528 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1529
1530 return 0;
1531 }
1532
static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1534 {
1535 struct vcpu_tdx *tdx = to_tdx(vcpu);
1536 u64 vector = tdx->vp_enter_args.r12;
1537
1538 if (vector < 32 || vector > 255) {
1539 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1540 return 1;
1541 }
1542
1543 vcpu->run->exit_reason = KVM_EXIT_TDX;
1544 vcpu->run->tdx.flags = 0;
1545 vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1546 vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1547 vcpu->run->tdx.setup_event_notify.vector = vector;
1548
1549 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1550
1551 return 0;
1552 }
1553
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1555 {
1556 switch (tdvmcall_leaf(vcpu)) {
1557 case TDVMCALL_MAP_GPA:
1558 return tdx_map_gpa(vcpu);
1559 case TDVMCALL_REPORT_FATAL_ERROR:
1560 return tdx_report_fatal_error(vcpu);
1561 case TDVMCALL_GET_TD_VM_CALL_INFO:
1562 return tdx_get_td_vm_call_info(vcpu);
1563 case TDVMCALL_GET_QUOTE:
1564 return tdx_get_quote(vcpu);
1565 case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1566 return tdx_setup_event_notify_interrupt(vcpu);
1567 default:
1568 break;
1569 }
1570
1571 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1572 return 1;
1573 }
1574
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1576 {
1577 u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1578 TDX_SHARED_BIT_PWL_4;
1579
1580 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1581 return;
1582
1583 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1584 }
1585
static void tdx_unpin(struct kvm *kvm, struct page *page)
1587 {
1588 put_page(page);
1589 }
1590
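/*
 * TDH.MEM.PAGE.AUG adds a private page to a TD that is already RUNNABLE (see
 * tdx_sept_set_private_spte() below).  On failure the SEAMCALL reports the
 * offending SEPT entry and its level through @entry and @level_state, which
 * are only used for error printing here.
 */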
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, struct page *page)
1593 {
1594 int tdx_level = pg_level_to_tdx_sept_level(level);
1595 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1596 gpa_t gpa = gfn_to_gpa(gfn);
1597 u64 entry, level_state;
1598 u64 err;
1599
1600 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1601 if (unlikely(tdx_operand_busy(err))) {
1602 tdx_unpin(kvm, page);
1603 return -EBUSY;
1604 }
1605
1606 if (KVM_BUG_ON(err, kvm)) {
1607 pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1608 tdx_unpin(kvm, page);
1609 return -EIO;
1610 }
1611
1612 return 0;
1613 }
1614
/*
 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
 * callback tdx_gmem_post_populate() then maps pages into private memory
 * through the SEAMCALL TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
 * private EPT structures for the page to have been built beforehand, which is
 * done via kvm_tdp_map_page().  nr_premapped counts the number of pages that
 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
 * are no half-initialized shared EPT pages.
 */
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
					  enum pg_level level, kvm_pfn_t pfn)
1627 {
1628 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1629
1630 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1631 return -EINVAL;
1632
1633 /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1634 atomic64_inc(&kvm_tdx->nr_premapped);
1635 return 0;
1636 }
1637
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, kvm_pfn_t pfn)
1640 {
1641 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1642 struct page *page = pfn_to_page(pfn);
1643
1644 /* TODO: handle large pages. */
1645 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1646 return -EINVAL;
1647
1648 /*
1649 * Because guest_memfd doesn't support page migration with
1650 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1651 * migration. Until guest_memfd supports page migration, prevent page
1652 * migration.
1653 * TODO: Once guest_memfd introduces callback on page migration,
1654 * implement it and remove get_page/put_page().
1655 */
1656 get_page(page);
1657
1658 /*
1659 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1660 * barrier in tdx_td_finalize().
1661 */
1662 smp_rmb();
1663 if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1664 return tdx_mem_page_aug(kvm, gfn, level, page);
1665
1666 return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1667 }
1668
static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
				      enum pg_level level, struct page *page)
1671 {
1672 int tdx_level = pg_level_to_tdx_sept_level(level);
1673 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1674 gpa_t gpa = gfn_to_gpa(gfn);
1675 u64 err, entry, level_state;
1676
1677 /* TODO: handle large pages. */
1678 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1679 return -EINVAL;
1680
1681 if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1682 return -EINVAL;
1683
	/*
	 * When zapping a private page, the mmu_lock is held for write, so
	 * there is no race with other vCPUs' SEPT operations.  It can still
	 * race with TDH.VP.ENTER (due to 0-step mitigation) and guest TDCALLs.
	 */
1689 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1690 &level_state);
1691
1692 if (unlikely(tdx_operand_busy(err))) {
1693 /*
1694 * The second retry is expected to succeed after kicking off all
1695 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
1696 */
1697 tdx_no_vcpus_enter_start(kvm);
1698 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1699 &level_state);
1700 tdx_no_vcpus_enter_stop(kvm);
1701 }
1702
1703 if (KVM_BUG_ON(err, kvm)) {
1704 pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1705 return -EIO;
1706 }
1707
1708 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1709
1710 if (KVM_BUG_ON(err, kvm)) {
1711 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1712 return -EIO;
1713 }
1714 tdx_quirk_reset_page(page);
1715 tdx_unpin(kvm, page);
1716 return 0;
1717 }
1718
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, void *private_spt)
1721 {
1722 int tdx_level = pg_level_to_tdx_sept_level(level);
1723 gpa_t gpa = gfn_to_gpa(gfn);
1724 struct page *page = virt_to_page(private_spt);
1725 u64 err, entry, level_state;
1726
1727 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1728 &level_state);
1729 if (unlikely(tdx_operand_busy(err)))
1730 return -EBUSY;
1731
1732 if (KVM_BUG_ON(err, kvm)) {
1733 pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1734 return -EIO;
1735 }
1736
1737 return 0;
1738 }
1739
1740 /*
1741 * Check if the error returned from a SEPT zap SEAMCALL is due to a page being
1742 * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() having been
1743 * called successfully.
1744 *
1745 * Since tdh_mem_sept_add() must have been invoked successfully before a
1746 * non-leaf entry can be present in the mirrored page table, the SEPT zap
1747 * SEAMCALLs should not encounter TDX_EPT_WALK_FAILED. They should instead
1748 * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry in the
1749 * SEPT.
1750 *
1751 * Further check whether the entry returned from the SEPT walk has RWX
1752 * permissions, to filter out anything unexpected.
1753 *
1754 * Note: @level is a pg_level, not a tdx_level. The tdx_level extracted from
1755 * the level_state returned on a SEAMCALL error is the same as the one passed
1756 * into the SEAMCALL.
1757 */
tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx * kvm_tdx,u64 err,u64 entry,int level)1758 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1759 u64 entry, int level)
1760 {
1761 if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1762 return false;
1763
1764 if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1765 return false;
1766
1767 if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1768 return false;
1769
1770 return true;
1771 }
1772
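/*
 * Return-value convention of tdx_sept_zap_private_spte() (a descriptive
 * summary of the code below, not new behavior):
 *   < 0 : hard error (the TD is bugged via KVM_BUG_ON).
 *     0 : the zap was satisfied by consuming a pre-mapped page counted in
 *         nr_premapped; no BLOCK was installed, so no TLB tracking or page
 *         removal is needed.
 *     1 : TDH.MEM.RANGE.BLOCK succeeded; the caller
 *         (tdx_sept_remove_private_spte()) must follow up with tdx_track()
 *         and TDH.MEM.PAGE.REMOVE.
 */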
tdx_sept_zap_private_spte(struct kvm * kvm,gfn_t gfn,enum pg_level level,struct page * page)1773 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1774 enum pg_level level, struct page *page)
1775 {
1776 int tdx_level = pg_level_to_tdx_sept_level(level);
1777 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1778 gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1779 u64 err, entry, level_state;
1780
1781 /* Large pages aren't supported yet. */
1782 WARN_ON_ONCE(level != PG_LEVEL_4K);
1783
1784 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1785
1786 if (unlikely(tdx_operand_busy(err))) {
1787 /* After preventing vCPUs from entering, the second retry is expected to succeed */
1788 tdx_no_vcpus_enter_start(kvm);
1789 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1790 tdx_no_vcpus_enter_stop(kvm);
1791 }
1792 if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1793 !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1794 atomic64_dec(&kvm_tdx->nr_premapped);
1795 tdx_unpin(kvm, page);
1796 return 0;
1797 }
1798
1799 if (KVM_BUG_ON(err, kvm)) {
1800 pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1801 return -EIO;
1802 }
1803 return 1;
1804 }
1805
1806 /*
1807 * Ensure shared and private EPT translations are flushed on all vCPUs.
1808 * tdh_mem_track() is the only caller that increases the TD epoch. An increase
1809 * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPU is
1810 * running in guest mode with the value "N - 1".
1811 *
1812 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1813 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1814 * is increased to "N + 1".
1815 *
1816 * Kicking off all vCPUs after that further ensures that no vCPU can run in
1817 * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
1818 * (e.g. to increase the TD epoch to "N + 2").
1819 *
1820 * The TDX module will flush the EPT on the next TD enter and make vCPUs run
1821 * in guest mode with TD epoch value "N + 1".
1822 *
1823 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1824 * waiting for the empty IPI handler ack_kick().
1825 *
1826 * No action is required from the vCPUs being kicked off, since the kick-off
1827 * is guaranteed to occur after the TD epoch increment and before the next
1828 * tdh_mem_track().
1829 */
tdx_track(struct kvm * kvm)1830 static void tdx_track(struct kvm *kvm)
1831 {
1832 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1833 u64 err;
1834
1835 /* If the TD isn't finalized, no vCPU has run yet. */
1836 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1837 return;
1838
1839 lockdep_assert_held_write(&kvm->mmu_lock);
1840
1841 err = tdh_mem_track(&kvm_tdx->td);
1842 if (unlikely(tdx_operand_busy(err))) {
1843 /* After preventing vCPUs from entering, the second retry is expected to succeed */
1844 tdx_no_vcpus_enter_start(kvm);
1845 err = tdh_mem_track(&kvm_tdx->td);
1846 tdx_no_vcpus_enter_stop(kvm);
1847 }
1848
1849 if (KVM_BUG_ON(err, kvm))
1850 pr_tdx_error(TDH_MEM_TRACK, err);
1851
1852 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1853 }
1854
tdx_sept_free_private_spt(struct kvm * kvm,gfn_t gfn,enum pg_level level,void * private_spt)1855 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1856 enum pg_level level, void *private_spt)
1857 {
1858 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1859
1860 /*
1861 * free_external_spt() is only called after the hkid has been freed, when
1862 * the TD is being torn down.
1863 * KVM doesn't (yet) zap page table pages in the mirror page table while
1864 * the TD is active, though guest pages mapped in the mirror page table can
1865 * be zapped while the TD is active, e.g. for shared <-> private conversion
1866 * and slot move/deletion.
1867 */
1868 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1869 return -EINVAL;
1870
1871 /*
1872 * The HKID assigned to this TD was already freed and cache was
1873 * already flushed. We don't have to flush again.
1874 */
1875 return tdx_reclaim_page(virt_to_page(private_spt));
1876 }
1877
tdx_sept_remove_private_spte(struct kvm * kvm,gfn_t gfn,enum pg_level level,kvm_pfn_t pfn)1878 static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1879 enum pg_level level, kvm_pfn_t pfn)
1880 {
1881 struct page *page = pfn_to_page(pfn);
1882 int ret;
1883
1884 /*
1885 * HKID is released after all private pages have been removed, and set
1886 * before any might be populated. Warn if zapping is attempted when
1887 * there can't be anything populated in the private EPT.
1888 */
1889 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1890 return -EINVAL;
1891
1892 ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1893 if (ret <= 0)
1894 return ret;
1895
1896 /*
1897 * TDX requires TLB tracking before dropping a private page. Do
1898 * it here, although it is also done later.
1899 */
1900 tdx_track(kvm);
1901
1902 return tdx_sept_drop_private_spte(kvm, gfn, level, page);
1903 }
1904
tdx_deliver_interrupt(struct kvm_lapic * apic,int delivery_mode,int trig_mode,int vector)1905 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1906 int trig_mode, int vector)
1907 {
1908 struct kvm_vcpu *vcpu = apic->vcpu;
1909 struct vcpu_tdx *tdx = to_tdx(vcpu);
1910
1911 /* TDX supports only posted interrupts; there is no local APIC emulation. */
1912 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1913
1914 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1915 }
1916
tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu * vcpu)1917 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1918 {
1919 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1920 u64 eq = vmx_get_exit_qual(vcpu);
1921
1922 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1923 return false;
1924
1925 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1926 }
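/*
 * Informational note: when the extended exit qualification reports a
 * "pending" EPT violation and neither qualification bit checked above is
 * set, tdx_handle_ept_violation() interprets it as the guest accessing a
 * private page it has not yet accepted, and terminates the VM.
 */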
1927
tdx_handle_ept_violation(struct kvm_vcpu * vcpu)1928 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1929 {
1930 unsigned long exit_qual;
1931 gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1932 bool local_retry = false;
1933 int ret;
1934
1935 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1936 if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1937 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1938 gpa, vcpu->vcpu_id);
1939 kvm_vm_dead(vcpu->kvm);
1940 return -EIO;
1941 }
1942 /*
1943 * Always treat SEPT violations as write faults. Ignore the
1944 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1945 * TD private pages are always RWX in the SEPT tables,
1946 * i.e. they're always mapped writable. Just as importantly,
1947 * treating SEPT violations as write faults is necessary to
1948 * avoid COW allocations, which will cause TDAUGPAGE failures
1949 * due to aliasing a single HPA to multiple GPAs.
1950 */
1951 exit_qual = EPT_VIOLATION_ACC_WRITE;
1952
1953 /* Only private GPA triggers zero-step mitigation */
1954 local_retry = true;
1955 } else {
1956 exit_qual = vmx_get_exit_qual(vcpu);
1957 /*
1958 * EPT violation due to instruction fetch should never be
1959 * triggered from shared memory in TDX guest. If such EPT
1960 * violation occurs, treat it as broken hardware.
1961 */
1962 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1963 return -EIO;
1964 }
1965
1966 trace_kvm_page_fault(vcpu, gpa, exit_qual);
1967
1968 /*
1969 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1970 * mapping in TDX.
1971 *
1972 * KVM may return RET_PF_RETRY for private GPA due to
1973 * - contentions when atomically updating SPTEs of the mirror page table
1974 * - in-progress GFN invalidation or memslot removal.
1975 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1976 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1977 * or certain TDCALLs.
1978 *
1979 * If TDH.VP.ENTER is invoked more times than the threshold set by the
1980 * TDX module before KVM resolves the private GPA mapping, the TDX
1981 * module will activate zero-step mitigation during TDH.VP.ENTER. This
1982 * process acquires an SEPT tree lock in the TDX module, leading to
1983 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1984 * operations on other vCPUs.
1985 *
1986 * Breaking out of local retries for kvm_vcpu_has_events() is for
1987 * interrupt injection. kvm_vcpu_has_events() should not see pending
1988 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1989 * blocked by TDs, false positives are inevitable, i.e., KVM may
1990 * re-enter the guest even if the IRQ/NMI can't be delivered.
1991 *
1992 * Note: even without breaking out of local retries, zero-step
1993 * mitigation may still occur due to
1994 * - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1995 * - a single RIP causing EPT violations for more GFNs than the
1996 * threshold count.
1997 * This is safe, as triggering zero-step mitigation only introduces
1998 * contention on the page installation SEAMCALLs on other vCPUs, which
1999 * will handle retries locally in their EPT violation handlers.
2000 */
2001 while (1) {
2002 struct kvm_memory_slot *slot;
2003
2004 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
2005
2006 if (ret != RET_PF_RETRY || !local_retry)
2007 break;
2008
2009 if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
2010 break;
2011
2012 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
2013 ret = -EIO;
2014 break;
2015 }
2016
2017 /*
2018 * Bail if the memslot is invalid, i.e. is being deleted, as
2019 * faulting in will never succeed and this task needs to drop
2020 * SRCU in order to let memslot deletion complete.
2021 */
2022 slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
2023 if (slot && slot->flags & KVM_MEMSLOT_INVALID)
2024 break;
2025
2026 cond_resched();
2027 }
2028 return ret;
2029 }
2030
tdx_complete_emulated_msr(struct kvm_vcpu * vcpu,int err)2031 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2032 {
2033 if (err) {
2034 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2035 return 1;
2036 }
2037
2038 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2039 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2040
2041 return 1;
2042 }
2043
2044
tdx_handle_exit(struct kvm_vcpu * vcpu,fastpath_t fastpath)2045 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2046 {
2047 struct vcpu_tdx *tdx = to_tdx(vcpu);
2048 u64 vp_enter_ret = tdx->vp_enter_ret;
2049 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2050
2051 if (fastpath != EXIT_FASTPATH_NONE)
2052 return 1;
2053
2054 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2055 KVM_BUG_ON(1, vcpu->kvm);
2056 return -EIO;
2057 }
2058
2059 /*
2060 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2061 * TDX_SEAMCALL_VMFAILINVALID.
2062 */
2063 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2064 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2065 goto unhandled_exit;
2066 }
2067
2068 if (unlikely(tdx_failed_vmentry(vcpu))) {
2069 /*
2070 * If the guest state is protected, then off-TD debug is not
2071 * enabled, and TDX_NON_RECOVERABLE must be set.
2072 */
2073 WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2074 !(vp_enter_ret & TDX_NON_RECOVERABLE));
2075 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2076 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2077 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2078 return 0;
2079 }
2080
2081 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2082 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2083 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2084 goto unhandled_exit;
2085 }
2086
2087 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2088 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2089
2090 switch (exit_reason.basic) {
2091 case EXIT_REASON_TRIPLE_FAULT:
2092 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2093 vcpu->mmio_needed = 0;
2094 return 0;
2095 case EXIT_REASON_EXCEPTION_NMI:
2096 return tdx_handle_exception_nmi(vcpu);
2097 case EXIT_REASON_EXTERNAL_INTERRUPT:
2098 ++vcpu->stat.irq_exits;
2099 return 1;
2100 case EXIT_REASON_CPUID:
2101 return tdx_emulate_cpuid(vcpu);
2102 case EXIT_REASON_HLT:
2103 return kvm_emulate_halt_noskip(vcpu);
2104 case EXIT_REASON_TDCALL:
2105 return handle_tdvmcall(vcpu);
2106 case EXIT_REASON_VMCALL:
2107 return tdx_emulate_vmcall(vcpu);
2108 case EXIT_REASON_IO_INSTRUCTION:
2109 return tdx_emulate_io(vcpu);
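/*
 * Informational sketch of the MSR forwarding below (derived from the
 * code, not from any spec text here): the TDVMCALL arguments arrive in
 * vp_enter_args, with r12 carrying the MSR index and r13 the 64-bit
 * value for writes. They are mirrored into RCX/RAX/RDX so the common
 * kvm_emulate_rdmsr()/kvm_emulate_wrmsr() paths can be reused, e.g. for
 * a write:
 *
 *	RCX = r12;		MSR index
 *	RAX = r13 & -1u;	low 32 bits of the value
 *	RDX = r13 >> 32;	high 32 bits of the value
 */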
2110 case EXIT_REASON_MSR_READ:
2111 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2112 return kvm_emulate_rdmsr(vcpu);
2113 case EXIT_REASON_MSR_WRITE:
2114 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2115 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2116 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2117 return kvm_emulate_wrmsr(vcpu);
2118 case EXIT_REASON_EPT_MISCONFIG:
2119 return tdx_emulate_mmio(vcpu);
2120 case EXIT_REASON_EPT_VIOLATION:
2121 return tdx_handle_ept_violation(vcpu);
2122 case EXIT_REASON_OTHER_SMI:
2123 /*
2124 * Unlike VMX, an SMI in SEAM non-root mode (i.e. while a
2125 * TD guest vCPU is running) causes a VM exit to the TDX module,
2126 * then SEAMRET to KVM. Once it exits to KVM, the SMI is delivered
2127 * and handled by the kernel handler right away.
2128 *
2129 * The Other SMI exit can also be caused by a SEAM non-root
2130 * machine check delivered via Machine Check System Management
2131 * Interrupt (MSMI), but it has already been handled by the
2132 * kernel machine check handler, i.e., the memory page has been
2133 * marked as poisoned and won't be returned to the free list
2134 * when the TDX guest is terminated (the TDX module marks the
2135 * guest as dead and prevents it from running further when a
2136 * machine check happens in SEAM non-root).
2137 *
2138 * - An MSMI will not reach here; it's handled by the
2139 * non-recoverable case above.
2140 * - If it's not an MSMI, there is nothing to do here.
2141 */
2142 return 1;
2143 default:
2144 break;
2145 }
2146
2147 unhandled_exit:
2148 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2149 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2150 vcpu->run->internal.ndata = 2;
2151 vcpu->run->internal.data[0] = vp_enter_ret;
2152 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2153 return 0;
2154 }
2155
tdx_get_exit_info(struct kvm_vcpu * vcpu,u32 * reason,u64 * info1,u64 * info2,u32 * intr_info,u32 * error_code)2156 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2157 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2158 {
2159 struct vcpu_tdx *tdx = to_tdx(vcpu);
2160
2161 *reason = tdx->vt.exit_reason.full;
2162 if (*reason != -1u) {
2163 *info1 = vmx_get_exit_qual(vcpu);
2164 *info2 = tdx->ext_exit_qualification;
2165 *intr_info = vmx_get_intr_info(vcpu);
2166 } else {
2167 *info1 = 0;
2168 *info2 = 0;
2169 *intr_info = 0;
2170 }
2171
2172 *error_code = 0;
2173 }
2174
tdx_has_emulated_msr(u32 index)2175 bool tdx_has_emulated_msr(u32 index)
2176 {
2177 switch (index) {
2178 case MSR_IA32_UCODE_REV:
2179 case MSR_IA32_ARCH_CAPABILITIES:
2180 case MSR_IA32_POWER_CTL:
2181 case MSR_IA32_CR_PAT:
2182 case MSR_MTRRcap:
2183 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2184 case MSR_MTRRdefType:
2185 case MSR_IA32_TSC_DEADLINE:
2186 case MSR_IA32_MISC_ENABLE:
2187 case MSR_PLATFORM_INFO:
2188 case MSR_MISC_FEATURES_ENABLES:
2189 case MSR_IA32_APICBASE:
2190 case MSR_EFER:
2191 case MSR_IA32_FEAT_CTL:
2192 case MSR_IA32_MCG_CAP:
2193 case MSR_IA32_MCG_STATUS:
2194 case MSR_IA32_MCG_CTL:
2195 case MSR_IA32_MCG_EXT_CTL:
2196 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2197 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2198 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2199 case MSR_KVM_POLL_CONTROL:
2200 return true;
2201 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2202 /*
2203 * x2APIC registers that are virtualized by the CPU can't be
2204 * emulated, as KVM doesn't have access to the virtual APIC page.
2205 */
2206 switch (index) {
2207 case X2APIC_MSR(APIC_TASKPRI):
2208 case X2APIC_MSR(APIC_PROCPRI):
2209 case X2APIC_MSR(APIC_EOI):
2210 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2211 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2212 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2213 return false;
2214 default:
2215 return true;
2216 }
2217 default:
2218 return false;
2219 }
2220 }
2221
tdx_is_read_only_msr(u32 index)2222 static bool tdx_is_read_only_msr(u32 index)
2223 {
2224 return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2225 index == MSR_IA32_FEAT_CTL;
2226 }
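/*
 * Rationale sketch (an inference from this file, not a spec citation): the
 * MSRs above are modeled as read-only because their values are fixed for
 * TDs. APIC_BASE is forced to x2APIC mode at KVM_TDX_INIT_VCPU and
 * MSR_IA32_FEAT_CTL is synthesized in tdx_get_msr(); EFER is assumed to be
 * under TDX module control. Writes are therefore rejected rather than
 * silently dropped.
 */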
2227
tdx_get_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)2228 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2229 {
2230 switch (msr->index) {
2231 case MSR_IA32_FEAT_CTL:
2232 /*
2233 * MCE and MCA are advertised via CPUID. The guest kernel may
2234 * check whether LMCE is enabled.
2235 */
2236 msr->data = FEAT_CTL_LOCKED;
2237 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2238 msr->data |= FEAT_CTL_LMCE_ENABLED;
2239 return 0;
2240 case MSR_IA32_MCG_EXT_CTL:
2241 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2242 return 1;
2243 msr->data = vcpu->arch.mcg_ext_ctl;
2244 return 0;
2245 default:
2246 if (!tdx_has_emulated_msr(msr->index))
2247 return 1;
2248
2249 return kvm_get_msr_common(vcpu, msr);
2250 }
2251 }
2252
tdx_set_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)2253 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2254 {
2255 switch (msr->index) {
2256 case MSR_IA32_MCG_EXT_CTL:
2257 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2258 (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2259 return 1;
2260 vcpu->arch.mcg_ext_ctl = msr->data;
2261 return 0;
2262 default:
2263 if (tdx_is_read_only_msr(msr->index))
2264 return 1;
2265
2266 if (!tdx_has_emulated_msr(msr->index))
2267 return 1;
2268
2269 return kvm_set_msr_common(vcpu, msr);
2270 }
2271 }
2272
tdx_get_capabilities(struct kvm_tdx_cmd * cmd)2273 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2274 {
2275 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2276 struct kvm_tdx_capabilities __user *user_caps;
2277 struct kvm_tdx_capabilities *caps = NULL;
2278 u32 nr_user_entries;
2279 int ret = 0;
2280
2281 /* flags is reserved for future use */
2282 if (cmd->flags)
2283 return -EINVAL;
2284
2285 caps = kzalloc(sizeof(*caps) +
2286 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2287 GFP_KERNEL);
2288 if (!caps)
2289 return -ENOMEM;
2290
2291 user_caps = u64_to_user_ptr(cmd->data);
2292 if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
2293 ret = -EFAULT;
2294 goto out;
2295 }
2296
2297 if (nr_user_entries < td_conf->num_cpuid_config) {
2298 ret = -E2BIG;
2299 goto out;
2300 }
2301
2302 ret = init_kvm_tdx_caps(td_conf, caps);
2303 if (ret)
2304 goto out;
2305
2306 if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2307 ret = -EFAULT;
2308 goto out;
2309 }
2310
2311 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2312 caps->cpuid.nent *
2313 sizeof(caps->cpuid.entries[0])))
2314 ret = -EFAULT;
2315
2316 out:
2317 /* kfree() accepts NULL. */
2318 kfree(caps);
2319 return ret;
2320 }
2321
2322 /*
2323 * KVM reports the guest physical address width in CPUID.0x80000008.EAX[23:16],
2324 * which is similar to TDX's GPAW. Use this field as the interface for userspace
2325 * to configure the GPAW and EPT level for TDs.
2326 *
2327 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level 5,
2328 * value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2329 * supported. Value 52 is only supported when the platform supports 5-level
2330 * EPT.
2331 */
setup_tdparams_eptp_controls(struct kvm_cpuid2 * cpuid,struct td_params * td_params)2332 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2333 struct td_params *td_params)
2334 {
2335 const struct kvm_cpuid_entry2 *entry;
2336 int guest_pa;
2337
2338 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2339 if (!entry)
2340 return -EINVAL;
2341
2342 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2343
2344 if (guest_pa != 48 && guest_pa != 52)
2345 return -EINVAL;
2346
2347 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2348 return -EINVAL;
2349
2350 td_params->eptp_controls = VMX_EPTP_MT_WB;
2351 if (guest_pa == 52) {
2352 td_params->eptp_controls |= VMX_EPTP_PWL_5;
2353 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2354 } else {
2355 td_params->eptp_controls |= VMX_EPTP_PWL_4;
2356 }
2357
2358 return 0;
2359 }
2360
setup_tdparams_cpuids(struct kvm_cpuid2 * cpuid,struct td_params * td_params)2361 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2362 struct td_params *td_params)
2363 {
2364 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2365 const struct kvm_cpuid_entry2 *entry;
2366 struct tdx_cpuid_value *value;
2367 int i, copy_cnt = 0;
2368
2369 /*
2370 * td_params.cpuid_values: the number and order of cpuid_values must
2371 * match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2372 * It's assumed that td_params has been zeroed.
2373 */
2374 for (i = 0; i < td_conf->num_cpuid_config; i++) {
2375 struct kvm_cpuid_entry2 tmp;
2376
2377 td_init_cpuid_entry2(&tmp, i);
2378
2379 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2380 tmp.function, tmp.index);
2381 if (!entry)
2382 continue;
2383
2384 if (tdx_unsupported_cpuid(entry))
2385 return -EINVAL;
2386
2387 copy_cnt++;
2388
2389 value = &td_params->cpuid_values[i];
2390 value->eax = entry->eax;
2391 value->ebx = entry->ebx;
2392 value->ecx = entry->ecx;
2393 value->edx = entry->edx;
2394
2395 /*
2396 * The TDX module does not accept nonzero bits 23:16 in
2397 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2398 */
2399 if (tmp.function == 0x80000008)
2400 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2401 }
2402
2403 /*
2404 * Rely on the TDX module to reject invalid configurations, but it can't
2405 * check leaves that don't have a proper slot in td_params->cpuid_values
2406 * to land in. So fail if there were entries that didn't get copied to
2407 * td_params.
2408 */
2409 if (copy_cnt != cpuid->nent)
2410 return -EINVAL;
2411
2412 return 0;
2413 }
2414
setup_tdparams(struct kvm * kvm,struct td_params * td_params,struct kvm_tdx_init_vm * init_vm)2415 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2416 struct kvm_tdx_init_vm *init_vm)
2417 {
2418 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2419 struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2420 int ret;
2421
2422 if (kvm->created_vcpus)
2423 return -EBUSY;
2424
2425 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2426 return -EINVAL;
2427
2428 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2429 return -EINVAL;
2430
2431 td_params->max_vcpus = kvm->max_vcpus;
2432 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2433 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2434
2435 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2436 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2437
2438 ret = setup_tdparams_eptp_controls(cpuid, td_params);
2439 if (ret)
2440 return ret;
2441
2442 ret = setup_tdparams_cpuids(cpuid, td_params);
2443 if (ret)
2444 return ret;
2445
2446 #define MEMCPY_SAME_SIZE(dst, src) \
2447 do { \
2448 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2449 memcpy((dst), (src), sizeof(dst)); \
2450 } while (0)
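/*
 * MEMCPY_SAME_SIZE is a local convenience: the BUILD_BUG_ON() turns a
 * future size mismatch between the KVM uAPI fields and the TD_PARAMS
 * fields (e.g. if a measurement field ever changes width) into a compile
 * error instead of a silent truncation or overread.
 */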
2451
2452 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2453 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2454 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2455
2456 return 0;
2457 }
2458
__tdx_td_init(struct kvm * kvm,struct td_params * td_params,u64 * seamcall_err)2459 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2460 u64 *seamcall_err)
2461 {
2462 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2463 cpumask_var_t packages;
2464 struct page **tdcs_pages = NULL;
2465 struct page *tdr_page;
2466 int ret, i;
2467 u64 err, rcx;
2468
2469 *seamcall_err = 0;
2470 ret = tdx_guest_keyid_alloc();
2471 if (ret < 0)
2472 return ret;
2473 kvm_tdx->hkid = ret;
2474 kvm_tdx->misc_cg = get_current_misc_cg();
2475 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2476 if (ret)
2477 goto free_hkid;
2478
2479 ret = -ENOMEM;
2480
2481 atomic_inc(&nr_configured_hkid);
2482
2483 tdr_page = alloc_page(GFP_KERNEL);
2484 if (!tdr_page)
2485 goto free_hkid;
2486
2487 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2488 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2489 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2490 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2491 GFP_KERNEL);
2492 if (!tdcs_pages)
2493 goto free_tdr;
2494
2495 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2496 tdcs_pages[i] = alloc_page(GFP_KERNEL);
2497 if (!tdcs_pages[i])
2498 goto free_tdcs;
2499 }
2500
2501 if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2502 goto free_tdcs;
2503
2504 cpus_read_lock();
2505
2506 /*
2507 * At least one CPU in each package must be online in order to
2508 * program the host key ID on all packages. Check that here.
2509 */
2510 for_each_present_cpu(i)
2511 cpumask_set_cpu(topology_physical_package_id(i), packages);
2512 for_each_online_cpu(i)
2513 cpumask_clear_cpu(topology_physical_package_id(i), packages);
2514 if (!cpumask_empty(packages)) {
2515 ret = -EIO;
2516 /*
2517 * Because it's hard for a human operator to figure out the
2518 * reason, print a warning.
2519 */
2520 #define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2521 pr_warn_ratelimited(MSG_ALLPKG);
2522 goto free_packages;
2523 }
2524
2525 /*
2526 * TDH.MNG.CREATE tries to grab the global TDX module lock and
2527 * fails with TDX_OPERAND_BUSY when it can't. Take the global
2528 * lock to prevent that failure.
2529 */
2530 mutex_lock(&tdx_lock);
2531 kvm_tdx->td.tdr_page = tdr_page;
2532 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2533 mutex_unlock(&tdx_lock);
2534
2535 if (err == TDX_RND_NO_ENTROPY) {
2536 ret = -EAGAIN;
2537 goto free_packages;
2538 }
2539
2540 if (WARN_ON_ONCE(err)) {
2541 pr_tdx_error(TDH_MNG_CREATE, err);
2542 ret = -EIO;
2543 goto free_packages;
2544 }
2545
2546 for_each_online_cpu(i) {
2547 int pkg = topology_physical_package_id(i);
2548
2549 if (cpumask_test_and_set_cpu(pkg, packages))
2550 continue;
2551
2552 /*
2553 * Program the memory controller in the package with an
2554 * encryption key associated with the TDX private host key ID
2555 * assigned to this TDR. Concurrent operations on the same memory
2556 * controller result in TDX_OPERAND_BUSY. No locking is needed
2557 * beyond the cpus_read_lock() above as it serializes against
2558 * hotplug and the first online CPU of the package is always
2559 * used. We never have two CPUs in the same socket trying to
2560 * program the key.
2561 */
2562 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2563 kvm_tdx, true);
2564 if (ret)
2565 break;
2566 }
2567 cpus_read_unlock();
2568 free_cpumask_var(packages);
2569 if (ret) {
2570 i = 0;
2571 goto teardown;
2572 }
2573
2574 kvm_tdx->td.tdcs_pages = tdcs_pages;
2575 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2576 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2577 if (err == TDX_RND_NO_ENTROPY) {
2578 /* Here it's hard to allow userspace to retry. */
2579 ret = -EAGAIN;
2580 goto teardown;
2581 }
2582 if (WARN_ON_ONCE(err)) {
2583 pr_tdx_error(TDH_MNG_ADDCX, err);
2584 ret = -EIO;
2585 goto teardown;
2586 }
2587 }
2588
2589 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2590 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2591 /*
2592 * Because the operands come from userspace, don't warn.
2593 * Return a hint to the user because it's sometimes hard for the
2594 * user to figure out which operand is invalid. The SEAMCALL
2595 * status code encodes which operand caused the invalid operand error.
2596 */
2597 *seamcall_err = err;
2598 ret = -EINVAL;
2599 goto teardown;
2600 } else if (WARN_ON_ONCE(err)) {
2601 pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2602 ret = -EIO;
2603 goto teardown;
2604 }
2605
2606 return 0;
2607
2608 /*
2609 * The sequence for freeing resources from a partially initialized TD
2610 * varies based on where in the initialization flow failure occurred.
2611 * Simply use the full teardown and destroy, which naturally play nice
2612 * with partial initialization.
2613 */
2614 teardown:
2615 /* Only free pages not yet added, so start at 'i' */
2616 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2617 if (tdcs_pages[i]) {
2618 __free_page(tdcs_pages[i]);
2619 tdcs_pages[i] = NULL;
2620 }
2621 }
2622 if (!kvm_tdx->td.tdcs_pages)
2623 kfree(tdcs_pages);
2624
2625 tdx_mmu_release_hkid(kvm);
2626 tdx_reclaim_td_control_pages(kvm);
2627
2628 return ret;
2629
2630 free_packages:
2631 cpus_read_unlock();
2632 free_cpumask_var(packages);
2633
2634 free_tdcs:
2635 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2636 if (tdcs_pages[i])
2637 __free_page(tdcs_pages[i]);
2638 }
2639 kfree(tdcs_pages);
2640 kvm_tdx->td.tdcs_pages = NULL;
2641
2642 free_tdr:
2643 if (tdr_page)
2644 __free_page(tdr_page);
2645 kvm_tdx->td.tdr_page = 0;
2646
2647 free_hkid:
2648 tdx_hkid_free(kvm_tdx);
2649
2650 return ret;
2651 }
2652
tdx_td_metadata_field_read(struct kvm_tdx * tdx,u64 field_id,u64 * data)2653 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2654 u64 *data)
2655 {
2656 u64 err;
2657
2658 err = tdh_mng_rd(&tdx->td, field_id, data);
2659
2660 return err;
2661 }
2662
2663 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2664 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2665
tdx_read_cpuid(struct kvm_vcpu * vcpu,u32 leaf,u32 sub_leaf,bool sub_leaf_set,int * entry_index,struct kvm_cpuid_entry2 * out)2666 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2667 bool sub_leaf_set, int *entry_index,
2668 struct kvm_cpuid_entry2 *out)
2669 {
2670 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2671 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2672 u64 ebx_eax, edx_ecx;
2673 u64 err = 0;
2674
2675 if (sub_leaf > 0b1111111)
2676 return -EINVAL;
2677
2678 if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2679 return -EINVAL;
2680
2681 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2682 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2683 return -EINVAL;
2684
2685 /*
2686 * bits 23:17, RESERVED: reserved, must be 0;
2687 * bit 16, LEAF_31: leaf number bit 31;
2688 * bits 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2689 * implicitly 0;
2690 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2691 * bits 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2692 * SUBLEAF_6_0 is all-ones.
2693 * Sub-leaf bits 31:7 are implicitly 0;
2694 * bit 0, ELEMENT_I: element index within the field;
2695 */
2696 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2697 field_id |= (leaf & 0x7f) << 9;
2698 if (sub_leaf_set)
2699 field_id |= (sub_leaf & 0x7f) << 1;
2700 else
2701 field_id |= 0x1fe;
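/*
 * Worked example (illustrative only, following the encoding above):
 * for leaf 0x80000008 with no sub-leaf, LEAF_31 = 1 (bit 16),
 * LEAF_6_0 = 0x08 (bits 15:9), and SUBLEAF_NA plus SUBLEAF_6_0 are
 * all-ones via the 0x1fe case above, i.e.
 *
 *	field_id |= (1 << 16) | (0x08 << 9) | 0x1fe;
 */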
2702
2703 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2704 if (err) //TODO check for specific errors
2705 goto err_out;
2706
2707 out->eax = (u32) ebx_eax;
2708 out->ebx = (u32) (ebx_eax >> 32);
2709
2710 field_id++;
2711 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2712 /*
2713 * It would be odd for reading edx_ecx to fail when reading
2714 * ebx_eax succeeded.
2715 */
2716 if (WARN_ON_ONCE(err))
2717 goto err_out;
2718
2719 out->ecx = (u32) edx_ecx;
2720 out->edx = (u32) (edx_ecx >> 32);
2721
2722 out->function = leaf;
2723 out->index = sub_leaf;
2724 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2725
2726 /*
2727 * Work around missing support on old TDX modules, fetch
2728 * guest maxpa from gfn_direct_bits.
2729 */
2730 if (leaf == 0x80000008) {
2731 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2732 unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2733
2734 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2735 }
2736
2737 (*entry_index)++;
2738
2739 return 0;
2740
2741 err_out:
2742 out->eax = 0;
2743 out->ebx = 0;
2744 out->ecx = 0;
2745 out->edx = 0;
2746
2747 return -EIO;
2748 }
2749
tdx_td_init(struct kvm * kvm,struct kvm_tdx_cmd * cmd)2750 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2751 {
2752 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2753 struct kvm_tdx_init_vm *init_vm;
2754 struct td_params *td_params = NULL;
2755 int ret;
2756
2757 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2758 BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2759
2760 if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2761 return -EINVAL;
2762
2763 if (cmd->flags)
2764 return -EINVAL;
2765
2766 init_vm = kmalloc(sizeof(*init_vm) +
2767 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2768 GFP_KERNEL);
2769 if (!init_vm)
2770 return -ENOMEM;
2771
2772 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2773 ret = -EFAULT;
2774 goto out;
2775 }
2776
2777 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2778 ret = -E2BIG;
2779 goto out;
2780 }
2781
2782 if (copy_from_user(init_vm->cpuid.entries,
2783 u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2784 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2785 ret = -EFAULT;
2786 goto out;
2787 }
2788
2789 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2790 ret = -EINVAL;
2791 goto out;
2792 }
2793
2794 if (init_vm->cpuid.padding) {
2795 ret = -EINVAL;
2796 goto out;
2797 }
2798
2799 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2800 if (!td_params) {
2801 ret = -ENOMEM;
2802 goto out;
2803 }
2804
2805 ret = setup_tdparams(kvm, td_params, init_vm);
2806 if (ret)
2807 goto out;
2808
2809 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2810 if (ret)
2811 goto out;
2812
2813 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2814 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2815 kvm_tdx->attributes = td_params->attributes;
2816 kvm_tdx->xfam = td_params->xfam;
2817
2818 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2819 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2820 else
2821 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2822
2823 kvm_tdx->state = TD_STATE_INITIALIZED;
2824 out:
2825 /* kfree() accepts NULL. */
2826 kfree(init_vm);
2827 kfree(td_params);
2828
2829 return ret;
2830 }
2831
tdx_flush_tlb_current(struct kvm_vcpu * vcpu)2832 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2833 {
2834 /*
2835 * flush_tlb_current() is invoked the first time the vCPU runs or when
2836 * the root of the shared EPT is invalidated.
2837 * KVM only needs to flush the shared EPT because the TDX module handles
2838 * TLB invalidation for the private EPT in tdh_vp_enter().
2839 *
2840 * A single-context invalidation for the shared EPT could be performed
2841 * here. However, that single-context invalidation requires the private
2842 * EPTP rather than the shared EPTP to flush the shared EPT, as the shared
2843 * EPT uses the private EPTP as its ASID for TLB invalidation.
2844 *
2845 * To avoid reading back the private EPTP, perform a global invalidation
2846 * for the shared EPT instead to keep this function simple.
2847 */
2848 ept_sync_global();
2849 }
2850
tdx_flush_tlb_all(struct kvm_vcpu * vcpu)2851 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2852 {
2853 /*
2854 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2855 * ensure that private EPT will be flushed on the next TD enter. No need
2856 * to call tdx_track() here again even when this callback is a result of
2857 * zapping private EPT.
2858 *
2859 * Due to the lack of the context to determine which EPT has been
2860 * affected by zapping, invoke invept() directly here for both shared
2861 * EPT and private EPT for simplicity, though it's not necessary for
2862 * private EPT.
2863 */
2864 ept_sync_global();
2865 }
2866
tdx_td_finalize(struct kvm * kvm,struct kvm_tdx_cmd * cmd)2867 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2868 {
2869 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2870
2871 guard(mutex)(&kvm->slots_lock);
2872
2873 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2874 return -EINVAL;
2875 /*
2876 * Pages are still pending: KVM_TDX_INIT_MEM_REGION has yet to
2877 * issue TDH.MEM.PAGE.ADD() for them.
2878 */
2879 if (atomic64_read(&kvm_tdx->nr_premapped))
2880 return -EINVAL;
2881
2882 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2883 if (tdx_operand_busy(cmd->hw_error))
2884 return -EBUSY;
2885 if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2886 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2887 return -EIO;
2888 }
2889
2890 kvm_tdx->state = TD_STATE_RUNNABLE;
2891 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2892 smp_wmb();
2893 kvm->arch.pre_fault_allowed = true;
2894 return 0;
2895 }
2896
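/*
 * Typical userspace ordering, as enforced by the state checks in the
 * handlers above (an illustrative summary, not extra ABI):
 *
 *	KVM_TDX_CAPABILITIES			(optional, any time)
 *	KVM_TDX_INIT_VM				TD_STATE_UNINITIALIZED -> INITIALIZED
 *	KVM_CREATE_VCPU + KVM_TDX_INIT_VCPU	per vCPU, after INIT_VM
 *	KVM_TDX_INIT_MEM_REGION			per-vCPU ioctl, before finalize
 *	KVM_TDX_FINALIZE_VM			TD_STATE_INITIALIZED -> RUNNABLE
 */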
tdx_vm_ioctl(struct kvm * kvm,void __user * argp)2897 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2898 {
2899 struct kvm_tdx_cmd tdx_cmd;
2900 int r;
2901
2902 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2903 return -EFAULT;
2904
2905 /*
2906 * Userspace should never set hw_error. It is used by the kernel
2907 * to report hardware-defined errors.
2908 */
2909 if (tdx_cmd.hw_error)
2910 return -EINVAL;
2911
2912 mutex_lock(&kvm->lock);
2913
2914 switch (tdx_cmd.id) {
2915 case KVM_TDX_CAPABILITIES:
2916 r = tdx_get_capabilities(&tdx_cmd);
2917 break;
2918 case KVM_TDX_INIT_VM:
2919 r = tdx_td_init(kvm, &tdx_cmd);
2920 break;
2921 case KVM_TDX_FINALIZE_VM:
2922 r = tdx_td_finalize(kvm, &tdx_cmd);
2923 break;
2924 default:
2925 r = -EINVAL;
2926 goto out;
2927 }
2928
2929 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2930 r = -EFAULT;
2931
2932 out:
2933 mutex_unlock(&kvm->lock);
2934 return r;
2935 }
2936
2937 /* The VMM can pass one 64-bit auxiliary value to the vCPU via RCX for the guest BIOS. */
tdx_td_vcpu_init(struct kvm_vcpu * vcpu,u64 vcpu_rcx)2938 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2939 {
2940 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2941 struct vcpu_tdx *tdx = to_tdx(vcpu);
2942 struct page *page;
2943 int ret, i;
2944 u64 err;
2945
2946 page = alloc_page(GFP_KERNEL);
2947 if (!page)
2948 return -ENOMEM;
2949 tdx->vp.tdvpr_page = page;
2950
2951 /*
2952 * page_to_phys() does not work in 'noinstr' code, like guest
2953 * entry via tdh_vp_enter(). Precalculate and store it instead
2954 * of doing it at runtime later.
2955 */
2956 tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
2957
2958 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2959 GFP_KERNEL);
2960 if (!tdx->vp.tdcx_pages) {
2961 ret = -ENOMEM;
2962 goto free_tdvpr;
2963 }
2964
2965 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2966 page = alloc_page(GFP_KERNEL);
2967 if (!page) {
2968 ret = -ENOMEM;
2969 goto free_tdcx;
2970 }
2971 tdx->vp.tdcx_pages[i] = page;
2972 }
2973
2974 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2975 if (KVM_BUG_ON(err, vcpu->kvm)) {
2976 ret = -EIO;
2977 pr_tdx_error(TDH_VP_CREATE, err);
2978 goto free_tdcx;
2979 }
2980
2981 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2982 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2983 if (KVM_BUG_ON(err, vcpu->kvm)) {
2984 pr_tdx_error(TDH_VP_ADDCX, err);
2985 /*
2986 * Pages already added are reclaimed by the vcpu_free
2987 * method, but the rest are freed here.
2988 */
2989 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2990 __free_page(tdx->vp.tdcx_pages[i]);
2991 tdx->vp.tdcx_pages[i] = NULL;
2992 }
2993 return -EIO;
2994 }
2995 }
2996
2997 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2998 if (KVM_BUG_ON(err, vcpu->kvm)) {
2999 pr_tdx_error(TDH_VP_INIT, err);
3000 return -EIO;
3001 }
3002
3003 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3004
3005 return 0;
3006
3007 free_tdcx:
3008 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
3009 if (tdx->vp.tdcx_pages[i])
3010 __free_page(tdx->vp.tdcx_pages[i]);
3011 tdx->vp.tdcx_pages[i] = NULL;
3012 }
3013 kfree(tdx->vp.tdcx_pages);
3014 tdx->vp.tdcx_pages = NULL;
3015
3016 free_tdvpr:
3017 if (tdx->vp.tdvpr_page)
3018 __free_page(tdx->vp.tdvpr_page);
3019 tdx->vp.tdvpr_page = 0;
3020 tdx->vp.tdvpr_pa = 0;
3021
3022 return ret;
3023 }
3024
3025 /* Sometimes reads multiple sub-leaves. Entries written are tracked via *entry_index. */
tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu * vcpu,u32 leaf,int * entry_index,struct kvm_cpuid_entry2 * output_e)3026 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3027 struct kvm_cpuid_entry2 *output_e)
3028 {
3029 int sub_leaf = 0;
3030 int ret;
3031
3032 /* First try without a subleaf */
3033 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3034
3035 /* If success, or invalid leaf, just give up */
3036 if (ret != -EIO)
3037 return ret;
3038
3039 /*
3040 * If the try without a subleaf failed, try reading subleafs until
3041 * failure. The TDX module only supports 6 bits of subleaf index.
3042 */
3043 while (1) {
3044 /* Keep reading subleafs until there is a failure. */
3045 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3046 return !sub_leaf;
3047
3048 sub_leaf++;
3049 output_e++;
3050 }
3051
3052 return 0;
3053 }
3054
tdx_vcpu_get_cpuid(struct kvm_vcpu * vcpu,struct kvm_tdx_cmd * cmd)3055 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3056 {
3057 struct kvm_cpuid2 __user *output, *td_cpuid;
3058 int r = 0, i = 0, leaf;
3059 u32 level;
3060
3061 output = u64_to_user_ptr(cmd->data);
3062 td_cpuid = kzalloc(sizeof(*td_cpuid) +
3063 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3064 GFP_KERNEL);
3065 if (!td_cpuid)
3066 return -ENOMEM;
3067
3068 if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3069 r = -EFAULT;
3070 goto out;
3071 }
3072
3073 /* Read max CPUID for normal range */
3074 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3075 r = -EIO;
3076 goto out;
3077 }
3078 level = td_cpuid->entries[0].eax;
3079
3080 for (leaf = 1; leaf <= level; leaf++)
3081 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3082
3083 /* Read max CPUID for extended range */
3084 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3085 r = -EIO;
3086 goto out;
3087 }
3088 level = td_cpuid->entries[i - 1].eax;
3089
3090 for (leaf = 0x80000001; leaf <= level; leaf++)
3091 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3092
3093 if (td_cpuid->nent < i)
3094 r = -E2BIG;
3095 td_cpuid->nent = i;
3096
3097 if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3098 r = -EFAULT;
3099 goto out;
3100 }
3101
3102 if (r == -E2BIG)
3103 goto out;
3104
3105 if (copy_to_user(output->entries, td_cpuid->entries,
3106 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3107 r = -EFAULT;
3108
3109 out:
3110 kfree(td_cpuid);
3111
3112 return r;
3113 }
3114
tdx_vcpu_init(struct kvm_vcpu * vcpu,struct kvm_tdx_cmd * cmd)3115 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3116 {
3117 u64 apic_base;
3118 struct vcpu_tdx *tdx = to_tdx(vcpu);
3119 int ret;
3120
3121 if (cmd->flags)
3122 return -EINVAL;
3123
3124 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3125 return -EINVAL;
3126
3127 /*
3128 * TDX requires x2APIC; userspace is responsible for configuring the
3129 * guest CPUID accordingly.
3130 */
3131 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3132 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3133 if (kvm_apic_set_base(vcpu, apic_base, true))
3134 return -EINVAL;
3135
3136 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3137 if (ret)
3138 return ret;
3139
3140 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3141 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3142 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3143
3144 tdx->state = VCPU_TD_STATE_INITIALIZED;
3145
3146 return 0;
3147 }
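/*
 * Note on the VMCS writes above (descriptive only): POSTED_INTR_NV,
 * POSTED_INTR_DESC_ADDR and PIN_BASED_POSTED_INTR wire the vCPU's posted
 * interrupt descriptor into the TD's VMCS, matching the posted-interrupt-only
 * delivery model used by tdx_deliver_interrupt().
 */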
3148
tdx_vcpu_reset(struct kvm_vcpu * vcpu,bool init_event)3149 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3150 {
3151 /*
3152 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3153 * INIT events.
3154 *
3155 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3156 * userspace needs to define the vCPU model before KVM can initialize
3157 * vCPU state, e.g. to enable x2APIC.
3158 */
3159 WARN_ON_ONCE(init_event);
3160 }
3161
3162 struct tdx_gmem_post_populate_arg {
3163 struct kvm_vcpu *vcpu;
3164 __u32 flags;
3165 };
3166
tdx_gmem_post_populate(struct kvm * kvm,gfn_t gfn,kvm_pfn_t pfn,void __user * src,int order,void * _arg)3167 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3168 void __user *src, int order, void *_arg)
3169 {
3170 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3171 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3172 struct tdx_gmem_post_populate_arg *arg = _arg;
3173 struct kvm_vcpu *vcpu = arg->vcpu;
3174 gpa_t gpa = gfn_to_gpa(gfn);
3175 u8 level = PG_LEVEL_4K;
3176 struct page *src_page;
3177 int ret, i;
3178 u64 err, entry, level_state;
3179
3180 /*
3181 * Get the source page if it has been faulted in. Return failure if the
3182 * source page has been swapped out or unmapped in primary memory.
3183 */
3184 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3185 if (ret < 0)
3186 return ret;
3187 if (ret != 1)
3188 return -ENOMEM;
3189
3190 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3191 if (ret < 0)
3192 goto out;
3193
3194 /*
3195 * The private mem cannot be zapped after kvm_tdp_map_page()
3196 * because all paths are covered by slots_lock and the
3197 * filemap invalidate lock. Check that they are indeed enough.
3198 */
3199 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3200 scoped_guard(read_lock, &kvm->mmu_lock) {
3201 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3202 ret = -EIO;
3203 goto out;
3204 }
3205 }
3206 }
3207
3208 ret = 0;
3209 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3210 src_page, &entry, &level_state);
3211 if (err) {
3212 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3213 goto out;
3214 }
3215
3216 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3217 atomic64_dec(&kvm_tdx->nr_premapped);
3218
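/*
 * If requested, fold the page contents into the TD measurement.
 * TDH.MR.EXTEND works on TDX_EXTENDMR_CHUNKSIZE-byte chunks, so the loop
 * below walks the 4K page one chunk at a time; any failure is treated as
 * fatal for this populate operation.
 */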
3219 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
3220 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3221 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3222 &level_state);
3223 if (err) {
3224 ret = -EIO;
3225 break;
3226 }
3227 }
3228 }
3229
3230 out:
3231 put_page(src_page);
3232 return ret;
3233 }
3234
tdx_vcpu_init_mem_region(struct kvm_vcpu * vcpu,struct kvm_tdx_cmd * cmd)3235 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3236 {
3237 struct vcpu_tdx *tdx = to_tdx(vcpu);
3238 struct kvm *kvm = vcpu->kvm;
3239 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3240 struct kvm_tdx_init_mem_region region;
3241 struct tdx_gmem_post_populate_arg arg;
3242 long gmem_ret;
3243 int ret;
3244
3245 if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3246 return -EINVAL;
3247
3248 guard(mutex)(&kvm->slots_lock);
3249
3250 /* Once TD is finalized, the initial guest memory is fixed. */
3251 if (kvm_tdx->state == TD_STATE_RUNNABLE)
3252 return -EINVAL;
3253
3254 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3255 return -EINVAL;
3256
3257 if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3258 return -EFAULT;
3259
3260 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3261 !region.nr_pages ||
3262 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3263 !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3264 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3265 return -EINVAL;
3266
3267 kvm_mmu_reload(vcpu);
3268 ret = 0;
3269 while (region.nr_pages) {
3270 if (signal_pending(current)) {
3271 ret = -EINTR;
3272 break;
3273 }
3274
3275 arg = (struct tdx_gmem_post_populate_arg) {
3276 .vcpu = vcpu,
3277 .flags = cmd->flags,
3278 };
3279 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3280 u64_to_user_ptr(region.source_addr),
3281 1, tdx_gmem_post_populate, &arg);
3282 if (gmem_ret < 0) {
3283 ret = gmem_ret;
3284 break;
3285 }
3286
3287 if (gmem_ret != 1) {
3288 ret = -EIO;
3289 break;
3290 }
3291
3292 region.source_addr += PAGE_SIZE;
3293 region.gpa += PAGE_SIZE;
3294 region.nr_pages--;
3295
3296 cond_resched();
3297 }
3298
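/*
 * Copy the updated region back so userspace can observe progress:
 * source_addr/gpa have been advanced and nr_pages decremented for every
 * page successfully added, which also allows a clean resume after -EINTR.
 */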
3299 if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3300 ret = -EFAULT;
3301 return ret;
3302 }
3303
tdx_vcpu_ioctl(struct kvm_vcpu * vcpu,void __user * argp)3304 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3305 {
3306 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3307 struct kvm_tdx_cmd cmd;
3308 int ret;
3309
3310 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3311 return -EINVAL;
3312
3313 if (copy_from_user(&cmd, argp, sizeof(cmd)))
3314 return -EFAULT;
3315
3316 if (cmd.hw_error)
3317 return -EINVAL;
3318
3319 switch (cmd.id) {
3320 case KVM_TDX_INIT_VCPU:
3321 ret = tdx_vcpu_init(vcpu, &cmd);
3322 break;
3323 case KVM_TDX_INIT_MEM_REGION:
3324 ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3325 break;
3326 case KVM_TDX_GET_CPUID:
3327 ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3328 break;
3329 default:
3330 ret = -EINVAL;
3331 break;
3332 }
3333
3334 return ret;
3335 }
3336
tdx_gmem_max_mapping_level(struct kvm * kvm,kvm_pfn_t pfn,bool is_private)3337 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3338 {
3339 if (!is_private)
3340 return 0;
3341
3342 return PG_LEVEL_4K;
3343 }
3344
tdx_online_cpu(unsigned int cpu)3345 static int tdx_online_cpu(unsigned int cpu)
3346 {
3347 unsigned long flags;
3348 int r;
3349
3350 /* Sanity check CPU is already in post-VMXON */
3351 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3352
3353 local_irq_save(flags);
3354 r = tdx_cpu_enable();
3355 local_irq_restore(flags);
3356
3357 return r;
3358 }
3359
tdx_offline_cpu(unsigned int cpu)3360 static int tdx_offline_cpu(unsigned int cpu)
3361 {
3362 int i;
3363
3364 /* No TD is running. Allow any CPU to be offlined. */
3365 if (!atomic_read(&nr_configured_hkid))
3366 return 0;
3367
3368 /*
3369 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD), KVM
3370 * needs to call TDH.PHYMEM.PAGE.WBINVD on all packages to program every
3371 * memory controller with pconfig. If there are active TDX HKIDs, refuse
3372 * to offline the last online CPU of a package.
3373 */
3374 for_each_online_cpu(i) {
3375 /*
3376 * Found another online cpu on the same package.
3377 * Allow to offline.
3378 */
3379 if (i != cpu && topology_physical_package_id(i) ==
3380 topology_physical_package_id(cpu))
3381 return 0;
3382 }
3383
3384 /*
3385 * This is the last online CPU of this package. Don't offline it.
3386 *
3387 * Because it's hard for a human operator to understand the
3388 * reason, print a warning.
3389 */
3390 #define MSG_ALLPKG_ONLINE \
3391 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3392 pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3393 return -EBUSY;
3394 }
3395
__do_tdx_cleanup(void)3396 static void __do_tdx_cleanup(void)
3397 {
3398 /*
3399 * Once the TDX module is initialized, it cannot be disabled and
3400 * re-initialized without a runtime update (which isn't
3401 * supported by the kernel). Only the cpuhp state needs to be
3402 * removed here. The TDX host core code tracks TDX status and
3403 * can handle the 'multiple enabling' scenario.
3404 */
3405 WARN_ON_ONCE(!tdx_cpuhp_state);
3406 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3407 tdx_cpuhp_state = 0;
3408 }
3409
__tdx_cleanup(void)3410 static void __tdx_cleanup(void)
3411 {
3412 cpus_read_lock();
3413 __do_tdx_cleanup();
3414 cpus_read_unlock();
3415 }
3416
__do_tdx_bringup(void)3417 static int __init __do_tdx_bringup(void)
3418 {
3419 int r;
3420
3421 /*
3422 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3423 * online CPUs before calling tdx_enable(), and on any new
3424 * going-online CPU to make sure it is ready for TDX guest.
3425 */
3426 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3427 "kvm/cpu/tdx:online",
3428 tdx_online_cpu, tdx_offline_cpu);
3429 if (r < 0)
3430 return r;
3431
3432 tdx_cpuhp_state = r;
3433
3434 r = tdx_enable();
3435 if (r)
3436 __do_tdx_cleanup();
3437
3438 return r;
3439 }
3440
__tdx_bringup(void)3441 static int __init __tdx_bringup(void)
3442 {
3443 const struct tdx_sys_info_td_conf *td_conf;
3444 int r, i;
3445
3446 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3447 /*
3448 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3449 * before returning to user space.
3450 *
3451 * this_cpu_ptr(user_return_msrs)->registered isn't checked
3452 * because the registration is done at vcpu runtime by
3453 * tdx_user_return_msr_update_cache().
3454 */
3455 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3456 if (tdx_uret_msrs[i].slot == -1) {
3457 /* If any MSR isn't supported, it is a KVM bug */
3458 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3459 tdx_uret_msrs[i].msr);
3460 return -EIO;
3461 }
3462 }
3463
3464 /*
3465 * Enabling TDX requires enabling hardware virtualization first,
3466 * as making SEAMCALLs requires CPU being in post-VMXON state.
3467 */
3468 r = kvm_enable_virtualization();
3469 if (r)
3470 return r;
3471
3472 cpus_read_lock();
3473 r = __do_tdx_bringup();
3474 cpus_read_unlock();
3475
3476 if (r)
3477 goto tdx_bringup_err;
3478
3479 r = -EINVAL;
3480 /* Get TDX global information for later use */
3481 tdx_sysinfo = tdx_get_sysinfo();
3482 if (WARN_ON_ONCE(!tdx_sysinfo))
3483 goto get_sysinfo_err;
3484
3485 /* Check TDX module and KVM capabilities */
3486 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3487 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3488 goto get_sysinfo_err;
3489
3490 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3491 goto get_sysinfo_err;
3492
3493 /*
3494 * TDX has its own limit on the maximum number of vCPUs it can
3495 * support for all TDX guests, in addition to KVM_MAX_VCPUS.
3496 * Userspace needs to query a TDX guest's maximum vCPUs by checking
3497 * the KVM_CAP_MAX_VCPUS extension on a per-VM basis.
3498 *
3499 * The TDX module reports this limit via the MAX_VCPU_PER_TD global
3500 * metadata. Different modules may report different values.
3501 * Some old modules may not support this metadata at all (in which
3502 * case the limit is U16_MAX).
3503 *
3504 * In practice, the reported value reflects the maximum logical
3505 * CPUs that ALL the platforms that the module supports can
3506 * possibly have.
3507 *
3508 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3509 * result in an unpredictable ABI. KVM instead always advertise
3510 * the number of logical CPUs the platform has as the maximum
3511 * vCPUs for TDX guests.
3512 *
3513 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3514 * smaller than the number of logical CPUs, otherwise KVM will
3515 * report an unsupported value to userspace.
3516 *
3517 * Note, a platform with TDX enabled in the BIOS cannot support
3518 * physical CPU hotplug, and TDX requires the BIOS has marked
3519 * all logical CPUs in MADT table as enabled. Just use
3520 * num_present_cpus() for the number of logical CPUs.
3521 */
3522 td_conf = &tdx_sysinfo->td_conf;
3523 if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3524 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3525 td_conf->max_vcpus_per_td, num_present_cpus());
3526 goto get_sysinfo_err;
3527 }
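
	/*
	 * Illustrative userspace query (not KVM code): after creating a
	 * TDX VM, the effective per-VM vCPU limit described above can be
	 * read with
	 *
	 *	ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	 *
	 * The check above guarantees that the advertised limit is always
	 * backed by the TDX module's MAX_VCPU_PER_TD.
	 */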

	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
		goto get_sysinfo_err;

	/*
	 * Leave hardware virtualization enabled after TDX is enabled
	 * successfully.  TDX CPU hotplug depends on this.
	 */
	return 0;

get_sysinfo_err:
	__tdx_cleanup();
tdx_bringup_err:
	kvm_disable_virtualization();
	return r;
}

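/*
 * Undo what __tdx_bringup() set up: drop the misc cgroup capacity for
 * TDX key IDs, remove the TDX cpuhp state, and release the hardware
 * virtualization enabling taken at bringup.
 */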
void tdx_cleanup(void)
{
	if (enable_tdx) {
		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
		__tdx_cleanup();
		kvm_disable_virtualization();
	}
}

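/*
 * Module-init entry point for TDX support.  Checks the kernel and CPU
 * prerequisites and, if any is missing or the TDX module turns out not
 * to be loaded, disables TDX while still letting KVM load; only an
 * unexpected bringup failure is propagated as an error.
 */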
int __init tdx_bringup(void)
{
	int r, i;

	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
	for_each_possible_cpu(i)
		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));

	if (!enable_tdx)
		return 0;

	if (!enable_ept) {
		pr_err("EPT is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
		pr_err("TDP MMU, MMIO caching and EPT A/D bits are required for TDX\n");
		goto success_disable_tdx;
	}

	if (!enable_apicv) {
		pr_err("APICv is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
		pr_err("tdx: OSXSAVE is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
		pr_err("tdx: MOVDIR64B is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
		pr_err("Self-snoop is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
		pr_err("tdx: no TDX private KeyIDs available\n");
		goto success_disable_tdx;
	}

	if (!enable_virt_at_load) {
		pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
		goto success_disable_tdx;
	}
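
	/*
	 * For reference, a host configuration that satisfies the checks
	 * above might look like the following (illustrative only; module
	 * names assume this file is built into kvm_intel):
	 *
	 *	modprobe kvm enable_virt_at_load=1
	 *	modprobe kvm_intel tdx=1
	 */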

	/*
	 * Ideally KVM should probe whether the TDX module has been loaded
	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
	 * to probe whether the module is loaded (there is no CPUID or MSR
	 * for that), and making a SEAMCALL requires enabling virtualization
	 * first, just like the remaining steps of bringing up the TDX
	 * module.
	 *
	 * So, for simplicity, do everything in __tdx_bringup(); the first
	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
	 * only complication is having to make sure that initialization
	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
	 * cases.
	 */
	r = __tdx_bringup();
	if (r) {
		/*
		 * Disable TDX, but don't fail loading the KVM module, if
		 * the TDX module could not be loaded.  There is no need to
		 * print a message saying "module is not loaded" because one
		 * was already printed when the first SEAMCALL failed.  Don't
		 * bother unwinding the S-EPT hooks or vm_size, as kvm_x86_ops
		 * have already been finalized (and are intentionally not
		 * exported).  The S-EPT code is unreachable, and allocating a
		 * few more bytes per VM in a should-be-rare failure scenario
		 * is a non-issue.
		 */
		if (r == -ENODEV)
			goto success_disable_tdx;

		enable_tdx = 0;
	}

	return r;

success_disable_tdx:
	enable_tdx = 0;
	return 0;
}

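/*
 * Hardware-setup hook for TDX: size the VM structure for struct kvm_tdx
 * and wire the Secure-EPT management and protected-APIC interrupt
 * callbacks into vt_x86_ops so the common MMU and interrupt code can
 * reach the TDX-specific implementations.
 */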
void __init tdx_hardware_setup(void)
{
	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);

	/*
	 * Note, if the TDX module can't be loaded, KVM TDX support will be
	 * disabled but KVM will continue loading (see tdx_bringup()).
	 */
	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));

	vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}
