1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21
22 #pragma GCC poison to_vmx
23
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27 #define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
28 ({ \
29 struct kvm *_kvm = (__kvm); \
30 bool __ret = !!(__err); \
31 \
32 if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
33 if (_kvm) \
34 kvm_vm_bugged(_kvm); \
35 pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
36 __err, __args); \
37 } \
38 unlikely(__ret); \
39 })
40
41 #define TDX_BUG_ON(__err, __fn, __kvm) \
42 __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
43
44 #define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
45 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
46
47 #define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
48 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
49
50 #define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
51 __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
52 a1, a2, a3)
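/*
 * Example usage (see tdx_mem_page_add() below): extra SEAMCALL output
 * operands are included in the ratelimited error message, e.g.
 *
 *	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
 *		return -EIO;
 */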
53
54
55 bool enable_tdx __ro_after_init;
56 module_param_named(tdx, enable_tdx, bool, 0444);
57
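/*
 * GFN of the "shared" GPA bit: the top GPA bit, i.e. bit 51 with 5-level EPT
 * (PWL 5) and bit 47 with 4-level EPT (PWL 4).
 */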
58 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
59 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
60
61 static enum cpuhp_state tdx_cpuhp_state;
62
63 static const struct tdx_sys_info *tdx_sysinfo;
64
65 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
66 {
67 KVM_BUG_ON(1, tdx->vcpu.kvm);
68 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
69 }
70
71 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
72 u64 val, u64 err)
73 {
74 KVM_BUG_ON(1, tdx->vcpu.kvm);
75 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
76 }
77
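/* TD attributes that KVM supports exposing to a TDX guest. */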
78 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
79
80 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
81 {
82 return container_of(kvm, struct kvm_tdx, kvm);
83 }
84
85 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
86 {
87 return container_of(vcpu, struct vcpu_tdx, vcpu);
88 }
89
90 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
91 {
92 u64 val = KVM_SUPPORTED_TD_ATTRS;
93
94 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
95 return 0;
96
97 val &= td_conf->attributes_fixed0;
98
99 return val;
100 }
101
102 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
103 {
104 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
105
106 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
107 return 0;
108
109 val &= td_conf->xfam_fixed0;
110
111 return val;
112 }
113
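/*
 * Helpers for the guest physical address bits field, i.e. bits 23:16 of
 * CPUID.0x80000008:EAX.
 */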
114 static int tdx_get_guest_phys_addr_bits(const u32 eax)
115 {
116 return (eax & GENMASK(23, 16)) >> 16;
117 }
118
119 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
120 {
121 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
122 }
123
124 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
125
126 static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
127 {
128 return entry->function == 7 && entry->index == 0 &&
129 (entry->ebx & TDX_FEATURE_TSX);
130 }
131
132 static void clear_tsx(struct kvm_cpuid_entry2 *entry)
133 {
134 entry->ebx &= ~TDX_FEATURE_TSX;
135 }
136
137 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
138 {
139 return entry->function == 7 && entry->index == 0 &&
140 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
141 }
142
143 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
144 {
145 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
146 }
147
148 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
149 {
150 if (has_tsx(entry))
151 clear_tsx(entry);
152
153 if (has_waitpkg(entry))
154 clear_waitpkg(entry);
155 }
156
157 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
158 {
159 return has_tsx(entry) || has_waitpkg(entry);
160 }
161
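/* Value the TDX module uses for the sub-leaf of a CPUID leaf with no sub-leaves. */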
162 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
163
164 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
165 {
166 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
167
168 entry->function = (u32)td_conf->cpuid_config_leaves[idx];
169 entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
170 entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
171 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
172 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
173 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
174
175 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
176 entry->index = 0;
177
178 /*
179 * The TDX module doesn't allow configuring the guest phys addr bits
180 * (EAX[23:16]). However, KVM uses it as an interface for userspace
181 * to configure the GPAW. Report these bits as configurable.
182 */
183 if (entry->function == 0x80000008)
184 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
185
186 tdx_clear_unsupported_cpuid(entry);
187 }
188
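/* GetTdVmCallInfo leaf-1 (r11) bits that KVM lets userspace handle. */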
189 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
190
191 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
192 struct kvm_tdx_capabilities *caps)
193 {
194 int i;
195
196 caps->supported_attrs = tdx_get_supported_attrs(td_conf);
197 if (!caps->supported_attrs)
198 return -EIO;
199
200 caps->supported_xfam = tdx_get_supported_xfam(td_conf);
201 if (!caps->supported_xfam)
202 return -EIO;
203
204 caps->cpuid.nent = td_conf->num_cpuid_config;
205
206 caps->user_tdvmcallinfo_1_r11 =
207 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
208
209 for (i = 0; i < td_conf->num_cpuid_config; i++)
210 td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
211
212 return 0;
213 }
214
215 /*
216 * Some SEAMCALLs acquire the TDX module globally, and can fail with
217 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
218 */
219 static DEFINE_MUTEX(tdx_lock);
220
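/* Number of TDs that currently have a private host key ID (HKID) configured. */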
221 static atomic_t nr_configured_hkid;
222
223 static bool tdx_operand_busy(u64 err)
224 {
225 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
226 }
227
228
229 /*
230 * A per-CPU list of TD vCPUs associated with a given CPU.
231 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
232 * list.
233 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
234 * the old CPU during the IPI callback running on the old CPU, and then added
235 * to the per-CPU list of the new CPU.
236 * - When a TD is tearing down, all vCPUs are disassociated from their current
237 * running CPUs and removed from the per-CPU list during the IPI callback
238 * running on those CPUs.
239 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
240 * associated TD vCPUs and remove them from the per-CPU list.
241 */
242 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
243
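/*
 * Per the GHCI, r10 of TDG.VP.VMCALL carries the exit type on input (0 for
 * standard TDVMCALLs) and the return code on output; r11 carries the
 * subfunction leaf on input and the return value on output.
 */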
244 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
245 {
246 return to_tdx(vcpu)->vp_enter_args.r10;
247 }
248
249 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
250 {
251 return to_tdx(vcpu)->vp_enter_args.r11;
252 }
253
254 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
255 long val)
256 {
257 to_tdx(vcpu)->vp_enter_args.r10 = val;
258 }
259
260 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
261 unsigned long val)
262 {
263 to_tdx(vcpu)->vp_enter_args.r11 = val;
264 }
265
266 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
267 {
268 tdx_guest_keyid_free(kvm_tdx->hkid);
269 kvm_tdx->hkid = -1;
270 atomic_dec(&nr_configured_hkid);
271 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
272 put_misc_cg(kvm_tdx->misc_cg);
273 kvm_tdx->misc_cg = NULL;
274 }
275
276 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
277 {
278 return kvm_tdx->hkid > 0;
279 }
280
281 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
282 {
283 lockdep_assert_irqs_disabled();
284
285 list_del(&to_tdx(vcpu)->cpu_list);
286
287 /*
288 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
289 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
290 * to its list before it's deleted from this CPU's list.
291 */
292 smp_wmb();
293
294 vcpu->cpu = -1;
295 }
296
297 /*
298 * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
299 * retry (if necessary) after forcing vCPUs to exit and wait for the operation
300 * to complete. All flows that remove/block S-EPT entries run with mmu_lock
301 * held for write, i.e. are mutually exclusive with each other, but they aren't
302 * mutually exclusive with running vCPUs, and so can fail with "operand busy"
303 * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
304 *
305 * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
306 */
307 #define tdh_do_no_vcpus(tdh_func, kvm, args...) \
308 ({ \
309 struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
310 u64 __err; \
311 \
312 lockdep_assert_held_write(&kvm->mmu_lock); \
313 \
314 __err = tdh_func(args); \
315 if (unlikely(tdx_operand_busy(__err))) { \
316 WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
317 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
318 \
319 __err = tdh_func(args); \
320 \
321 WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
322 } \
323 __err; \
324 })
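/*
 * Example usage (see tdx_track() below):
 *
 *	err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
 *	TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
 */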
325
326 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
327 static int __tdx_reclaim_page(struct page *page)
328 {
329 u64 err, rcx, rdx, r8;
330
331 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
332
333 /*
334 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
335 * before the HKID is released and control pages have also been
336 * released at this point, so there is no possibility of contention.
337 */
338 if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
339 return -EIO;
340
341 return 0;
342 }
343
344 static int tdx_reclaim_page(struct page *page)
345 {
346 int r;
347
348 r = __tdx_reclaim_page(page);
349 if (!r)
350 tdx_quirk_reset_page(page);
351 return r;
352 }
353
354
355 /*
356 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
357 * private KeyID. Assume the cache associated with the TDX private KeyID has
358 * been flushed.
359 */
360 static void tdx_reclaim_control_page(struct page *ctrl_page)
361 {
362 /*
363 * Leak the page if the kernel failed to reclaim the page.
364 * The kernel cannot use it safely anymore.
365 */
366 if (tdx_reclaim_page(ctrl_page))
367 return;
368
369 __free_page(ctrl_page);
370 }
371
372 struct tdx_flush_vp_arg {
373 struct kvm_vcpu *vcpu;
374 u64 err;
375 };
376
377 static void tdx_flush_vp(void *_arg)
378 {
379 struct tdx_flush_vp_arg *arg = _arg;
380 struct kvm_vcpu *vcpu = arg->vcpu;
381 u64 err;
382
383 arg->err = 0;
384 lockdep_assert_irqs_disabled();
385
386 /* Task migration can race with CPU offlining. */
387 if (unlikely(vcpu->cpu != raw_smp_processor_id()))
388 return;
389
390 /*
391 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
392 * list tracking still needs to be updated so that it's correct if/when
393 * the vCPU does get initialized.
394 */
395 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
396 /*
397 * No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
398 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
399 * vp flush function is called when destroying a vCPU/TD or during vCPU
400 * migration. No other thread uses TDVPR in those cases.
401 */
402 err = tdh_vp_flush(&to_tdx(vcpu)->vp);
403 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
404 /*
405 * This function is called in IPI context. Do not use
406 * printk to avoid console semaphore.
407 * The caller prints out the error message, instead.
408 */
409 if (err)
410 arg->err = err;
411 }
412 }
413
414 tdx_disassociate_vp(vcpu);
415 }
416
417 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
418 {
419 struct tdx_flush_vp_arg arg = {
420 .vcpu = vcpu,
421 };
422 int cpu = vcpu->cpu;
423
424 if (unlikely(cpu == -1))
425 return;
426
427 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
428
429 TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
430 }
431
432 void tdx_disable_virtualization_cpu(void)
433 {
434 int cpu = raw_smp_processor_id();
435 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
436 struct tdx_flush_vp_arg arg;
437 struct vcpu_tdx *tdx, *tmp;
438 unsigned long flags;
439
440 local_irq_save(flags);
441 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
442 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
443 arg.vcpu = &tdx->vcpu;
444 tdx_flush_vp(&arg);
445 }
446 local_irq_restore(flags);
447
448 /*
449 * Flush cache now if kexec is possible: this is necessary to avoid
450 * having dirty private memory cachelines when the new kernel boots,
451 * but WBINVD is a relatively expensive operation and doing it during
452 * kexec can exacerbate races in native_stop_other_cpus(). Do it
453 * now, since this is a safe moment and there is going to be no more
454 * TDX activity on this CPU from this point on.
455 */
456 tdx_cpu_flush_cache_for_kexec();
457 }
458
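/* Bound the TDH.PHYMEM.CACHE.WB resume loop to avoid spinning forever. */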
459 #define TDX_SEAMCALL_RETRIES 10000
460
461 static void smp_func_do_phymem_cache_wb(void *unused)
462 {
463 u64 err = 0;
464 bool resume;
465 int i;
466
467 /*
468 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
469 * KeyID on the package or core. The TDX module may not finish the
470 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The
471 * kernel should retry it until it returns success w/o rescheduling.
472 */
473 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
474 resume = !!err;
475 err = tdh_phymem_cache_wb(resume);
476 switch (err) {
477 case TDX_INTERRUPTED_RESUMABLE:
478 continue;
479 case TDX_NO_HKID_READY_TO_WBCACHE:
480 err = TDX_SUCCESS; /* Already done by other thread */
481 fallthrough;
482 default:
483 goto out;
484 }
485 }
486
487 out:
488 TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
489 }
490
491 void tdx_mmu_release_hkid(struct kvm *kvm)
492 {
493 bool packages_allocated, targets_allocated;
494 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
495 cpumask_var_t packages, targets;
496 struct kvm_vcpu *vcpu;
497 unsigned long j;
498 int i;
499 u64 err;
500
501 if (!is_hkid_assigned(kvm_tdx))
502 return;
503
504 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
505 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
506 cpus_read_lock();
507
508 kvm_for_each_vcpu(j, vcpu, kvm)
509 tdx_flush_vp_on_cpu(vcpu);
510
511 /*
512 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
513 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
514 * Multiple TDX guests can be destroyed simultaneously. Take the
515 * mutex to prevent them from hitting this error.
516 */
517 mutex_lock(&tdx_lock);
518
519 /*
520 * Releasing HKID is in vm_destroy().
521 * After the above flushing vps, there should be no more vCPU
522 * associations, as all vCPU fds have been released at this stage.
523 */
524 err = tdh_mng_vpflushdone(&kvm_tdx->td);
525 if (err == TDX_FLUSHVP_NOT_DONE)
526 goto out;
527 if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
528 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
529 kvm_tdx->hkid);
530 goto out;
531 }
532
533 for_each_online_cpu(i) {
534 if (packages_allocated &&
535 cpumask_test_and_set_cpu(topology_physical_package_id(i),
536 packages))
537 continue;
538 if (targets_allocated)
539 cpumask_set_cpu(i, targets);
540 }
541 if (targets_allocated)
542 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
543 else
544 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
545 /*
546 * In the case of error in smp_func_do_phymem_cache_wb(), the following
547 * tdh_mng_key_freeid() will fail.
548 */
549 err = tdh_mng_key_freeid(&kvm_tdx->td);
550 if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
551 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
552 kvm_tdx->hkid);
553 } else {
554 tdx_hkid_free(kvm_tdx);
555 }
556
557 out:
558 mutex_unlock(&tdx_lock);
559 cpus_read_unlock();
560 free_cpumask_var(targets);
561 free_cpumask_var(packages);
562 }
563
564 static void tdx_reclaim_td_control_pages(struct kvm *kvm)
565 {
566 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
567 u64 err;
568 int i;
569
570 /*
571 * tdx_mmu_release_hkid() failed to reclaim the HKID. Something has gone
572 * seriously wrong with the TDX module. Give up freeing the TD pages. The
573 * function already warned, so don't warn again.
574 */
575 if (is_hkid_assigned(kvm_tdx))
576 return;
577
578 if (kvm_tdx->td.tdcs_pages) {
579 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
580 if (!kvm_tdx->td.tdcs_pages[i])
581 continue;
582
583 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
584 }
585 kfree(kvm_tdx->td.tdcs_pages);
586 kvm_tdx->td.tdcs_pages = NULL;
587 }
588
589 if (!kvm_tdx->td.tdr_page)
590 return;
591
592 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
593 return;
594
595 /*
596 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
597 * KeyID. The TDX module may access the TDR while operating on the TD
598 * (especially when it is reclaiming the TDCS).
599 */
600 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
601 if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
602 return;
603
604 tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
605
606 __free_page(kvm_tdx->td.tdr_page);
607 kvm_tdx->td.tdr_page = NULL;
608 }
609
610 void tdx_vm_destroy(struct kvm *kvm)
611 {
612 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
613
614 tdx_reclaim_td_control_pages(kvm);
615
616 kvm_tdx->state = TD_STATE_UNINITIALIZED;
617 }
618
619 static int tdx_do_tdh_mng_key_config(void *param)
620 {
621 struct kvm_tdx *kvm_tdx = param;
622 u64 err;
623
624 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
625 err = tdh_mng_key_config(&kvm_tdx->td);
626 if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
627 return -EIO;
628
629 return 0;
630 }
631
632 int tdx_vm_init(struct kvm *kvm)
633 {
634 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
635
636 kvm->arch.has_protected_state = true;
637 /*
638 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
639 * i.e. all EOIs are accelerated and never trigger exits.
640 */
641 kvm->arch.has_protected_eoi = true;
642 kvm->arch.has_private_mem = true;
643 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
644
645 /*
646 * Because the guest TD is protected, the VMM can't parse instructions in the
647 * TD. Instead, the guest uses the MMIO hypercall. For unmodified device
648 * drivers, #VE needs to be injected for MMIO accesses, and the #VE handler in
649 * the TD converts the MMIO instruction into an MMIO hypercall.
650 *
651 * The SPTE value for MMIO needs to be set up so that #VE is injected into the
652 * TD instead of triggering EPT MISCONFIG:
653 * - RWX=0 so that an EPT violation is triggered.
654 * - The suppress-#VE bit is cleared to inject #VE.
655 */
656 kvm_mmu_set_mmio_spte_value(kvm, 0);
657
658 /*
659 * TDX has its own limit on the maximum number of vCPUs it can support
660 * across all TDX guests, in addition to KVM_MAX_VCPUS. The TDX module
661 * reports this limit via the MAX_VCPU_PER_TD global metadata. In
662 * practice, it reflects the number of logical CPUs that ALL
663 * platforms that the TDX module supports can possibly have.
664 *
665 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
666 * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
667 * userspace would result in an unpredictable ABI.
668 */
669 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
670
671 kvm_tdx->state = TD_STATE_UNINITIALIZED;
672
673 return 0;
674 }
675
676 int tdx_vcpu_create(struct kvm_vcpu *vcpu)
677 {
678 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
679 struct vcpu_tdx *tdx = to_tdx(vcpu);
680
681 if (kvm_tdx->state != TD_STATE_INITIALIZED)
682 return -EIO;
683
684 /*
685 * TDX module mandates APICv, which requires an in-kernel local APIC.
686 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
687 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
688 */
689 if (!irqchip_split(vcpu->kvm))
690 return -EINVAL;
691
692 fpstate_set_confidential(&vcpu->arch.guest_fpu);
693 vcpu->arch.apic->guest_apic_protected = true;
694 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
695
696 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
697
698 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
699 vcpu->arch.cr0_guest_owned_bits = -1ul;
700 vcpu->arch.cr4_guest_owned_bits = -1ul;
701
702 /* KVM can't change TSC offset/multiplier as TDX module manages them. */
703 vcpu->arch.guest_tsc_protected = true;
704 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
705 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
706 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
707 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
708
709 vcpu->arch.guest_state_protected =
710 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
711
712 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
713 vcpu->arch.xfd_no_write_intercept = true;
714
715 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
716 __pi_set_sn(&tdx->vt.pi_desc);
717
718 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
719
720 return 0;
721 }
722
723 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
724 {
725 struct vcpu_tdx *tdx = to_tdx(vcpu);
726
727 vmx_vcpu_pi_load(vcpu, cpu);
728 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
729 return;
730
731 tdx_flush_vp_on_cpu(vcpu);
732
733 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
734 local_irq_disable();
735 /*
736 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
737 * vcpu->cpu is read before tdx->cpu_list.
738 */
739 smp_rmb();
740
741 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
742 local_irq_enable();
743 }
744
745 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
746 {
747 /*
748 * KVM can't get the interrupt status of a TDX guest, so it assumes an
749 * interrupt is always allowed unless the TDX guest calls TDVMCALL with HLT,
750 * which passes the interrupt-blocked flag.
751 */
752 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
753 !to_tdx(vcpu)->vp_enter_args.r12;
754 }
755
756 static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
757 {
758 u64 vcpu_state_details;
759
760 if (pi_has_pending_interrupt(vcpu))
761 return true;
762
763 /*
764 * Only check RVI pending for HALTED case with IRQ enabled.
765 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
766 * interrupt was pending before TD exit, then it _must_ be blocked,
767 * otherwise the interrupt would have been serviced at the instruction
768 * boundary.
769 */
770 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
771 to_tdx(vcpu)->vp_enter_args.r12)
772 return false;
773
774 vcpu_state_details =
775 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
776
777 return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
778 }
779
780 struct tdx_uret_msr {
781 u32 msr;
782 unsigned int slot;
783 u64 defval;
784 };
785
786 static struct tdx_uret_msr tdx_uret_msrs[] = {
787 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
788 {.msr = MSR_STAR,},
789 {.msr = MSR_LSTAR,},
790 {.msr = MSR_TSC_AUX,},
791 };
792
793 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
794 {
795 struct vcpu_vt *vt = to_vt(vcpu);
796 int i;
797
798 if (vt->guest_state_loaded)
799 return;
800
801 if (likely(is_64bit_mm(current->mm)))
802 vt->msr_host_kernel_gs_base = current->thread.gsbase;
803 else
804 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
805
806 vt->guest_state_loaded = true;
807
808 /*
809 * Explicitly set user-return MSRs that are clobbered by the TDX-Module
810 * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
811 * written by the TDX-Module. Don't rely on the TDX-Module to actually
812 * clobber the MSRs, as the contract is poorly defined and not upheld.
813 * E.g. the TDX-Module will synthesize an EPT Violation without doing
814 * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
815 * state.
816 */
817 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
818 kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
819 tdx_uret_msrs[i].defval, -1ull);
820 }
821
822 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
823 {
824 struct vcpu_vt *vt = to_vt(vcpu);
825
826 if (!vt->guest_state_loaded)
827 return;
828
829 ++vcpu->stat.host_state_reload;
830 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
831
832 vt->guest_state_loaded = false;
833 }
834
835 void tdx_vcpu_put(struct kvm_vcpu *vcpu)
836 {
837 vmx_vcpu_pi_put(vcpu);
838 tdx_prepare_switch_to_host(vcpu);
839 }
840
841 /*
842 * Life cycles for a TD and a vCPU:
843 * 1. KVM_CREATE_VM ioctl.
844 * TD state is TD_STATE_UNINITIALIZED.
845 * hkid is not assigned at this stage.
846 * 2. KVM_TDX_INIT_VM ioctl.
847 * TD transitions to TD_STATE_INITIALIZED.
848 * hkid is assigned after this stage.
849 * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
850 * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
851 * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
852 * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
853 * kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
854 * 4. KVM_TDX_INIT_VCPU ioctl.
855 * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
856 * vCPU control structures are allocated at this stage.
857 * 5. kvm_destroy_vm().
858 * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
859 * (2) puts hkid to !assigned state.
860 * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
861 * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
862 * 5.3 tdx_vm_destroy()
863 * transitions TD to TD_STATE_UNINITIALIZED state.
864 *
865 * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
866 * - If at 3.3, hkid is still assigned, but the vCPU must be in
867 * VCPU_TD_STATE_UNINITIALIZED state.
868 * - if at 5.2, hkid must be !assigned and all vCPUs must be in
869 * VCPU_TD_STATE_INITIALIZED state and have been dissociated.
870 */
871 void tdx_vcpu_free(struct kvm_vcpu *vcpu)
872 {
873 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
874 struct vcpu_tdx *tdx = to_tdx(vcpu);
875 int i;
876
877 if (vcpu->cpu != -1) {
878 KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
879 tdx_flush_vp_on_cpu(vcpu);
880 return;
881 }
882
883 /*
884 * It is not possible to reclaim pages while hkid is assigned. It might
885 * be assigned if the TD VM is being destroyed but freeing hkid failed,
886 * in which case the pages are leaked.
887 */
888 if (is_hkid_assigned(kvm_tdx))
889 return;
890
891 if (tdx->vp.tdcx_pages) {
892 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
893 if (tdx->vp.tdcx_pages[i])
894 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
895 }
896 kfree(tdx->vp.tdcx_pages);
897 tdx->vp.tdcx_pages = NULL;
898 }
899 if (tdx->vp.tdvpr_page) {
900 tdx_reclaim_control_page(tdx->vp.tdvpr_page);
901 tdx->vp.tdvpr_page = NULL;
902 tdx->vp.tdvpr_pa = 0;
903 }
904
905 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
906 }
907
908 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
909 {
910 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
911 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
912 return -EINVAL;
913
914 return 1;
915 }
916
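/*
 * Map TDVMCALL leaves that mirror VMX exit reasons onto those exit reasons so
 * the common handlers can be reused; the MMIO leaf (EPT_VIOLATION) is reported
 * as EPT_MISCONFIG, matching how KVM emulates MMIO for VMX guests. Everything
 * else is reported as a plain TDCALL exit.
 */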
917 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
918 {
919 switch (tdvmcall_leaf(vcpu)) {
920 case EXIT_REASON_CPUID:
921 case EXIT_REASON_HLT:
922 case EXIT_REASON_IO_INSTRUCTION:
923 case EXIT_REASON_MSR_READ:
924 case EXIT_REASON_MSR_WRITE:
925 return tdvmcall_leaf(vcpu);
926 case EXIT_REASON_EPT_VIOLATION:
927 return EXIT_REASON_EPT_MISCONFIG;
928 default:
929 break;
930 }
931
932 return EXIT_REASON_TDCALL;
933 }
934
935 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
936 {
937 struct vcpu_tdx *tdx = to_tdx(vcpu);
938 u32 exit_reason;
939
940 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
941 case TDX_SUCCESS:
942 case TDX_NON_RECOVERABLE_VCPU:
943 case TDX_NON_RECOVERABLE_TD:
944 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
945 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
946 break;
947 default:
948 return -1u;
949 }
950
951 exit_reason = tdx->vp_enter_ret;
952
953 switch (exit_reason) {
954 case EXIT_REASON_TDCALL:
955 if (tdvmcall_exit_type(vcpu))
956 return EXIT_REASON_VMCALL;
957
958 return tdcall_to_vmx_exit_reason(vcpu);
959 case EXIT_REASON_EPT_MISCONFIG:
960 /*
961 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
962 * non-instrumentable code with interrupts disabled.
963 */
964 return -1u;
965 default:
966 break;
967 }
968
969 return exit_reason;
970 }
971
972 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
973 {
974 struct vcpu_tdx *tdx = to_tdx(vcpu);
975 struct vcpu_vt *vt = to_vt(vcpu);
976
977 guest_state_enter_irqoff();
978
979 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
980
981 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
982
983 vt->exit_qualification = tdx->vp_enter_args.rcx;
984 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
985 tdx->exit_gpa = tdx->vp_enter_args.r8;
986 vt->exit_intr_info = tdx->vp_enter_args.r9;
987
988 vmx_handle_nmi(vcpu);
989
990 guest_state_exit_irqoff();
991 }
992
993 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
994 {
995 return vmx_get_exit_reason(vcpu).failed_vmentry &&
996 vmx_get_exit_reason(vcpu).full != -1u;
997 }
998
999 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
1000 {
1001 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
1002
1003 /*
1004 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
1005 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
1006 *
1007 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
1008 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
1009 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
1010 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
1011 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
1012 * requester may be blocked endlessly.
1013 */
1014 if (unlikely(tdx_operand_busy(vp_enter_ret)))
1015 return EXIT_FASTPATH_EXIT_HANDLED;
1016
1017 return EXIT_FASTPATH_NONE;
1018 }
1019
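/*
 * Registers whose cached values remain valid after a TD exit: the exit info
 * fields and the GPRs exchanged with the TDX module via vp_enter_args.
 * Everything else must be treated as unavailable.
 */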
1020 #define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
1021 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
1022 BIT_ULL(VCPU_REGS_RAX) | \
1023 BIT_ULL(VCPU_REGS_RBX) | \
1024 BIT_ULL(VCPU_REGS_RCX) | \
1025 BIT_ULL(VCPU_REGS_RDX) | \
1026 BIT_ULL(VCPU_REGS_RBP) | \
1027 BIT_ULL(VCPU_REGS_RSI) | \
1028 BIT_ULL(VCPU_REGS_RDI) | \
1029 BIT_ULL(VCPU_REGS_R8) | \
1030 BIT_ULL(VCPU_REGS_R9) | \
1031 BIT_ULL(VCPU_REGS_R10) | \
1032 BIT_ULL(VCPU_REGS_R11) | \
1033 BIT_ULL(VCPU_REGS_R12) | \
1034 BIT_ULL(VCPU_REGS_R13) | \
1035 BIT_ULL(VCPU_REGS_R14) | \
1036 BIT_ULL(VCPU_REGS_R15))
1037
1038 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
1039 {
1040 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
1041
1042 /*
1043 * All TDX hosts support PKRU; but even if they didn't,
1044 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1045 * skipped.
1046 */
1047 if (vcpu->arch.host_pkru != 0)
1048 wrpkru(vcpu->arch.host_pkru);
1049
1050 if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1051 xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1052
1053 /*
1054 * Likewise, even if a TDX host didn't support XSS, both arms of
1055 * the comparison would be 0 and the wrmsrl() would be skipped.
1056 */
1057 if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1058 wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1059 }
1060
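/*
 * DEBUGCTL bits that the TDX module preserves across TDH.VP.ENTER; if the
 * host has any other DEBUGCTL bits set, tdx_vcpu_run() restores the full
 * host value after the exit.
 */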
1061 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1062 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1063 DEBUGCTLMSR_FREEZE_IN_SMM)
1064
1065 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
1066 {
1067 struct vcpu_tdx *tdx = to_tdx(vcpu);
1068 struct vcpu_vt *vt = to_vt(vcpu);
1069
1070 /*
1071 * WARN if KVM wants to force an immediate exit, as the TDX module does
1072 * not guarantee entry into the guest, i.e. it's possible for KVM to
1073 * _think_ it completed entry to the guest and forced an immediate exit
1074 * without actually having done so. Luckily, KVM never needs to force
1075 * an immediate exit for TDX (KVM can't do direct event injection), so
1076 * just WARN and continue on.
1077 */
1078 WARN_ON_ONCE(run_flags);
1079
1080 /*
1081 * Wait until retry of SEPT-zap-related SEAMCALL completes before
1082 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
1083 * TDCALLs.
1084 */
1085 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1086 return EXIT_FASTPATH_EXIT_HANDLED;
1087
1088 trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
1089
1090 if (pi_test_on(&vt->pi_desc)) {
1091 apic->send_IPI_self(POSTED_INTR_VECTOR);
1092
1093 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1094 APIC_VECTOR_MASK, &vt->pi_desc))
1095 kvm_wait_lapic_expire(vcpu);
1096 }
1097
1098 tdx_vcpu_enter_exit(vcpu);
1099
1100 if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
1101 update_debugctlmsr(vcpu->arch.host_debugctl);
1102
1103 tdx_load_host_xsave_state(vcpu);
1104
1105 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1106
1107 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1108 return EXIT_FASTPATH_NONE;
1109
1110 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1111 return EXIT_FASTPATH_NONE;
1112
1113 trace_kvm_exit(vcpu, KVM_ISA_VMX);
1114
1115 if (unlikely(tdx_failed_vmentry(vcpu)))
1116 return EXIT_FASTPATH_NONE;
1117
1118 return tdx_exit_handlers_fastpath(vcpu);
1119 }
1120
1121 void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1122 {
1123 ++vcpu->stat.nmi_injections;
1124 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1125 /*
1126 * From KVM's perspective, NMI injection is completed right after
1127 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1128 * the TDX module or not.
1129 */
1130 vcpu->arch.nmi_injected = false;
1131 /*
1132 * TDX doesn't allow KVM to request an NMI-window exit. If there is
1133 * still a pending vNMI, KVM is not able to inject it along with the
1134 * one pending in the TDX module in a back-to-back way. Since the previous
1135 * vNMI is still pending in TDX module, i.e. it has not been delivered
1136 * to TDX guest yet, it's OK to collapse the pending vNMI into the
1137 * previous one. The guest is expected to handle all the NMI sources
1138 * when handling the first vNMI.
1139 */
1140 vcpu->arch.nmi_pending = 0;
1141 }
1142
1143 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1144 {
1145 u32 intr_info = vmx_get_intr_info(vcpu);
1146
1147 /*
1148 * Machine checks are handled by handle_exception_irqoff(), or by
1149 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1150 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1151 */
1152 if (is_nmi(intr_info) || is_machine_check(intr_info))
1153 return 1;
1154
1155 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1156 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1157 vcpu->run->ex.error_code = 0;
1158
1159 return 0;
1160 }
1161
1162 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1163 {
1164 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1165 return 1;
1166 }
1167
1168 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1169 {
1170 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1171 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1172 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1173 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1174 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1175
1176 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1177 }
1178
1179 /*
1180 * Split into chunks and check for pending interrupts between chunks. This
1181 * allows timely injection of interrupts and prevents issues with the guest's
1182 * lockup detection.
1183 */
1184 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1185 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1186
1187 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1188 {
1189 struct vcpu_tdx *tdx = to_tdx(vcpu);
1190
1191 if (vcpu->run->hypercall.ret) {
1192 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1193 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1194 return 1;
1195 }
1196
1197 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1198 if (tdx->map_gpa_next >= tdx->map_gpa_end)
1199 return 1;
1200
1201 /*
1202 * Stop processing the remaining part if there is a pending interrupt
1203 * that could be delivered. Skip checking pending RVI for
1204 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1205 */
1206 if (kvm_vcpu_has_events(vcpu)) {
1207 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1208 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1209 return 1;
1210 }
1211
1212 __tdx_map_gpa(tdx);
1213 return 0;
1214 }
1215
1216 static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1217 {
1218 u64 gpa = tdx->map_gpa_next;
1219 u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1220
1221 if (size > TDX_MAP_GPA_MAX_LEN)
1222 size = TDX_MAP_GPA_MAX_LEN;
1223
1224 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1225 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
1226 /*
1227 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1228 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1229 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
1230 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
1231 */
1232 tdx->vcpu.run->hypercall.ret = 0;
1233 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1234 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1235 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1236 KVM_MAP_GPA_RANGE_ENCRYPTED :
1237 KVM_MAP_GPA_RANGE_DECRYPTED;
1238 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1239
1240 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1241 }
1242
1243 static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1244 {
1245 struct vcpu_tdx *tdx = to_tdx(vcpu);
1246 u64 gpa = tdx->vp_enter_args.r12;
1247 u64 size = tdx->vp_enter_args.r13;
1248 u64 ret;
1249
1250 /*
1251 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1252 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1253 * bit set. This is a base call so it should always be supported, but
1254 * KVM has no way to ensure that userspace implements the GHCI correctly.
1255 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1256 * to the guest.
1257 */
1258 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1259 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1260 goto error;
1261 }
1262
1263 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1264 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1265 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1266 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1267 ret = TDVMCALL_STATUS_INVALID_OPERAND;
1268 goto error;
1269 }
1270
1271 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1272 ret = TDVMCALL_STATUS_ALIGN_ERROR;
1273 goto error;
1274 }
1275
1276 tdx->map_gpa_end = gpa + size;
1277 tdx->map_gpa_next = gpa;
1278
1279 __tdx_map_gpa(tdx);
1280 return 0;
1281
1282 error:
1283 tdvmcall_set_return_code(vcpu, ret);
1284 tdx->vp_enter_args.r11 = gpa;
1285 return 1;
1286 }
1287
1288 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1289 {
1290 struct vcpu_tdx *tdx = to_tdx(vcpu);
1291 u64 *regs = vcpu->run->system_event.data;
1292 u64 *module_regs = &tdx->vp_enter_args.r8;
1293 int index = VCPU_REGS_RAX;
1294
1295 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1296 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1297 vcpu->run->system_event.ndata = 16;
1298
1299 /* Dump 16 general-purpose registers to userspace in ascending order. */
1300 regs[index++] = tdx->vp_enter_ret;
1301 regs[index++] = tdx->vp_enter_args.rcx;
1302 regs[index++] = tdx->vp_enter_args.rdx;
1303 regs[index++] = tdx->vp_enter_args.rbx;
1304 regs[index++] = 0;
1305 regs[index++] = 0;
1306 regs[index++] = tdx->vp_enter_args.rsi;
1307 regs[index] = tdx->vp_enter_args.rdi;
1308 for (index = 0; index < 8; index++)
1309 regs[VCPU_REGS_R8 + index] = module_regs[index];
1310
1311 return 0;
1312 }
1313
1314 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1315 {
1316 u32 eax, ebx, ecx, edx;
1317 struct vcpu_tdx *tdx = to_tdx(vcpu);
1318
1319 /* EAX and ECX for cpuid is stored in R12 and R13. */
1320 eax = tdx->vp_enter_args.r12;
1321 ecx = tdx->vp_enter_args.r13;
1322
1323 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1324
1325 tdx->vp_enter_args.r12 = eax;
1326 tdx->vp_enter_args.r13 = ebx;
1327 tdx->vp_enter_args.r14 = ecx;
1328 tdx->vp_enter_args.r15 = edx;
1329
1330 return 1;
1331 }
1332
1333 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1334 {
1335 vcpu->arch.pio.count = 0;
1336 return 1;
1337 }
1338
1339 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1340 {
1341 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1342 unsigned long val = 0;
1343 int ret;
1344
1345 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1346 vcpu->arch.pio.port, &val, 1);
1347
1348 WARN_ON_ONCE(!ret);
1349
1350 tdvmcall_set_return_val(vcpu, val);
1351
1352 return 1;
1353 }
1354
1355 static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1356 {
1357 struct vcpu_tdx *tdx = to_tdx(vcpu);
1358 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1359 unsigned long val = 0;
1360 unsigned int port;
1361 u64 size, write;
1362 int ret;
1363
1364 ++vcpu->stat.io_exits;
1365
1366 size = tdx->vp_enter_args.r12;
1367 write = tdx->vp_enter_args.r13;
1368 port = tdx->vp_enter_args.r14;
1369
1370 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1371 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1372 return 1;
1373 }
1374
1375 if (write) {
1376 val = tdx->vp_enter_args.r15;
1377 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1378 } else {
1379 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1380 }
1381
1382 if (!ret)
1383 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1384 tdx_complete_pio_in;
1385 else if (!write)
1386 tdvmcall_set_return_val(vcpu, val);
1387
1388 return ret;
1389 }
1390
1391 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1392 {
1393 unsigned long val = 0;
1394 gpa_t gpa;
1395 int size;
1396
1397 gpa = vcpu->mmio_fragments[0].gpa;
1398 size = vcpu->mmio_fragments[0].len;
1399
1400 memcpy(&val, vcpu->run->mmio.data, size);
1401 tdvmcall_set_return_val(vcpu, val);
1402 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1403 return 1;
1404 }
1405
1406 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1407 unsigned long val)
1408 {
1409 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1410 trace_kvm_fast_mmio(gpa);
1411 return 0;
1412 }
1413
1414 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1415 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1416 return -EOPNOTSUPP;
1417
1418 return 0;
1419 }
1420
1421 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1422 {
1423 unsigned long val;
1424
1425 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1426 return -EOPNOTSUPP;
1427
1428 tdvmcall_set_return_val(vcpu, val);
1429 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1430 return 0;
1431 }
1432
1433 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1434 {
1435 struct vcpu_tdx *tdx = to_tdx(vcpu);
1436 int size, write, r;
1437 unsigned long val;
1438 gpa_t gpa;
1439
1440 size = tdx->vp_enter_args.r12;
1441 write = tdx->vp_enter_args.r13;
1442 gpa = tdx->vp_enter_args.r14;
1443 val = write ? tdx->vp_enter_args.r15 : 0;
1444
1445 if (size != 1 && size != 2 && size != 4 && size != 8)
1446 goto error;
1447 if (write != 0 && write != 1)
1448 goto error;
1449
1450 /*
1451 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
1452 * do MMIO emulation for a private GPA.
1453 */
1454 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1455 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1456 goto error;
1457
1458 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1459
1460 if (write)
1461 r = tdx_mmio_write(vcpu, gpa, size, val);
1462 else
1463 r = tdx_mmio_read(vcpu, gpa, size);
1464 if (!r)
1465 /* Kernel completed device emulation. */
1466 return 1;
1467
1468 /* Request the device emulation to userspace device model. */
1469 vcpu->mmio_is_write = write;
1470 if (!write)
1471 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1472
1473 vcpu->run->mmio.phys_addr = gpa;
1474 vcpu->run->mmio.len = size;
1475 vcpu->run->mmio.is_write = write;
1476 vcpu->run->exit_reason = KVM_EXIT_MMIO;
1477
1478 if (write) {
1479 memcpy(vcpu->run->mmio.data, &val, size);
1480 } else {
1481 vcpu->mmio_fragments[0].gpa = gpa;
1482 vcpu->mmio_fragments[0].len = size;
1483 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1484 }
1485 return 0;
1486
1487 error:
1488 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1489 return 1;
1490 }
1491
1492 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1493 {
1494 struct vcpu_tdx *tdx = to_tdx(vcpu);
1495
1496 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1497
1498 /*
1499 * For now, there is no TDVMCALL beyond the GHCI base API that KVM supports
1500 * directly without support from userspace, so just set the values
1501 * returned from userspace.
1502 */
1503 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1504 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1505 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1506 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1507
1508 return 1;
1509 }
1510
1511 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1512 {
1513 struct vcpu_tdx *tdx = to_tdx(vcpu);
1514
1515 switch (tdx->vp_enter_args.r12) {
1516 case 0:
1517 tdx->vp_enter_args.r11 = 0;
1518 tdx->vp_enter_args.r12 = 0;
1519 tdx->vp_enter_args.r13 = 0;
1520 tdx->vp_enter_args.r14 = 0;
1521 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1522 return 1;
1523 case 1:
1524 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1525 vcpu->run->exit_reason = KVM_EXIT_TDX;
1526 vcpu->run->tdx.flags = 0;
1527 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1528 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1529 vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1530 vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1531 vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1532 vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1533 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1534 return 0;
1535 default:
1536 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1537 return 1;
1538 }
1539 }
1540
1541 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1542 {
1543 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1544 return 1;
1545 }
1546
1547 static int tdx_get_quote(struct kvm_vcpu *vcpu)
1548 {
1549 struct vcpu_tdx *tdx = to_tdx(vcpu);
1550 u64 gpa = tdx->vp_enter_args.r12;
1551 u64 size = tdx->vp_enter_args.r13;
1552
1553 /* The GPA of the buffer must have the shared bit set. */
1554 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1555 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1556 return 1;
1557 }
1558
1559 vcpu->run->exit_reason = KVM_EXIT_TDX;
1560 vcpu->run->tdx.flags = 0;
1561 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1562 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1563 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1564 vcpu->run->tdx.get_quote.size = size;
1565
1566 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1567
1568 return 0;
1569 }
1570
1571 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1572 {
1573 struct vcpu_tdx *tdx = to_tdx(vcpu);
1574 u64 vector = tdx->vp_enter_args.r12;
1575
1576 if (vector < 32 || vector > 255) {
1577 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1578 return 1;
1579 }
1580
1581 vcpu->run->exit_reason = KVM_EXIT_TDX;
1582 vcpu->run->tdx.flags = 0;
1583 vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1584 vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1585 vcpu->run->tdx.setup_event_notify.vector = vector;
1586
1587 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1588
1589 return 0;
1590 }
1591
1592 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1593 {
1594 switch (tdvmcall_leaf(vcpu)) {
1595 case TDVMCALL_MAP_GPA:
1596 return tdx_map_gpa(vcpu);
1597 case TDVMCALL_REPORT_FATAL_ERROR:
1598 return tdx_report_fatal_error(vcpu);
1599 case TDVMCALL_GET_TD_VM_CALL_INFO:
1600 return tdx_get_td_vm_call_info(vcpu);
1601 case TDVMCALL_GET_QUOTE:
1602 return tdx_get_quote(vcpu);
1603 case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1604 return tdx_setup_event_notify_interrupt(vcpu);
1605 default:
1606 break;
1607 }
1608
1609 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1610 return 1;
1611 }
1612
1613 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1614 {
1615 u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1616 TDX_SHARED_BIT_PWL_4;
1617
1618 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1619 return;
1620
1621 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1622 }
1623
1624 static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1625 kvm_pfn_t pfn)
1626 {
1627 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1628 u64 err, entry, level_state;
1629 gpa_t gpa = gfn_to_gpa(gfn);
1630
1631 lockdep_assert_held(&kvm->slots_lock);
1632
1633 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
1634 KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
1635 return -EIO;
1636
1637 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
1638 kvm_tdx->page_add_src, &entry, &level_state);
1639 if (unlikely(tdx_operand_busy(err)))
1640 return -EBUSY;
1641
1642 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
1643 return -EIO;
1644
1645 return 0;
1646 }
1647
1648 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1649 enum pg_level level, kvm_pfn_t pfn)
1650 {
1651 int tdx_level = pg_level_to_tdx_sept_level(level);
1652 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1653 struct page *page = pfn_to_page(pfn);
1654 gpa_t gpa = gfn_to_gpa(gfn);
1655 u64 entry, level_state;
1656 u64 err;
1657
1658 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1659 if (unlikely(tdx_operand_busy(err)))
1660 return -EBUSY;
1661
1662 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
1663 return -EIO;
1664
1665 return 0;
1666 }
1667
1668 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1669 enum pg_level level, u64 mirror_spte)
1670 {
1671 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1672 kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
1673
1674 /* TODO: handle large pages. */
1675 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1676 return -EIO;
1677
1678 WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
1679 (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
1680
1681 /*
1682 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
1683 * before kvm_tdx->state. Userspace must not be allowed to pre-fault
1684 * arbitrary memory until the initial memory image is finalized. Pairs
1685 * with the smp_wmb() in tdx_td_finalize().
1686 */
1687 smp_rmb();
1688
1689 /*
1690 * If the TD isn't finalized/runnable, then userspace is initializing
1691 * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
1692 */
1693 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1694 return tdx_mem_page_add(kvm, gfn, level, pfn);
1695
1696 return tdx_mem_page_aug(kvm, gfn, level, pfn);
1697 }
1698
1699 static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1700 enum pg_level level, void *private_spt)
1701 {
1702 int tdx_level = pg_level_to_tdx_sept_level(level);
1703 gpa_t gpa = gfn_to_gpa(gfn);
1704 struct page *page = virt_to_page(private_spt);
1705 u64 err, entry, level_state;
1706
1707 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1708 &level_state);
1709 if (unlikely(tdx_operand_busy(err)))
1710 return -EBUSY;
1711
1712 if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
1713 return -EIO;
1714
1715 return 0;
1716 }
1717
1718 /*
1719 * Ensure the shared and private EPTs are flushed on all vCPUs.
1720 * tdh_mem_track() is the only caller that increases the TD epoch. An increase
1721 * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1722 * running in guest mode with the value "N - 1".
1723 *
1724 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1725 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1726 * has been increased to "N + 1".
1727 *
1728 * Kicking all vCPUs out of guest mode after that further ensures that no vCPU
1729 * can run in guest mode with TD epoch value "N", which unblocks the next
1730 * tdh_mem_track() (e.g. to increase the TD epoch to "N + 2").
1731 *
1732 * The TDX module flushes the EPT on the next TD enter and makes vCPUs run in
1733 * guest mode with TD epoch value "N + 1".
1734 *
1735 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1736 * waiting for the empty IPI handler, ack_kick().
1737 *
1738 * No action is required for the vCPUs being kicked out, since the kick is
1739 * guaranteed to occur after the TD epoch increment and before the next
1740 * tdh_mem_track().
1741 */
1742 static void tdx_track(struct kvm *kvm)
1743 {
1744 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1745 u64 err;
1746
1747 /* If the TD isn't finalized, no vCPU has run yet; there's nothing to track. */
1748 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1749 return;
1750
1751 /*
1752 * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
1753 * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
1754 * tracking epoch hasn't completed.
1755 */
1756 lockdep_assert_held_write(&kvm->mmu_lock);
1757
1758 err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
1759 TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
1760
1761 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1762 }
1763
1764 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1765 enum pg_level level, void *private_spt)
1766 {
1767 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1768
1769 /*
1770 * free_external_spt() is only called after the HKID has been freed,
1771 * i.e. when the TD is being torn down.
1772 * KVM doesn't (yet) zap page table pages in the mirror page table while
1773 * the TD is active, though guest pages mapped in the mirror page table
1774 * can be zapped while the TD is active, e.g. for shared <-> private
1775 * conversion and slot move/deletion.
1776 */
1777 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1778 return -EIO;
1779
1780 /*
1781 * The HKID assigned to this TD was already freed and cache was
1782 * already flushed. We don't have to flush again.
1783 */
1784 return tdx_reclaim_page(virt_to_page(private_spt));
1785 }
1786
1787 static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1788 enum pg_level level, u64 mirror_spte)
1789 {
1790 struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
1791 int tdx_level = pg_level_to_tdx_sept_level(level);
1792 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1793 gpa_t gpa = gfn_to_gpa(gfn);
1794 u64 err, entry, level_state;
1795
1796 lockdep_assert_held_write(&kvm->mmu_lock);
1797
1798 /*
1799 * HKID is released after all private pages have been removed, and set
1800 * before any might be populated. Warn if zapping is attempted when
1801 * there can't be anything populated in the private EPT.
1802 */
1803 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1804 return;
1805
1806 /* TODO: handle large pages. */
1807 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1808 return;
1809
1810 err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
1811 tdx_level, &entry, &level_state);
1812 if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
1813 return;
1814
1815 /*
1816 * TDX requires TLB tracking before dropping a private page. Do
1817 * it here, although it is also done later.
1818 */
1819 tdx_track(kvm);
1820
1821 /*
1822 * When zapping a private page, mmu_lock is held for write, so there is
1823 * no race with S-EPT operations from other vCPUs. Races with
1824 * TDH.VP.ENTER (due to 0-step mitigation) and guest TDCALLs remain.
1825 */
1826 err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
1827 tdx_level, &entry, &level_state);
1828 if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
1829 return;
1830
1831 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1832 if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
1833 return;
1834
1835 tdx_quirk_reset_page(page);
1836 }
1837
1838 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1839 int trig_mode, int vector)
1840 {
1841 struct kvm_vcpu *vcpu = apic->vcpu;
1842 struct vcpu_tdx *tdx = to_tdx(vcpu);
1843
1844 /* TDX supports only posted interrupt. No lapic emulation. */
1845 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1846
1847 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1848 }
1849
1850 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1851 {
1852 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1853 u64 eq = vmx_get_exit_qual(vcpu);
1854
1855 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1856 return false;
1857
1858 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1859 }
1860
1861 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1862 {
1863 unsigned long exit_qual;
1864 gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1865 bool local_retry = false;
1866 int ret;
1867
1868 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1869 if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1870 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1871 gpa, vcpu->vcpu_id);
1872 kvm_vm_dead(vcpu->kvm);
1873 return -EIO;
1874 }
1875 /*
1876 * Always treat SEPT violations as write faults. Ignore the
1877 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1878 * TD private pages are always RWX in the SEPT tables,
1879 * i.e. they're always mapped writable. Just as importantly,
1880 * treating SEPT violations as write faults is necessary to
1881 * avoid COW allocations, which will cause TDAUGPAGE failures
1882 * due to aliasing a single HPA to multiple GPAs.
1883 */
1884 exit_qual = EPT_VIOLATION_ACC_WRITE;
1885
1886 /* Only private GPA triggers zero-step mitigation */
1887 local_retry = true;
1888 } else {
1889 exit_qual = vmx_get_exit_qual(vcpu);
1890 /*
1891 * EPT violation due to instruction fetch should never be
1892 * triggered from shared memory in TDX guest. If such EPT
1893 * violation occurs, treat it as broken hardware.
1894 */
1895 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1896 return -EIO;
1897 }
1898
1899 trace_kvm_page_fault(vcpu, gpa, exit_qual);
1900
1901 /*
1902 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1903 * mapping in TDX.
1904 *
1905 * KVM may return RET_PF_RETRY for private GPA due to
1906 * - contentions when atomically updating SPTEs of the mirror page table
1907 * - in-progress GFN invalidation or memslot removal.
1908 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1909 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1910 * or certain TDCALLs.
1911 *
1912 * If TDH.VP.ENTER is invoked more times than the threshold set by the
1913 * TDX module before KVM resolves the private GPA mapping, the TDX
1914 * module will activate zero-step mitigation during TDH.VP.ENTER. This
1915 * process acquires an SEPT tree lock in the TDX module, leading to
1916 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1917 * operations on other vCPUs.
1918 *
1919 * Breaking out of local retries for kvm_vcpu_has_events() is for
1920 * interrupt injection. kvm_vcpu_has_events() should not see pending
1921 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1922 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
1923 * the guest even if the IRQ/NMI can't be delivered.
1924 *
1925 * Note: even without breaking out of local retries, zero-step
1926 * mitigation may still occur due to
1927 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1928 * - a single RIP causing EPT violations for more GFNs than the
1929 * threshold count.
1930 * This is safe, as triggering zero-step mitigation only introduces
1931 * contentions to page installation SEAMCALLs on other vCPUs, which will
1932 * handle retries locally in their EPT violation handlers.
1933 */
1934 while (1) {
1935 struct kvm_memory_slot *slot;
1936
1937 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
1938
1939 if (ret != RET_PF_RETRY || !local_retry)
1940 break;
1941
1942 if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
1943 break;
1944
1945 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
1946 ret = -EIO;
1947 break;
1948 }
1949
1950 /*
1951 * Bail if the memslot is invalid, i.e. is being deleted, as
1952 * faulting in will never succeed and this task needs to drop
1953 * SRCU in order to let memslot deletion complete.
1954 */
1955 slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
1956 if (slot && slot->flags & KVM_MEMSLOT_INVALID)
1957 break;
1958
1959 cond_resched();
1960 }
1961 return ret;
1962 }
1963
1964 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1965 {
1966 if (err) {
1967 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1968 return 1;
1969 }
1970
1971 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
1972 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
1973
1974 return 1;
1975 }
1976
1977
1978 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
1979 {
1980 struct vcpu_tdx *tdx = to_tdx(vcpu);
1981 u64 vp_enter_ret = tdx->vp_enter_ret;
1982 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
1983
1984 if (fastpath != EXIT_FASTPATH_NONE)
1985 return 1;
1986
1987 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
1988 KVM_BUG_ON(1, vcpu->kvm);
1989 return -EIO;
1990 }
1991
1992 /*
1993 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
1994 * TDX_SEAMCALL_VMFAILINVALID.
1995 */
1996 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
1997 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
1998 goto unhandled_exit;
1999 }
2000
2001 if (unlikely(tdx_failed_vmentry(vcpu))) {
2002 /*
2003 * If the guest state is protected, off-TD debug is not
2004 * enabled, and TDX_NON_RECOVERABLE must be set.
2005 */
2006 WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2007 !(vp_enter_ret & TDX_NON_RECOVERABLE));
2008 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2009 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2010 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2011 return 0;
2012 }
2013
2014 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2015 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2016 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2017 goto unhandled_exit;
2018 }
2019
2020 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2021 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2022
2023 switch (exit_reason.basic) {
2024 case EXIT_REASON_TRIPLE_FAULT:
2025 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2026 vcpu->mmio_needed = 0;
2027 return 0;
2028 case EXIT_REASON_EXCEPTION_NMI:
2029 return tdx_handle_exception_nmi(vcpu);
2030 case EXIT_REASON_EXTERNAL_INTERRUPT:
2031 ++vcpu->stat.irq_exits;
2032 return 1;
2033 case EXIT_REASON_CPUID:
2034 return tdx_emulate_cpuid(vcpu);
2035 case EXIT_REASON_HLT:
2036 return kvm_emulate_halt_noskip(vcpu);
2037 case EXIT_REASON_TDCALL:
2038 return handle_tdvmcall(vcpu);
2039 case EXIT_REASON_VMCALL:
2040 return tdx_emulate_vmcall(vcpu);
2041 case EXIT_REASON_IO_INSTRUCTION:
2042 return tdx_emulate_io(vcpu);
2043 case EXIT_REASON_MSR_READ:
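/*
 * Mirror the MSR index from TDVMCALL register r12 into RCX so the
 * common RDMSR emulation path can be reused.
 */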
2044 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2045 return kvm_emulate_rdmsr(vcpu);
2046 case EXIT_REASON_MSR_WRITE:
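/*
 * Mirror the MSR index from r12 into RCX and split the 64-bit value in
 * r13 into EAX/EDX, matching the native WRMSR convention expected by
 * kvm_emulate_wrmsr().
 */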
2047 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2048 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2049 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2050 return kvm_emulate_wrmsr(vcpu);
2051 case EXIT_REASON_EPT_MISCONFIG:
2052 return tdx_emulate_mmio(vcpu);
2053 case EXIT_REASON_EPT_VIOLATION:
2054 return tdx_handle_ept_violation(vcpu);
2055 case EXIT_REASON_OTHER_SMI:
2056 /*
2057 * Unlike VMX, an SMI that arrives in SEAM non-root mode (i.e. while a
2058 * TD guest vCPU is running) causes a VM exit to the TDX module,
2059 * followed by a SEAMRET to KVM. Once it exits to KVM, the SMI is
2060 * delivered and handled by the kernel handler right away.
2061 *
2062 * The Other SMI exit can also be caused by a SEAM non-root
2063 * machine check delivered via a Machine Check System Management
2064 * Interrupt (MSMI), but it has already been handled by the
2065 * kernel machine check handler, i.e., the memory page has been
2066 * marked as poisoned and it won't be freed to the free list
2067 * when the TDX guest is terminated (the TDX module marks the
2068 * guest as dead and prevents it from running further when a
2069 * machine check happens in SEAM non-root mode).
2070 *
2071 * - An MSMI will not reach here; it's handled as the
2072 * non-recoverable case above.
2073 * - If it's not an MSMI, nothing needs to be done here.
2074 */
2075 return 1;
2076 default:
2077 break;
2078 }
2079
2080 unhandled_exit:
2081 kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
2082 return 0;
2083 }
2084
2085 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2086 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2087 {
2088 struct vcpu_tdx *tdx = to_tdx(vcpu);
2089
2090 *reason = tdx->vt.exit_reason.full;
2091 if (*reason != -1u) {
2092 *info1 = vmx_get_exit_qual(vcpu);
2093 *info2 = tdx->ext_exit_qualification;
2094 *intr_info = vmx_get_intr_info(vcpu);
2095 } else {
2096 *info1 = 0;
2097 *info2 = 0;
2098 *intr_info = 0;
2099 }
2100
2101 *error_code = 0;
2102 }
2103
2104 bool tdx_has_emulated_msr(u32 index)
2105 {
2106 switch (index) {
2107 case MSR_IA32_UCODE_REV:
2108 case MSR_IA32_ARCH_CAPABILITIES:
2109 case MSR_IA32_POWER_CTL:
2110 case MSR_IA32_CR_PAT:
2111 case MSR_MTRRcap:
2112 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2113 case MSR_MTRRdefType:
2114 case MSR_IA32_TSC_DEADLINE:
2115 case MSR_IA32_MISC_ENABLE:
2116 case MSR_PLATFORM_INFO:
2117 case MSR_MISC_FEATURES_ENABLES:
2118 case MSR_IA32_APICBASE:
2119 case MSR_EFER:
2120 case MSR_IA32_FEAT_CTL:
2121 case MSR_IA32_MCG_CAP:
2122 case MSR_IA32_MCG_STATUS:
2123 case MSR_IA32_MCG_CTL:
2124 case MSR_IA32_MCG_EXT_CTL:
2125 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2126 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2127 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2128 case MSR_KVM_POLL_CONTROL:
2129 return true;
2130 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2131 /*
2132 * x2APIC registers that are virtualized by the CPU can't be
2133 * emulated, as KVM doesn't have access to the virtual APIC page.
2134 */
2135 switch (index) {
2136 case X2APIC_MSR(APIC_TASKPRI):
2137 case X2APIC_MSR(APIC_PROCPRI):
2138 case X2APIC_MSR(APIC_EOI):
2139 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2140 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2141 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2142 return false;
2143 default:
2144 return true;
2145 }
2146 default:
2147 return false;
2148 }
2149 }
2150
2151 static bool tdx_is_read_only_msr(u32 index)
2152 {
2153 return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2154 index == MSR_IA32_FEAT_CTL;
2155 }
2156
2157 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2158 {
2159 switch (msr->index) {
2160 case MSR_IA32_FEAT_CTL:
2161 /*
2162 * MCE and MCA are advertised via cpuid. Guest kernel could
2163 * check if LMCE is enabled or not.
2164 */
2165 msr->data = FEAT_CTL_LOCKED;
2166 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2167 msr->data |= FEAT_CTL_LMCE_ENABLED;
2168 return 0;
2169 case MSR_IA32_MCG_EXT_CTL:
2170 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2171 return 1;
2172 msr->data = vcpu->arch.mcg_ext_ctl;
2173 return 0;
2174 default:
2175 if (!tdx_has_emulated_msr(msr->index))
2176 return 1;
2177
2178 return kvm_get_msr_common(vcpu, msr);
2179 }
2180 }
2181
2182 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2183 {
2184 switch (msr->index) {
2185 case MSR_IA32_MCG_EXT_CTL:
2186 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2187 (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2188 return 1;
2189 vcpu->arch.mcg_ext_ctl = msr->data;
2190 return 0;
2191 default:
2192 if (tdx_is_read_only_msr(msr->index))
2193 return 1;
2194
2195 if (!tdx_has_emulated_msr(msr->index))
2196 return 1;
2197
2198 return kvm_set_msr_common(vcpu, msr);
2199 }
2200 }
2201
2202 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2203 {
2204 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2205 struct kvm_tdx_capabilities __user *user_caps;
2206 struct kvm_tdx_capabilities *caps = NULL;
2207 u32 nr_user_entries;
2208 int ret = 0;
2209
2210 /* flags is reserved for future use */
2211 if (cmd->flags)
2212 return -EINVAL;
2213
2214 user_caps = u64_to_user_ptr(cmd->data);
2215 if (get_user(nr_user_entries, &user_caps->cpuid.nent))
2216 return -EFAULT;
2217
2218 if (nr_user_entries < td_conf->num_cpuid_config)
2219 return -E2BIG;
2220
2221 caps = kzalloc(struct_size(caps, cpuid.entries,
2222 td_conf->num_cpuid_config), GFP_KERNEL);
2223 if (!caps)
2224 return -ENOMEM;
2225
2226 ret = init_kvm_tdx_caps(td_conf, caps);
2227 if (ret)
2228 goto out;
2229
2230 if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
2231 caps->cpuid.nent))) {
2232 ret = -EFAULT;
2233 goto out;
2234 }
2235
2236 out:
2237 /* kfree() accepts NULL. */
2238 kfree(caps);
2239 return ret;
2240 }
2241
2242 /*
2243 * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2244 * similar to TDX's GPAW. Use this field as the interface for userspace to
2245 * configure the GPAW and EPT level for TDs.
2246 *
2247 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2248 * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2249 * supported. Value 52 is only supported when the platform supports 5 level
2250 * EPT.
2251 */
2252 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2253 struct td_params *td_params)
2254 {
2255 const struct kvm_cpuid_entry2 *entry;
2256 int guest_pa;
2257
2258 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2259 if (!entry)
2260 return -EINVAL;
2261
2262 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2263
2264 if (guest_pa != 48 && guest_pa != 52)
2265 return -EINVAL;
2266
2267 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2268 return -EINVAL;
2269
2270 td_params->eptp_controls = VMX_EPTP_MT_WB;
2271 if (guest_pa == 52) {
2272 td_params->eptp_controls |= VMX_EPTP_PWL_5;
2273 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2274 } else {
2275 td_params->eptp_controls |= VMX_EPTP_PWL_4;
2276 }
2277
2278 return 0;
2279 }
2280
2281 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2282 struct td_params *td_params)
2283 {
2284 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2285 const struct kvm_cpuid_entry2 *entry;
2286 struct tdx_cpuid_value *value;
2287 int i, copy_cnt = 0;
2288
2289 /*
2290 * td_params.cpuid_values: the number and order of cpuid_values must
2291 * match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2292 * It's assumed that td_params has been zeroed.
2293 */
2294 for (i = 0; i < td_conf->num_cpuid_config; i++) {
2295 struct kvm_cpuid_entry2 tmp;
2296
2297 td_init_cpuid_entry2(&tmp, i);
2298
2299 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2300 tmp.function, tmp.index);
2301 if (!entry)
2302 continue;
2303
2304 if (tdx_unsupported_cpuid(entry))
2305 return -EINVAL;
2306
2307 copy_cnt++;
2308
2309 value = &td_params->cpuid_values[i];
2310 value->eax = entry->eax;
2311 value->ebx = entry->ebx;
2312 value->ecx = entry->ecx;
2313 value->edx = entry->edx;
2314
2315 /*
2316 * TDX module does not accept nonzero bits 16..23 for the
2317 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2318 */
2319 if (tmp.function == 0x80000008)
2320 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2321 }
2322
2323 /*
2324 * Rely on the TDX module to reject invalid configurations, but it can't
2325 * check leafs that don't have a proper slot in td_params->cpuid_values
2326 * to stick them in. So fail if there were entries that didn't get
2327 * copied to td_params.
2328 */
2329 if (copy_cnt != cpuid->nent)
2330 return -EINVAL;
2331
2332 return 0;
2333 }
2334
2335 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2336 struct kvm_tdx_init_vm *init_vm)
2337 {
2338 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2339 struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2340 int ret;
2341
2342 if (kvm->created_vcpus)
2343 return -EBUSY;
2344
2345 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2346 return -EINVAL;
2347
2348 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2349 return -EINVAL;
2350
2351 td_params->max_vcpus = kvm->max_vcpus;
2352 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2353 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2354
2355 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2356 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2357
2358 ret = setup_tdparams_eptp_controls(cpuid, td_params);
2359 if (ret)
2360 return ret;
2361
2362 ret = setup_tdparams_cpuids(cpuid, td_params);
2363 if (ret)
2364 return ret;
2365
2366 #define MEMCPY_SAME_SIZE(dst, src) \
2367 do { \
2368 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2369 memcpy((dst), (src), sizeof(dst)); \
2370 } while (0)
2371
2372 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2373 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2374 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2375
2376 return 0;
2377 }
2378
2379 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2380 u64 *seamcall_err)
2381 {
2382 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2383 cpumask_var_t packages;
2384 struct page **tdcs_pages = NULL;
2385 struct page *tdr_page;
2386 int ret, i;
2387 u64 err, rcx;
2388
2389 *seamcall_err = 0;
2390 ret = tdx_guest_keyid_alloc();
2391 if (ret < 0)
2392 return ret;
2393 kvm_tdx->hkid = ret;
2394 kvm_tdx->misc_cg = get_current_misc_cg();
2395 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2396 if (ret)
2397 goto free_hkid;
2398
2399 ret = -ENOMEM;
2400
2401 atomic_inc(&nr_configured_hkid);
2402
2403 tdr_page = alloc_page(GFP_KERNEL);
2404 if (!tdr_page)
2405 goto free_hkid;
2406
2407 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2408 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2409 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2410 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2411 GFP_KERNEL);
2412 if (!tdcs_pages)
2413 goto free_tdr;
2414
2415 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2416 tdcs_pages[i] = alloc_page(GFP_KERNEL);
2417 if (!tdcs_pages[i])
2418 goto free_tdcs;
2419 }
2420
2421 if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2422 goto free_tdcs;
2423
2424 cpus_read_lock();
2425
2426 /*
2427 * At least one CPU of each package must be online in order to
2428 * program all packages for the host key id. Check it.
2429 */
2430 for_each_present_cpu(i)
2431 cpumask_set_cpu(topology_physical_package_id(i), packages);
2432 for_each_online_cpu(i)
2433 cpumask_clear_cpu(topology_physical_package_id(i), packages);
2434 if (!cpumask_empty(packages)) {
2435 ret = -EIO;
2436 /*
2437 * Because it's hard for a human operator to figure out the
2438 * reason, print a warning.
2439 */
2440 #define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2441 pr_warn_ratelimited(MSG_ALLPKG);
2442 goto free_packages;
2443 }
2444
2445 /*
2446 * TDH.MNG.CREATE tries to grab the global TDX module lock and
2447 * fails with TDX_OPERAND_BUSY when it can't. Take the global
2448 * lock to prevent that failure.
2449 */
2450 mutex_lock(&tdx_lock);
2451 kvm_tdx->td.tdr_page = tdr_page;
2452 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2453 mutex_unlock(&tdx_lock);
2454
2455 if (err == TDX_RND_NO_ENTROPY) {
2456 ret = -EAGAIN;
2457 goto free_packages;
2458 }
2459
2460 if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
2461 ret = -EIO;
2462 goto free_packages;
2463 }
2464
2465 for_each_online_cpu(i) {
2466 int pkg = topology_physical_package_id(i);
2467
2468 if (cpumask_test_and_set_cpu(pkg, packages))
2469 continue;
2470
2471 /*
2472 * Program the memory controller in the package with an
2473 * encryption key associated with the TDX private host key id
2474 * assigned to this TDR. Concurrent operations on the same memory
2475 * controller result in TDX_OPERAND_BUSY. No locking is needed
2476 * beyond the cpus_read_lock() above, as it serializes against
2477 * hotplug and the first online CPU of the package is always
2478 * used. We never have two CPUs in the same socket trying to
2479 * program the key.
2480 */
2481 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2482 kvm_tdx, true);
2483 if (ret)
2484 break;
2485 }
2486 cpus_read_unlock();
2487 free_cpumask_var(packages);
2488 if (ret) {
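/*
 * No TDCS pages have been added via TDH.MNG.ADDCX yet; reset 'i' so
 * the teardown path frees all of them.
 */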
2489 i = 0;
2490 goto teardown;
2491 }
2492
2493 kvm_tdx->td.tdcs_pages = tdcs_pages;
2494 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2495 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2496 if (err == TDX_RND_NO_ENTROPY) {
2497 /* Here it's hard to allow userspace to retry. */
2498 ret = -EAGAIN;
2499 goto teardown;
2500 }
2501 if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
2502 ret = -EIO;
2503 goto teardown;
2504 }
2505 }
2506
2507 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2508 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2509 /*
2510 * Because the operands come from the user, don't warn.
2511 * Return a hint to the user because it's sometimes hard for the
2512 * user to figure out which operand is invalid. The SEAMCALL status
2513 * code includes which operand caused the invalid operand error.
2514 */
2515 *seamcall_err = err;
2516 ret = -EINVAL;
2517 goto teardown;
2518 } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
2519 ret = -EIO;
2520 goto teardown;
2521 }
2522
2523 return 0;
2524
2525 /*
2526 * The sequence for freeing resources from a partially initialized TD
2527 * varies based on where in the initialization flow failure occurred.
2528 * Simply use the full teardown and destroy paths, which naturally
2529 * handle partial initialization.
2530 */
2531 teardown:
2532 /* Only free pages not yet added, so start at 'i' */
2533 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2534 if (tdcs_pages[i]) {
2535 __free_page(tdcs_pages[i]);
2536 tdcs_pages[i] = NULL;
2537 }
2538 }
2539 if (!kvm_tdx->td.tdcs_pages)
2540 kfree(tdcs_pages);
2541
2542 tdx_mmu_release_hkid(kvm);
2543 tdx_reclaim_td_control_pages(kvm);
2544
2545 return ret;
2546
2547 free_packages:
2548 cpus_read_unlock();
2549 free_cpumask_var(packages);
2550
2551 free_tdcs:
2552 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2553 if (tdcs_pages[i])
2554 __free_page(tdcs_pages[i]);
2555 }
2556 kfree(tdcs_pages);
2557 kvm_tdx->td.tdcs_pages = NULL;
2558
2559 free_tdr:
2560 if (tdr_page)
2561 __free_page(tdr_page);
2562 kvm_tdx->td.tdr_page = NULL;
2563
2564 free_hkid:
2565 tdx_hkid_free(kvm_tdx);
2566
2567 return ret;
2568 }
2569
2570 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2571 u64 *data)
2572 {
2573 u64 err;
2574
2575 err = tdh_mng_rd(&tdx->td, field_id, data);
2576
2577 return err;
2578 }
2579
2580 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2581 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2582
2583 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2584 bool sub_leaf_set, int *entry_index,
2585 struct kvm_cpuid_entry2 *out)
2586 {
2587 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2588 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2589 u64 ebx_eax, edx_ecx;
2590 u64 err = 0;
2591
2592 if (sub_leaf > 0b1111111)
2593 return -EINVAL;
2594
2595 if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2596 return -EINVAL;
2597
2598 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2599 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2600 return -EINVAL;
2601
2602 /*
2603 * bit 23:17, RESERVED: reserved, must be 0;
2604 * bit 16, LEAF_31: leaf number bit 31;
2605 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2606 * implicitly 0;
2607 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2608 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2609 * the SUBLEAF_6_0 is all-1.
2610 * sub-leaf bits 31:7 are implicitly 0;
2611 * bit 0, ELEMENT_I: Element index within field;
2612 */
2613 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2614 field_id |= (leaf & 0x7f) << 9;
2615 if (sub_leaf_set)
2616 field_id |= (sub_leaf & 0x7f) << 1;
2617 else
2618 field_id |= 0x1fe;
2619
2620 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2621 if (err) //TODO check for specific errors
2622 goto err_out;
2623
2624 out->eax = (u32) ebx_eax;
2625 out->ebx = (u32) (ebx_eax >> 32);
2626
2627 field_id++;
2628 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2629 /*
2630 * It's weird that reading edx_ecx fails while reading ebx_eax
2631 * succeeded.
2632 */
2633 if (WARN_ON_ONCE(err))
2634 goto err_out;
2635
2636 out->ecx = (u32) edx_ecx;
2637 out->edx = (u32) (edx_ecx >> 32);
2638
2639 out->function = leaf;
2640 out->index = sub_leaf;
2641 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2642
2643 /*
2644 * Work around missing support on old TDX modules, fetch
2645 * guest maxpa from gfn_direct_bits.
2646 */
2647 if (leaf == 0x80000008) {
2648 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2649 unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2650
2651 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2652 }
2653
2654 (*entry_index)++;
2655
2656 return 0;
2657
2658 err_out:
2659 out->eax = 0;
2660 out->ebx = 0;
2661 out->ecx = 0;
2662 out->edx = 0;
2663
2664 return -EIO;
2665 }
2666
2667 typedef void *tdx_vm_state_guard_t;
2668
2669 static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
2670 {
2671 int r;
2672
2673 mutex_lock(&kvm->lock);
2674
2675 if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
2676 r = -EBUSY;
2677 goto out_err;
2678 }
2679
2680 r = kvm_lock_all_vcpus(kvm);
2681 if (r)
2682 goto out_err;
2683
2684 /*
2685 * Note the unintuitive ordering! vcpu->mutex must be taken outside
2686 * kvm->slots_lock!
2687 */
2688 mutex_lock(&kvm->slots_lock);
2689 return kvm;
2690
2691 out_err:
2692 mutex_unlock(&kvm->lock);
2693 return ERR_PTR(r);
2694 }
2695
2696 static void tdx_release_vm_state_locks(struct kvm *kvm)
2697 {
2698 mutex_unlock(&kvm->slots_lock);
2699 kvm_unlock_all_vcpus(kvm);
2700 mutex_unlock(&kvm->lock);
2701 }
2702
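/*
 * Scoped guard (see <linux/cleanup.h>) that takes kvm->lock, every vCPU mutex
 * and kvm->slots_lock, and automatically releases them when it goes out of
 * scope. On failure the guard holds an ERR_PTR() instead of the kvm pointer.
 */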
2703 DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
2704 if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
2705 tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
2706
2707 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2708 {
2709 struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
2710 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2711 struct kvm_tdx_init_vm *init_vm;
2712 struct td_params *td_params = NULL;
2713 u32 nr_user_entries;
2714 int ret;
2715
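/*
 * The layouts of struct kvm_tdx_init_vm (KVM ABI) and struct td_params
 * (TDX module ABI) are fixed; catch accidental size changes at build time.
 */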
2716 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2717 BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2718
2719 if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2720 return -EINVAL;
2721
2722 if (cmd->flags)
2723 return -EINVAL;
2724
2725 if (get_user(nr_user_entries, &user_data->cpuid.nent))
2726 return -EFAULT;
2727
2728 if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
2729 return -E2BIG;
2730
2731 init_vm = memdup_user(user_data,
2732 struct_size(user_data, cpuid.entries, nr_user_entries));
2733 if (IS_ERR(init_vm))
2734 return PTR_ERR(init_vm);
2735
2736 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2737 ret = -EINVAL;
2738 goto out;
2739 }
2740
2741 if (init_vm->cpuid.padding) {
2742 ret = -EINVAL;
2743 goto out;
2744 }
2745
2746 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2747 if (!td_params) {
2748 ret = -ENOMEM;
2749 goto out;
2750 }
2751
2752 ret = setup_tdparams(kvm, td_params, init_vm);
2753 if (ret)
2754 goto out;
2755
2756 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2757 if (ret)
2758 goto out;
2759
2760 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2761 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2762 kvm_tdx->attributes = td_params->attributes;
2763 kvm_tdx->xfam = td_params->xfam;
2764
2765 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2766 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2767 else
2768 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2769
2770 kvm_tdx->state = TD_STATE_INITIALIZED;
2771 out:
2772 /* kfree() accepts NULL. */
2773 kfree(init_vm);
2774 kfree(td_params);
2775
2776 return ret;
2777 }
2778
2779 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2780 {
2781 /*
2782 * flush_tlb_current() is invoked the first time the vCPU runs or when
2783 * the root of the shared EPT is invalidated.
2784 * KVM only needs to flush the shared EPT because the TDX module handles
2785 * TLB invalidation for the private EPT in tdh_vp_enter().
2786 *
2787 * A single context invalidation for shared EPT can be performed here.
2788 * However, this single context invalidation requires the private EPTP
2789 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
2790 * private EPTP as its ASID for TLB invalidation.
2791 *
2792 * To avoid reading back private EPTP, perform a global invalidation for
2793 * shared EPT instead to keep this function simple.
2794 */
2795 ept_sync_global();
2796 }
2797
2798 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2799 {
2800 /*
2801 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2802 * ensure that private EPT will be flushed on the next TD enter. No need
2803 * to call tdx_track() here again even when this callback is a result of
2804 * zapping private EPT.
2805 *
2806 * Due to the lack of the context to determine which EPT has been
2807 * affected by zapping, invoke invept() directly here for both shared
2808 * EPT and private EPT for simplicity, though it's not necessary for
2809 * private EPT.
2810 */
2811 ept_sync_global();
2812 }
2813
2814 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2815 {
2816 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2817
2818 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2819 return -EINVAL;
2820
2821 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2822 if (tdx_operand_busy(cmd->hw_error))
2823 return -EBUSY;
2824 if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
2825 return -EIO;
2826
2827 kvm_tdx->state = TD_STATE_RUNNABLE;
2828 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2829 smp_wmb();
2830 kvm->arch.pre_fault_allowed = true;
2831 return 0;
2832 }
2833
2834 static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
2835 {
2836 if (copy_from_user(cmd, argp, sizeof(*cmd)))
2837 return -EFAULT;
2838
2839 /*
2840 * Userspace should never set hw_error. KVM writes hw_error to report
2841 * hardware-defined error back to userspace.
2842 */
2843 if (cmd->hw_error)
2844 return -EINVAL;
2845
2846 return 0;
2847 }
2848
2849 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2850 {
2851 struct kvm_tdx_cmd tdx_cmd;
2852 int r;
2853
2854 r = tdx_get_cmd(argp, &tdx_cmd);
2855 if (r)
2856 return r;
2857
2858 if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
2859 return tdx_get_capabilities(&tdx_cmd);
2860
2861 CLASS(tdx_vm_state_guard, guard)(kvm);
2862 if (IS_ERR(guard))
2863 return PTR_ERR(guard);
2864
2865 switch (tdx_cmd.id) {
2866 case KVM_TDX_INIT_VM:
2867 r = tdx_td_init(kvm, &tdx_cmd);
2868 break;
2869 case KVM_TDX_FINALIZE_VM:
2870 r = tdx_td_finalize(kvm, &tdx_cmd);
2871 break;
2872 default:
2873 return -EINVAL;
2874 }
2875
2876 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2877 return -EFAULT;
2878
2879 return r;
2880 }
2881
2882 /* The VMM can pass one 64-bit auxiliary value to the vCPU via RCX, for the guest BIOS. */
2883 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2884 {
2885 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2886 struct vcpu_tdx *tdx = to_tdx(vcpu);
2887 struct page *page;
2888 int ret, i;
2889 u64 err;
2890
2891 page = alloc_page(GFP_KERNEL);
2892 if (!page)
2893 return -ENOMEM;
2894 tdx->vp.tdvpr_page = page;
2895
2896 /*
2897 * page_to_phys() does not work in 'noinstr' code, like guest
2898 * entry via tdh_vp_enter(). Precalculate and store it instead
2899 * of doing it at runtime later.
2900 */
2901 tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
2902
2903 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2904 GFP_KERNEL);
2905 if (!tdx->vp.tdcx_pages) {
2906 ret = -ENOMEM;
2907 goto free_tdvpr;
2908 }
2909
2910 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2911 page = alloc_page(GFP_KERNEL);
2912 if (!page) {
2913 ret = -ENOMEM;
2914 goto free_tdcx;
2915 }
2916 tdx->vp.tdcx_pages[i] = page;
2917 }
2918
2919 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2920 if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
2921 ret = -EIO;
2922 goto free_tdcx;
2923 }
2924
2925 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2926 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2927 if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
2928 /*
2929 * Pages already added are reclaimed by the vcpu_free
2930 * method, but the rest are freed here.
2931 */
2932 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2933 __free_page(tdx->vp.tdcx_pages[i]);
2934 tdx->vp.tdcx_pages[i] = NULL;
2935 }
2936 return -EIO;
2937 }
2938 }
2939
2940 /*
2941 * tdh_vp_init() can take an exclusive lock of the TDR resource inside
2942 * the TDX-Module. The TDR resource is also taken as shared in several
2943 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
2944 * (TDX-Module locks are try-lock implementations with no slow path).
2945 * Take mmu_lock for write to reflect the nature of the lock taken by
2946 * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
2947 * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
2948 */
2949 scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
2950 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2951 if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
2952 return -EIO;
2953 }
2954
2955 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2956
2957 return 0;
2958
2959 free_tdcx:
2960 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2961 if (tdx->vp.tdcx_pages[i])
2962 __free_page(tdx->vp.tdcx_pages[i]);
2963 tdx->vp.tdcx_pages[i] = NULL;
2964 }
2965 kfree(tdx->vp.tdcx_pages);
2966 tdx->vp.tdcx_pages = NULL;
2967
2968 free_tdvpr:
2969 if (tdx->vp.tdvpr_page)
2970 __free_page(tdx->vp.tdvpr_page);
2971 tdx->vp.tdvpr_page = NULL;
2972 tdx->vp.tdvpr_pa = 0;
2973
2974 return ret;
2975 }
2976
2977 /* May read multiple subleaves. *entry_index is advanced by the number of entries written. */
2978 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2979 struct kvm_cpuid_entry2 *output_e)
2980 {
2981 int sub_leaf = 0;
2982 int ret;
2983
2984 /* First try without a subleaf */
2985 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2986
2987 /* If success, or invalid leaf, just give up */
2988 if (ret != -EIO)
2989 return ret;
2990
2991 /*
2992 * If the try without a subleaf failed, try reading subleafs until
2993 * failure. The TDX module only supports 6 bits of subleaf index.
2994 */
2995 while (1) {
2996 /* Keep reading subleafs until there is a failure. */
2997 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
2998 return !sub_leaf;
2999
3000 sub_leaf++;
3001 output_e++;
3002 }
3003
3004 return 0;
3005 }
3006
3007 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3008 {
3009 struct kvm_cpuid2 __user *output;
3010 struct kvm_cpuid2 *td_cpuid;
3011 int r = 0, i = 0, leaf;
3012 u32 level;
3013
3014 output = u64_to_user_ptr(cmd->data);
3015 td_cpuid = kzalloc(sizeof(*td_cpuid) +
3016 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3017 GFP_KERNEL);
3018 if (!td_cpuid)
3019 return -ENOMEM;
3020
3021 if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3022 r = -EFAULT;
3023 goto out;
3024 }
3025
3026 /* Read max CPUID for normal range */
3027 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3028 r = -EIO;
3029 goto out;
3030 }
3031 level = td_cpuid->entries[0].eax;
3032
3033 for (leaf = 1; leaf <= level; leaf++)
3034 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3035
3036 /* Read max CPUID for extended range */
3037 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3038 r = -EIO;
3039 goto out;
3040 }
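/* tdx_read_cpuid() advanced 'i', so the 0x80000000 entry is at i - 1. */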
3041 level = td_cpuid->entries[i - 1].eax;
3042
3043 for (leaf = 0x80000001; leaf <= level; leaf++)
3044 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3045
3046 if (td_cpuid->nent < i)
3047 r = -E2BIG;
3048 td_cpuid->nent = i;
3049
3050 if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3051 r = -EFAULT;
3052 goto out;
3053 }
3054
3055 if (r == -E2BIG)
3056 goto out;
3057
3058 if (copy_to_user(output->entries, td_cpuid->entries,
3059 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3060 r = -EFAULT;
3061
3062 out:
3063 kfree(td_cpuid);
3064
3065 return r;
3066 }
3067
3068 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3069 {
3070 u64 apic_base;
3071 struct vcpu_tdx *tdx = to_tdx(vcpu);
3072 int ret;
3073
3074 if (cmd->flags)
3075 return -EINVAL;
3076
3077 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3078 return -EINVAL;
3079
3080 /*
3081 * TDX requires X2APIC; userspace is responsible for configuring guest
3082 * CPUID accordingly.
3083 */
3084 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3085 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3086 if (kvm_apic_set_base(vcpu, apic_base, true))
3087 return -EINVAL;
3088
3089 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3090 if (ret)
3091 return ret;
3092
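/*
 * Enable posted interrupts for this vCPU: program the notification
 * vector and the PI descriptor address into the TD VMCS.
 */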
3093 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3094 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3095 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3096
3097 tdx->state = VCPU_TD_STATE_INITIALIZED;
3098
3099 return 0;
3100 }
3101
3102 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3103 {
3104 /*
3105 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3106 * INIT events.
3107 *
3108 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3109 * userspace needs to define the vCPU model before KVM can initialize
3110 * vCPU state, e.g. to enable x2APIC.
3111 */
3112 WARN_ON_ONCE(init_event);
3113 }
3114
3115 struct tdx_gmem_post_populate_arg {
3116 struct kvm_vcpu *vcpu;
3117 __u32 flags;
3118 };
3119
3120 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3121 void __user *src, int order, void *_arg)
3122 {
3123 struct tdx_gmem_post_populate_arg *arg = _arg;
3124 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3125 u64 err, entry, level_state;
3126 gpa_t gpa = gfn_to_gpa(gfn);
3127 struct page *src_page;
3128 int ret, i;
3129
3130 if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
3131 return -EIO;
3132
3133 /*
3134 * Get the source page if it has been faulted in. Return failure if the
3135 * source page has been swapped out or unmapped in primary memory.
3136 */
3137 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3138 if (ret < 0)
3139 return ret;
3140 if (ret != 1)
3141 return -ENOMEM;
3142
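/*
 * Stash the source page so that tdx_mem_page_add(), reached via the
 * mapping path below, can pass it to TDH.MEM.PAGE.ADD.
 */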
3143 kvm_tdx->page_add_src = src_page;
3144 ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
3145 kvm_tdx->page_add_src = NULL;
3146
3147 put_page(src_page);
3148
3149 if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
3150 return ret;
3151
3152 /*
3153 * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
3154 * between mapping the pfn and now, but slots_lock prevents memslot
3155 * updates, filemap_invalidate_lock() prevents guest_memfd updates,
3156 * mmu_notifier events can't reach S-EPT entries, and KVM's internal
3157 * zapping flows are mutually exclusive with S-EPT mappings.
3158 */
3159 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3160 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
3161 if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
3162 return -EIO;
3163 }
3164
3165 return 0;
3166 }
3167
3168 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3169 {
3170 struct vcpu_tdx *tdx = to_tdx(vcpu);
3171 struct kvm *kvm = vcpu->kvm;
3172 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3173 struct kvm_tdx_init_mem_region region;
3174 struct tdx_gmem_post_populate_arg arg;
3175 long gmem_ret;
3176 int ret;
3177
3178 if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3179 return -EINVAL;
3180
3181 /* Once TD is finalized, the initial guest memory is fixed. */
3182 if (kvm_tdx->state == TD_STATE_RUNNABLE)
3183 return -EINVAL;
3184
3185 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3186 return -EINVAL;
3187
3188 if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3189 return -EFAULT;
3190
3191 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3192 !region.nr_pages ||
3193 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3194 !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3195 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3196 return -EINVAL;
3197
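/*
 * Populate (and optionally measure) the region one page at a time so
 * the loop can react to pending signals and reschedule between pages.
 */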
3198 ret = 0;
3199 while (region.nr_pages) {
3200 if (signal_pending(current)) {
3201 ret = -EINTR;
3202 break;
3203 }
3204
3205 arg = (struct tdx_gmem_post_populate_arg) {
3206 .vcpu = vcpu,
3207 .flags = cmd->flags,
3208 };
3209 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3210 u64_to_user_ptr(region.source_addr),
3211 1, tdx_gmem_post_populate, &arg);
3212 if (gmem_ret < 0) {
3213 ret = gmem_ret;
3214 break;
3215 }
3216
3217 if (gmem_ret != 1) {
3218 ret = -EIO;
3219 break;
3220 }
3221
3222 region.source_addr += PAGE_SIZE;
3223 region.gpa += PAGE_SIZE;
3224 region.nr_pages--;
3225
3226 cond_resched();
3227 }
3228
3229 if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3230 ret = -EFAULT;
3231 return ret;
3232 }
3233
3234 int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3235 {
3236 struct kvm *kvm = vcpu->kvm;
3237 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3238 struct kvm_tdx_cmd cmd;
3239 int r;
3240
3241 r = tdx_get_cmd(argp, &cmd);
3242 if (r)
3243 return r;
3244
3245 CLASS(tdx_vm_state_guard, guard)(kvm);
3246 if (IS_ERR(guard))
3247 return PTR_ERR(guard);
3248
3249 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3250 return -EINVAL;
3251
3252 vcpu_load(vcpu);
3253
3254 switch (cmd.id) {
3255 case KVM_TDX_INIT_MEM_REGION:
3256 r = tdx_vcpu_init_mem_region(vcpu, &cmd);
3257 break;
3258 case KVM_TDX_INIT_VCPU:
3259 r = tdx_vcpu_init(vcpu, &cmd);
3260 break;
3261 default:
3262 r = -ENOIOCTLCMD;
3263 break;
3264 }
3265
3266 vcpu_put(vcpu);
3267
3268 return r;
3269 }
3270
3271 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3272 {
3273 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3274 struct kvm_tdx_cmd cmd;
3275 int ret;
3276
3277 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3278 return -EINVAL;
3279
3280 ret = tdx_get_cmd(argp, &cmd);
3281 if (ret)
3282 return ret;
3283
3284 switch (cmd.id) {
3285 case KVM_TDX_GET_CPUID:
3286 ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3287 break;
3288 default:
3289 ret = -EINVAL;
3290 break;
3291 }
3292
3293 return ret;
3294 }
3295
3296 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3297 {
3298 if (!is_private)
3299 return 0;
3300
3301 return PG_LEVEL_4K;
3302 }
3303
3304 static int tdx_online_cpu(unsigned int cpu)
3305 {
3306 unsigned long flags;
3307 int r;
3308
3309 /* Sanity check CPU is already in post-VMXON */
3310 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3311
3312 local_irq_save(flags);
3313 r = tdx_cpu_enable();
3314 local_irq_restore(flags);
3315
3316 return r;
3317 }
3318
3319 static int tdx_offline_cpu(unsigned int cpu)
3320 {
3321 int i;
3322
3323 /* No TD is running. Allow any cpu to be offline. */
3324 if (!atomic_read(&nr_configured_hkid))
3325 return 0;
3326
3327 /*
3328 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD),
3329 * TDH.PHYMEM.PAGE.WBINVD must be called on all packages to program
3330 * every memory controller with pconfig. If there is an active TDX
3331 * HKID, refuse to offline the last online CPU of a package.
3332 */
3333 for_each_online_cpu(i) {
3334 /*
3335 * Found another online cpu on the same package.
3336 * Allow to offline.
3337 */
3338 if (i != cpu && topology_physical_package_id(i) ==
3339 topology_physical_package_id(cpu))
3340 return 0;
3341 }
3342
3343 /*
3344 * This is the last online cpu of this package. Don't offline it.
3345 *
3346 * Because it's hard for a human operator to understand the
3347 * reason, print a warning.
3348 */
3349 #define MSG_ALLPKG_ONLINE \
3350 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3351 pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3352 return -EBUSY;
3353 }
3354
3355 static void __do_tdx_cleanup(void)
3356 {
3357 /*
3358 * Once the TDX module is initialized, it cannot be disabled and
3359 * re-initialized without a runtime update (which isn't
3360 * supported by the kernel). Only the cpuhp state needs to be
3361 * removed here. The TDX host core code tracks TDX status and
3362 * can handle the 'multiple enabling' scenario.
3363 */
3364 WARN_ON_ONCE(!tdx_cpuhp_state);
3365 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3366 tdx_cpuhp_state = 0;
3367 }
3368
3369 static void __tdx_cleanup(void)
3370 {
3371 cpus_read_lock();
3372 __do_tdx_cleanup();
3373 cpus_read_unlock();
3374 }
3375
3376 static int __init __do_tdx_bringup(void)
3377 {
3378 int r;
3379
3380 /*
3381 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3382 * online CPUs before calling tdx_enable(), and on any new
3383 * going-online CPU to make sure it is ready for TDX guest.
3384 */
3385 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3386 "kvm/cpu/tdx:online",
3387 tdx_online_cpu, tdx_offline_cpu);
3388 if (r < 0)
3389 return r;
3390
3391 tdx_cpuhp_state = r;
3392
3393 r = tdx_enable();
3394 if (r)
3395 __do_tdx_cleanup();
3396
3397 return r;
3398 }
3399
3400 static int __init __tdx_bringup(void)
3401 {
3402 const struct tdx_sys_info_td_conf *td_conf;
3403 int r, i;
3404
3405 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3406 /*
3407 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3408 * before returning to user space.
3409 */
3410 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3411 if (tdx_uret_msrs[i].slot == -1) {
3412 /* If any MSR isn't supported, it is a KVM bug */
3413 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3414 tdx_uret_msrs[i].msr);
3415 return -EIO;
3416 }
3417 }
3418
3419 /*
3420 * Enabling TDX requires enabling hardware virtualization first,
3421 * as making SEAMCALLs requires CPU being in post-VMXON state.
3422 */
3423 r = kvm_enable_virtualization();
3424 if (r)
3425 return r;
3426
3427 cpus_read_lock();
3428 r = __do_tdx_bringup();
3429 cpus_read_unlock();
3430
3431 if (r)
3432 goto tdx_bringup_err;
3433
3434 r = -EINVAL;
3435 /* Get TDX global information for later use */
3436 tdx_sysinfo = tdx_get_sysinfo();
3437 if (WARN_ON_ONCE(!tdx_sysinfo))
3438 goto get_sysinfo_err;
3439
3440 /* Check TDX module and KVM capabilities */
3441 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3442 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3443 goto get_sysinfo_err;
3444
3445 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3446 goto get_sysinfo_err;
3447
3448 /*
3449 * TDX has its own limit of maximum vCPUs it can support for all
3450 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
3451 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3452 * extension on per-VM basis.
3453 *
3454 * TDX module reports such limit via the MAX_VCPU_PER_TD global
3455 * metadata. Different modules may report different values.
3456 * Some old module may also not support this metadata (in which
3457 * case this limit is U16_MAX).
3458 *
3459 * In practice, the reported value reflects the maximum logical
3460 * CPUs that ALL the platforms that the module supports can
3461 * possibly have.
3462 *
3463 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3464 * result in an unpredictable ABI. KVM instead always advertises
3465 * the number of logical CPUs the platform has as the maximum
3466 * vCPUs for TDX guests.
3467 *
3468 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3469 * smaller than the number of logical CPUs, otherwise KVM will
3470 * report an unsupported value to userspace.
3471 *
3472 * Note, a platform with TDX enabled in the BIOS cannot support
3473 * physical CPU hotplug, and TDX requires the BIOS has marked
3474 * all logical CPUs in MADT table as enabled. Just use
3475 * num_present_cpus() for the number of logical CPUs.
3476 */
	td_conf = &tdx_sysinfo->td_conf;
	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
		       td_conf->max_vcpus_per_td, num_present_cpus());
		goto get_sysinfo_err;
	}

	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
		goto get_sysinfo_err;

	/*
	 * Leave hardware virtualization enabled after TDX is enabled
	 * successfully.  TDX CPU hotplug depends on this.
	 */
	return 0;

get_sysinfo_err:
	__tdx_cleanup();
tdx_bringup_err:
	kvm_disable_virtualization();
	return r;
}

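/*
 * Undo tdx_bringup(): drop the misc cgroup TDX KeyID capacity, tear down
 * the TDX cpuhp state and release the virtualization-enable reference
 * taken during bringup.  A no-op when TDX was never enabled.
 */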
void tdx_cleanup(void)
{
	if (enable_tdx) {
		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
		__tdx_cleanup();
		kvm_disable_virtualization();
	}
}

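/*
 * Final TDX bringup, after kvm_x86_ops have been finalized.  Verify the
 * module parameters and CPU features TDX depends on, then initialize the
 * TDX module.  A missing prerequisite or a missing TDX module disables
 * TDX (enable_tdx = 0) but lets KVM load; only unexpected errors from
 * __tdx_bringup() fail the module load.
 */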
int __init tdx_bringup(void)
{
	int r, i;

	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
	for_each_possible_cpu(i)
		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));

	if (!enable_tdx)
		return 0;

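	/*
	 * TDX hard-depends on the module params and CPU features checked
	 * below; if any is missing, log why and fall back to plain VMX.
	 */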
	if (!enable_ept) {
		pr_err("EPT is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
		pr_err("TDP MMU, MMIO caching and EPT A/D bits are required for TDX\n");
		goto success_disable_tdx;
	}

	if (!enable_apicv) {
		pr_err("APICv is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
		pr_err("tdx: OSXSAVE is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
		pr_err("tdx: MOVDIR64B is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
		pr_err("Self-snoop is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
		pr_err("tdx: no TDX private KeyIDs available\n");
		goto success_disable_tdx;
	}

	if (!enable_virt_at_load) {
		pr_err("tdx: TDX requires kvm.enable_virt_at_load=1\n");
		goto success_disable_tdx;
	}

	/*
	 * Ideally KVM should probe whether the TDX module has been loaded
	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
	 * to probe whether the module is loaded (there is no CPUID or MSR
	 * for that), and making a SEAMCALL requires enabling virtualization
	 * first, just like the rest of the steps for bringing up the TDX
	 * module.
	 *
	 * So, for simplicity do everything in __tdx_bringup(); the first
	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
	 * only complication is having to make sure that initialization
	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
	 * cases.
	 */
	r = __tdx_bringup();
	if (r) {
		/*
		 * Disable TDX, but don't fail to load the KVM module, if
		 * the TDX module itself could not be loaded.  There is no
		 * need to print a "module is not loaded" message here,
		 * because one was printed when the first SEAMCALL failed.
		 * Don't bother unwinding the S-EPT hooks or vm_size, as
		 * kvm_x86_ops have already been finalized (and are
		 * intentionally not exported).  The S-EPT code is
		 * unreachable, and allocating a few more bytes per VM in a
		 * should-be-rare failure scenario is a non-issue.
		 */
		if (r == -ENODEV)
			goto success_disable_tdx;

		enable_tdx = 0;
	}

	return r;

success_disable_tdx:
	enable_tdx = 0;
	return 0;
}

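/*
 * Runs during hardware setup, before kvm_x86_ops are finalized: account
 * for struct kvm_tdx in vm_size and install the TDX S-EPT and
 * protected-APIC callbacks into vt_x86_ops.
 */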
void __init tdx_hardware_setup(void)
{
	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);

	/*
	 * Note, if the TDX module can't be loaded, KVM TDX support will be
	 * disabled but KVM will continue loading (see tdx_bringup()).
	 */
	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));

	vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}