xref: /linux/arch/x86/kvm/vmx/tdx.c (revision fbf5df34a4dbcd09d433dd4f0916bf9b2ddb16de)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include <asm/virt.h>
10 #include "capabilities.h"
11 #include "mmu.h"
12 #include "x86_ops.h"
13 #include "lapic.h"
14 #include "tdx.h"
15 #include "vmx.h"
16 #include "mmu/spte.h"
17 #include "common.h"
18 #include "posted_intr.h"
19 #include "irq.h"
20 #include <trace/events/kvm.h>
21 #include "trace.h"
22 
/*
 * Poison the identifier to_vmx: GCC turns any later use of a poisoned
 * identifier into a compile error.  NOTE(review): presumably this keeps TDX
 * code from treating a TDX vCPU as a VMX vCPU - confirm.
 */
#pragma GCC poison to_vmx

/* Prefix every pr_*() message emitted after this point with the module name. */
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
27 
/*
 * Common implementation of the TDX_BUG_ON*() helpers.
 *
 * @__err:  SEAMCALL status; non-zero means failure.  Evaluated exactly once.
 * @__f:    string literal naming the SEAMCALL, pasted into the log message.
 * @__kvm:  VM to mark as bugged on failure, or NULL if there is none.
 * @__fmt:  extra printk format appended after the error code.
 * @__args: arguments consumed by @__fmt.
 *
 * On failure, warn once unless the VM has already been marked bugged (a NULL
 * @__kvm always warns), mark the VM bugged when one is given, and log the
 * error code rate-limited.
 *
 * Evaluates to true if @__err is non-zero, false otherwise.
 */
#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...)			\
({										\
	struct kvm *_kvm = (__kvm);						\
	u64 __tdx_err = (__err);						\
	bool __ret = !!__tdx_err;						\
										\
	if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) {		\
		if (_kvm)							\
			kvm_vm_bugged(_kvm);					\
		pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
				   __tdx_err, __args);				\
	}									\
	unlikely(__ret);							\
})
41 
/*
 * Convenience wrappers around __TDX_BUG_ON() for SEAMCALLs that report zero
 * to three extra 64-bit values.  __fn and every aN are stringified, so the
 * message names the SEAMCALL and each value by the identifier the caller
 * passed in; each aN is printed as "<name> 0x<value>".
 *
 * All variants evaluate to true when __err is non-zero.
 */
#define TDX_BUG_ON(__err, __fn, __kvm)				\
	__TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")

#define TDX_BUG_ON_1(__err, __fn, a1, __kvm)			\
	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)

#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm)	\
	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)

#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm)	\
	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
		     a1, a2, a3)
54 
55 
/* Backing variable of the read-only (0444) "tdx" module parameter. */
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);

/*
 * GPA bit 51 and GPA bit 47 expressed as guest frame numbers.
 * NOTE(review): presumably the shared-bit position for 5-level and 4-level
 * guest paging respectively - confirm against the users of these macros.
 */
#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))

/* TDX system information; its td_conf member is read by td_init_cpuid_entry2(). */
static const struct tdx_sys_info *tdx_sysinfo;
63 
64 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
65 {
66 	KVM_BUG_ON(1, tdx->vcpu.kvm);
67 	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
68 }
69 
70 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
71 		      u64 val, u64 err)
72 {
73 	KVM_BUG_ON(1, tdx->vcpu.kvm);
74 	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
75 }
76 
/*
 * TD attribute bits KVM is willing to support; tdx_get_supported_attrs()
 * checks this set against the TDX module's fixed-0/fixed-1 attribute masks.
 */
#define KVM_SUPPORTED_TDX_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
78 
79 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
80 {
81 	return container_of(kvm, struct kvm_tdx, kvm);
82 }
83 
84 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
85 {
86 	return container_of(vcpu, struct vcpu_tdx, vcpu);
87 }
88 
89 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
90 {
91 	u64 val = KVM_SUPPORTED_TDX_TD_ATTRS;
92 
93 	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
94 		return 0;
95 
96 	val &= td_conf->attributes_fixed0;
97 
98 	return val;
99 }
100 
101 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
102 {
103 	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
104 
105 	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
106 		return 0;
107 
108 	val &= td_conf->xfam_fixed0;
109 
110 	return val;
111 }
112 
113 static int tdx_get_guest_phys_addr_bits(const u32 eax)
114 {
115 	return (eax & GENMASK(23, 16)) >> 16;
116 }
117 
118 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
119 {
120 	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
121 }
122 
/* Feature-bit mask covering both TSX features (HLE and RTM); tested against CPUID.7.0:EBX by has_tsx(). */
#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
124 
125 static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
126 {
127 	return entry->function == 7 && entry->index == 0 &&
128 	       (entry->ebx & TDX_FEATURE_TSX);
129 }
130 
131 static void clear_tsx(struct kvm_cpuid_entry2 *entry)
132 {
133 	entry->ebx &= ~TDX_FEATURE_TSX;
134 }
135 
136 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
137 {
138 	return entry->function == 7 && entry->index == 0 &&
139 	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
140 }
141 
142 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
143 {
144 	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
145 }
146 
147 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
148 {
149 	if (has_tsx(entry))
150 		clear_tsx(entry);
151 
152 	if (has_waitpkg(entry))
153 		clear_waitpkg(entry);
154 }
155 
156 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
157 {
158 	return has_tsx(entry) || has_waitpkg(entry);
159 }
160 
/* Sub-leaf value (all ones) that td_init_cpuid_entry2() converts to index 0. */
#define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
162 
163 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
164 {
165 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
166 
167 	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
168 	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
169 	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
170 	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
171 	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
172 	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
173 
174 	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
175 		entry->index = 0;
176 
177 	/*
178 	 * The TDX module doesn't allow configuring the guest phys addr bits
179 	 * (EAX[23:16]).  However, KVM uses it as an interface to the userspace
180 	 * to configure the GPAW.  Report these bits as configurable.
181 	 */
182 	if (entry->function == 0x80000008)
183 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
184 
185 	tdx_clear_unsupported_cpuid(entry);
186 }
187 
/* Bit 1; reported to userspace in user_tdvmcallinfo_1_r11 by init_kvm_tdx_caps(). */
#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT	BIT(1)
189 
190 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
191 			     struct kvm_tdx_capabilities *caps)
192 {
193 	int i;
194 
195 	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
196 	if (!caps->supported_attrs)
197 		return -EIO;
198 
199 	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
200 	if (!caps->supported_xfam)
201 		return -EIO;
202 
203 	caps->cpuid.nent = td_conf->num_cpuid_config;
204 
205 	caps->user_tdvmcallinfo_1_r11 =
206 		TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
207 
208 	for (i = 0; i < td_conf->num_cpuid_config; i++)
209 		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
210 
211 	return 0;
212 }
213 
/*
 * Some SEAMCALLs acquire the TDX module globally, and can fail with
 * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
 * In the visible part of this file it is held by tdx_mmu_release_hkid()
 * across the VP-flush-done, cache write-back and key-free SEAMCALLs.
 */
static DEFINE_MUTEX(tdx_lock);
219 
220 static bool tdx_operand_busy(u64 err)
221 {
222 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
223 }
224 
225 
/*
 * A per-CPU list of TD vCPUs associated with a given CPU.
 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
 * list.
 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
 *   the old CPU during the IPI callback running on the old CPU, and then added
 *   to the per-CPU list of the new CPU.
 * - When a TD is tearing down, all vCPUs are disassociated from their current
 *   running CPUs and removed from the per-CPU list during the IPI callback
 *   running on those CPUs.
 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
 *   associated TD vCPUs and remove them from the per-CPU list.
 *
 * Entries are linked through vcpu_tdx.cpu_list.
 */
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
240 
241 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
242 {
243 	return to_tdx(vcpu)->vp_enter_args.r10;
244 }
245 
246 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
247 {
248 	return to_tdx(vcpu)->vp_enter_args.r11;
249 }
250 
251 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
252 						     long val)
253 {
254 	to_tdx(vcpu)->vp_enter_args.r10 = val;
255 }
256 
257 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
258 						    unsigned long val)
259 {
260 	to_tdx(vcpu)->vp_enter_args.r11 = val;
261 }
262 
/*
 * Give the TD's HKID back to the guest KeyID allocator and undo the misc
 * cgroup accounting attached to it.  The statement order is kept as is: the
 * KeyID is freed before the cgroup charge is dropped.
 */
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
	tdx_guest_keyid_free(kvm_tdx->hkid);
	/* A non-positive hkid makes is_hkid_assigned() return false. */
	kvm_tdx->hkid = -1;
	/* Uncharge one MISC_CG_RES_TDX unit, then release the cgroup pointer. */
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}
271 
272 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
273 {
274 	return kvm_tdx->hkid > 0;
275 }
276 
/*
 * Remove @vcpu from the per-CPU list it is linked on and mark it as not
 * associated with any CPU (vcpu->cpu = -1).  Must be called with IRQs
 * disabled.
 */
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
	lockdep_assert_irqs_disabled();

	list_del(&to_tdx(vcpu)->cpu_list);

	/*
	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
	 * to its list before it's deleted from this CPU's list.
	 */
	smp_wmb();

	vcpu->cpu = -1;
}
292 
/*
 * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
 * retry (if necessary) after forcing vCPUs to exit and wait for the operation
 * to complete.  All flows that remove/block S-EPT entries run with mmu_lock
 * held for write, i.e. are mutually exclusive with each other, but they aren't
 * mutually exclusive with running vCPUs, and so can fail with "operand busy"
 * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
 *
 * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
 *
 * The macro evaluates to the status of the last tdh_func() invocation.  Both
 * "kvm" and "args" are expanded more than once, so callers must not pass
 * expressions with side effects.
 */
#define tdh_do_no_vcpus(tdh_func, kvm, args...)					\
({										\
	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);				\
	u64 __err;								\
										\
	lockdep_assert_held_write(&kvm->mmu_lock);				\
										\
	__err = tdh_func(args);							\
	if (unlikely(tdx_operand_busy(__err))) {				\
		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);			\
		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);	\
										\
		__err = tdh_func(args);						\
										\
		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);		\
	}									\
	__err;									\
})
321 
322 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
323 static int __tdx_reclaim_page(struct page *page)
324 {
325 	u64 err, rcx, rdx, r8;
326 
327 	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
328 
329 	/*
330 	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
331 	 * before the HKID is released and control pages have also been
332 	 * released at this point, so there is no possibility of contention.
333 	 */
334 	if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
335 		return -EIO;
336 
337 	return 0;
338 }
339 
/*
 * Reclaim @page and, only if that succeeded, run tdx_quirk_reset_page() on it.
 * Returns the result of __tdx_reclaim_page(): 0 on success, -EIO on failure.
 */
static int tdx_reclaim_page(struct page *page)
{
	int ret = __tdx_reclaim_page(page);

	if (ret)
		return ret;

	tdx_quirk_reset_page(page);
	return 0;
}
349 
350 
/*
 * Reclaim and free a TD control page that is crypto-protected by the TDX
 * guest's private KeyID.  The caller is assumed to have flushed the cache
 * associated with that KeyID already.
 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
{
	/*
	 * Only hand the page back to the allocator once it has been
	 * reclaimed.  If reclaim failed the kernel can no longer use the page
	 * safely, so it is deliberately leaked.
	 */
	if (!tdx_reclaim_page(ctrl_page))
		__free_page(ctrl_page);
}
367 
/* Argument block passed to tdx_flush_vp(). */
struct tdx_flush_vp_arg {
	/* in: the vCPU to flush */
	struct kvm_vcpu *vcpu;
	/* out: 0, or the TDH.VP.FLUSH error to be reported by the caller */
	u64 err;
};
372 
373 static void tdx_flush_vp(void *_arg)
374 {
375 	struct tdx_flush_vp_arg *arg = _arg;
376 	struct kvm_vcpu *vcpu = arg->vcpu;
377 	u64 err;
378 
379 	arg->err = 0;
380 	lockdep_assert_irqs_disabled();
381 
382 	/* Task migration can race with CPU offlining. */
383 	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
384 		return;
385 
386 	/*
387 	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
388 	 * list tracking still needs to be updated so that it's correct if/when
389 	 * the vCPU does get initialized.
390 	 */
391 	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
392 		/*
393 		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are:
394 		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
395 		 * vp flush function is called when destructing vCPU/TD or vCPU
396 		 * migration.  No other thread uses TDVPR in those cases.
397 		 */
398 		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
399 		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
400 			/*
401 			 * This function is called in IPI context. Do not use
402 			 * printk to avoid console semaphore.
403 			 * The caller prints out the error message, instead.
404 			 */
405 			if (err)
406 				arg->err = err;
407 		}
408 	}
409 
410 	tdx_disassociate_vp(vcpu);
411 }
412 
413 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
414 {
415 	struct tdx_flush_vp_arg arg = {
416 		.vcpu = vcpu,
417 	};
418 	int cpu = vcpu->cpu;
419 
420 	if (unlikely(cpu == -1))
421 		return;
422 
423 	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
424 
425 	TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
426 }
427 
428 void tdx_disable_virtualization_cpu(void)
429 {
430 	int cpu = raw_smp_processor_id();
431 	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
432 	struct tdx_flush_vp_arg arg;
433 	struct vcpu_tdx *tdx, *tmp;
434 	unsigned long flags;
435 
436 	local_irq_save(flags);
437 	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
438 	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
439 		arg.vcpu = &tdx->vcpu;
440 		tdx_flush_vp(&arg);
441 	}
442 	local_irq_restore(flags);
443 
444 	/*
445 	 * Flush cache now if kexec is possible: this is necessary to avoid
446 	 * having dirty private memory cachelines when the new kernel boots,
447 	 * but WBINVD is a relatively expensive operation and doing it during
448 	 * kexec can exacerbate races in native_stop_other_cpus().  Do it
449 	 * now, since this is a safe moment and there is going to be no more
450 	 * TDX activity on this CPU from this point on.
451 	 */
452 	tdx_cpu_flush_cache_for_kexec();
453 }
454 
/* Upper bound on attempts in the retry loop of smp_func_do_phymem_cache_wb(). */
#define TDX_SEAMCALL_RETRIES 10000
456 
457 static void smp_func_do_phymem_cache_wb(void *unused)
458 {
459 	u64 err = 0;
460 	bool resume;
461 	int i;
462 
463 	/*
464 	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
465 	 * KeyID on the package or core.  The TDX module may not finish the
466 	 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
467 	 * kernel should retry it until it returns success w/o rescheduling.
468 	 */
469 	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
470 		resume = !!err;
471 		err = tdh_phymem_cache_wb(resume);
472 		switch (err) {
473 		case TDX_INTERRUPTED_RESUMABLE:
474 			continue;
475 		case TDX_NO_HKID_READY_TO_WBCACHE:
476 			err = TDX_SUCCESS; /* Already done by other thread */
477 			fallthrough;
478 		default:
479 			goto out;
480 		}
481 	}
482 
483 out:
484 	TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
485 }
486 
/*
 * Release the HKID of a TD that is being destroyed.
 *
 * Flushes every vCPU off the CPU it is associated with, tells the TDX module
 * that VP flushing is done, writes back the caches for private KeyIDs, and
 * finally frees the key ID.  On any SEAMCALL failure the HKID stays assigned
 * (i.e. is leaked), which later reclaim paths detect via is_hkid_assigned().
 *
 * Does nothing if no HKID is assigned.
 */
void tdx_mmu_release_hkid(struct kvm *kvm)
{
	bool packages_allocated, targets_allocated;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages, targets;
	struct kvm_vcpu *vcpu;
	unsigned long j;
	int i;
	u64 err;

	if (!is_hkid_assigned(kvm_tdx))
		return;

	/* Allocation failure is tolerated; see the fallbacks below. */
	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
	cpus_read_lock();

	kvm_for_each_vcpu(j, vcpu, kvm)
		tdx_flush_vp_on_cpu(vcpu);

	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
	 * Multiple TDX guests can be destroyed simultaneously. Take the
	 * mutex to prevent it from getting error.
	 */
	mutex_lock(&tdx_lock);

	/*
	 * Releasing HKID is in vm_destroy().
	 * After the above flushing vps, there should be no more vCPU
	 * associations, as all vCPU fds have been released at this stage.
	 */
	err = tdh_mng_vpflushdone(&kvm_tdx->td);
	/* TDX_FLUSHVP_NOT_DONE bails out silently, without a warning. */
	if (err == TDX_FLUSHVP_NOT_DONE)
		goto out;
	if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		goto out;
	}

	/*
	 * Pick the first online CPU seen for each physical package as the
	 * write-back target.  If "packages" could not be allocated, every
	 * online CPU becomes a target.
	 */
	for_each_online_cpu(i) {
		if (packages_allocated &&
		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
					     packages))
			continue;
		if (targets_allocated)
			cpumask_set_cpu(i, targets);
	}
	/* Without a "targets" mask, fall back to running on all CPUs. */
	if (targets_allocated)
		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
	else
		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
	/*
	 * In the case of error in smp_func_do_phymem_cache_wb(), the following
	 * tdh_mng_key_freeid() will fail.
	 */
	err = tdh_mng_key_freeid(&kvm_tdx->td);
	if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
	} else {
		tdx_hkid_free(kvm_tdx);
	}

out:
	mutex_unlock(&tdx_lock);
	cpus_read_unlock();
	free_cpumask_var(targets);
	free_cpumask_var(packages);
}
559 
560 static void tdx_reclaim_td_control_pages(struct kvm *kvm)
561 {
562 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
563 	u64 err;
564 	int i;
565 
566 	/*
567 	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
568 	 * heavily with TDX module.  Give up freeing TD pages.  As the function
569 	 * already warned, don't warn it again.
570 	 */
571 	if (is_hkid_assigned(kvm_tdx))
572 		return;
573 
574 	if (kvm_tdx->td.tdcs_pages) {
575 		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
576 			if (!kvm_tdx->td.tdcs_pages[i])
577 				continue;
578 
579 			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
580 		}
581 		kfree(kvm_tdx->td.tdcs_pages);
582 		kvm_tdx->td.tdcs_pages = NULL;
583 	}
584 
585 	if (!kvm_tdx->td.tdr_page)
586 		return;
587 
588 	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
589 		return;
590 
591 	/*
592 	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
593 	 * KeyID. TDX module may access TDR while operating on TD (Especially
594 	 * when it is reclaiming TDCS).
595 	 */
596 	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
597 	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
598 		return;
599 
600 	tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
601 
602 	__free_page(kvm_tdx->td.tdr_page);
603 	kvm_tdx->td.tdr_page = NULL;
604 }
605 
606 void tdx_vm_destroy(struct kvm *kvm)
607 {
608 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
609 
610 	tdx_reclaim_td_control_pages(kvm);
611 
612 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
613 }
614 
615 static int tdx_do_tdh_mng_key_config(void *param)
616 {
617 	struct kvm_tdx *kvm_tdx = param;
618 	u64 err;
619 
620 	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
621 	err = tdh_mng_key_config(&kvm_tdx->td);
622 	if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
623 		return -EIO;
624 
625 	return 0;
626 }
627 
628 int tdx_vm_init(struct kvm *kvm)
629 {
630 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
631 
632 	kvm->arch.has_protected_state = true;
633 	/*
634 	 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
635 	 * i.e. all EOIs are accelerated and never trigger exits.
636 	 */
637 	kvm->arch.has_protected_eoi = true;
638 	kvm->arch.has_private_mem = true;
639 	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
640 
641 	/*
642 	 * Because guest TD is protected, VMM can't parse the instruction in TD.
643 	 * Instead, guest uses MMIO hypercall.  For unmodified device driver,
644 	 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
645 	 * instruction into MMIO hypercall.
646 	 *
647 	 * SPTE value for MMIO needs to be setup so that #VE is injected into
648 	 * TD instead of triggering EPT MISCONFIG.
649 	 * - RWX=0 so that EPT violation is triggered.
650 	 * - suppress #VE bit is cleared to inject #VE.
651 	 */
652 	kvm_mmu_set_mmio_spte_value(kvm, 0);
653 
654 	/*
655 	 * TDX has its own limit of maximum vCPUs it can support for all
656 	 * TDX guests in addition to KVM_MAX_VCPUS.  TDX module reports
657 	 * such limit via the MAX_VCPU_PER_TD global metadata.  In
658 	 * practice, it reflects the number of logical CPUs that ALL
659 	 * platforms that the TDX module supports can possibly have.
660 	 *
661 	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
662 	 * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
663 	 * userspace would result in an unpredictable ABI.
664 	 */
665 	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
666 
667 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
668 
669 	return 0;
670 }
671 
672 int tdx_vcpu_create(struct kvm_vcpu *vcpu)
673 {
674 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
675 	struct vcpu_tdx *tdx = to_tdx(vcpu);
676 
677 	if (kvm_tdx->state != TD_STATE_INITIALIZED)
678 		return -EIO;
679 
680 	/*
681 	 * TDX module mandates APICv, which requires an in-kernel local APIC.
682 	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
683 	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
684 	 */
685 	if (!irqchip_split(vcpu->kvm))
686 		return -EINVAL;
687 
688 	fpstate_set_confidential(&vcpu->arch.guest_fpu);
689 	vcpu->arch.apic->guest_apic_protected = true;
690 	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
691 
692 	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
693 
694 	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
695 	vcpu->arch.cr0_guest_owned_bits = -1ul;
696 	vcpu->arch.cr4_guest_owned_bits = -1ul;
697 
698 	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
699 	vcpu->arch.guest_tsc_protected = true;
700 	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
701 	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
702 	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
703 	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
704 
705 	vcpu->arch.guest_state_protected =
706 		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
707 
708 	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
709 		vcpu->arch.xfd_no_write_intercept = true;
710 
711 	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
712 	__pi_set_sn(&tdx->vt.pi_desc);
713 
714 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
715 
716 	return 0;
717 }
718 
/*
 * Load @vcpu onto @cpu.  After the posted-interrupt load, nothing more is
 * done if the vCPU is already associated with @cpu or the TD has no HKID.
 * Otherwise the vCPU is flushed off its previous CPU and then linked onto
 * @cpu's associated_tdvcpus list.
 */
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	vmx_vcpu_pi_load(vcpu, cpu);
	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
		return;

	/* Removes the vCPU from the old CPU's list via IPI, if it is on one. */
	tdx_flush_vp_on_cpu(vcpu);

	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
	/* The per-CPU list is protected by disabling interrupts. */
	local_irq_disable();
	/*
	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
	 * vcpu->cpu is read before tdx->cpu_list.
	 */
	smp_rmb();

	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();
}
740 
741 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
742 {
743 	/*
744 	 * KVM can't get the interrupt status of TDX guest and it assumes
745 	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
746 	 * which passes the interrupt blocked flag.
747 	 */
748 	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
749 	       !to_tdx(vcpu)->vp_enter_args.r12;
750 }
751 
752 static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
753 {
754 	u64 vcpu_state_details;
755 
756 	if (pi_has_pending_interrupt(vcpu))
757 		return true;
758 
759 	/*
760 	 * Only check RVI pending for HALTED case with IRQ enabled.
761 	 * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
762 	 * interrupt was pending before TD exit, then it _must_ be blocked,
763 	 * otherwise the interrupt would have been serviced at the instruction
764 	 * boundary.
765 	 */
766 	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
767 	    to_tdx(vcpu)->vp_enter_args.r12)
768 		return false;
769 
770 	vcpu_state_details =
771 		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
772 
773 	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
774 }
775 
/* One user-return MSR tracked for TDX; see tdx_prepare_switch_to_guest(). */
struct tdx_uret_msr {
	/* MSR index */
	u32 msr;
	/* slot number passed to kvm_set_user_return_msr() */
	unsigned int slot;
	/* value passed to kvm_set_user_return_msr() before entering the guest */
	u64 defval;
};
781 
/*
 * User-return MSRs handled for TDX.  Entries without an explicit .defval
 * default to 0.  .slot is not set here; NOTE(review): presumably filled in
 * by setup code outside this view - confirm.
 */
static struct tdx_uret_msr tdx_uret_msrs[] = {
	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
	{.msr = MSR_STAR,},
	{.msr = MSR_LSTAR,},
	{.msr = MSR_TSC_AUX,},
};
788 
789 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
790 {
791 	struct vcpu_vt *vt = to_vt(vcpu);
792 	int i;
793 
794 	if (vt->guest_state_loaded)
795 		return;
796 
797 	if (likely(is_64bit_mm(current->mm)))
798 		vt->msr_host_kernel_gs_base = current->thread.gsbase;
799 	else
800 		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
801 
802 	vt->guest_state_loaded = true;
803 
804 	/*
805 	 * Explicitly set user-return MSRs that are clobbered by the TDX-Module
806 	 * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
807 	 * written by the TDX-Module.  Don't rely on the TDX-Module to actually
808 	 * clobber the MSRs, as the contract is poorly defined and not upheld.
809 	 * E.g. the TDX-Module will synthesize an EPT Violation without doing
810 	 * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
811 	 * state.
812 	 */
813 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
814 		kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
815 					tdx_uret_msrs[i].defval, -1ull);
816 }
817 
818 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
819 {
820 	struct vcpu_vt *vt = to_vt(vcpu);
821 
822 	if (!vt->guest_state_loaded)
823 		return;
824 
825 	++vcpu->stat.host_state_reload;
826 	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
827 
828 	vt->guest_state_loaded = false;
829 }
830 
/*
 * Counterpart of tdx_vcpu_load(): run the posted-interrupt put hook and then
 * restore host state if guest state is still loaded.
 */
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);
	tdx_prepare_switch_to_host(vcpu);
}
836 
837 /*
838  * Life cycles for a TD and a vCPU:
839  * 1. KVM_CREATE_VM ioctl.
840  *    TD state is TD_STATE_UNINITIALIZED.
841  *    hkid is not assigned at this stage.
842  * 2. KVM_TDX_INIT_VM ioctl.
843  *    TD transitions to TD_STATE_INITIALIZED.
844  *    hkid is assigned after this stage.
845  * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
846  *    3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
847  *    3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
848  *    3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
849  *        kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
850  * 4. KVM_TDX_INIT_VCPU ioctl.
851  *    tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
852  *    vCPU control structures are allocated at this stage.
853  * 5. kvm_destroy_vm().
854  *    5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
855  *                                (2) puts hkid to !assigned state.
856  *    5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
857  *        transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
858  *    5.3 tdx_vm_destroy()
859  *        transitions TD to TD_STATE_UNINITIALIZED state.
860  *
861  * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
862  * - If at 3.3, hkid is still assigned, but the vCPU must be in
863  *   VCPU_TD_STATE_UNINITIALIZED state.
864  * - if at 5.2, hkid must be !assigned and all vCPUs must be in
865  *   VCPU_TD_STATE_INITIALIZED state and have been dissociated.
866  */
867 void tdx_vcpu_free(struct kvm_vcpu *vcpu)
868 {
869 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
870 	struct vcpu_tdx *tdx = to_tdx(vcpu);
871 	int i;
872 
873 	if (vcpu->cpu != -1) {
874 		KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
875 		tdx_flush_vp_on_cpu(vcpu);
876 		return;
877 	}
878 
879 	/*
880 	 * It is not possible to reclaim pages while hkid is assigned. It might
881 	 * be assigned if the TD VM is being destroyed but freeing hkid failed,
882 	 * in which case the pages are leaked.
883 	 */
884 	if (is_hkid_assigned(kvm_tdx))
885 		return;
886 
887 	if (tdx->vp.tdcx_pages) {
888 		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
889 			if (tdx->vp.tdcx_pages[i])
890 				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
891 		}
892 		kfree(tdx->vp.tdcx_pages);
893 		tdx->vp.tdcx_pages = NULL;
894 	}
895 	if (tdx->vp.tdvpr_page) {
896 		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
897 		tdx->vp.tdvpr_page = NULL;
898 		tdx->vp.tdvpr_pa = 0;
899 	}
900 
901 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
902 }
903 
904 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
905 {
906 	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
907 		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
908 		return -EINVAL;
909 
910 	return 1;
911 }
912 
913 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
914 {
915 	switch (tdvmcall_leaf(vcpu)) {
916 	case EXIT_REASON_CPUID:
917 	case EXIT_REASON_HLT:
918 	case EXIT_REASON_IO_INSTRUCTION:
919 	case EXIT_REASON_MSR_READ:
920 	case EXIT_REASON_MSR_WRITE:
921 		return tdvmcall_leaf(vcpu);
922 	case EXIT_REASON_EPT_VIOLATION:
923 		return EXIT_REASON_EPT_MISCONFIG;
924 	default:
925 		break;
926 	}
927 
928 	return EXIT_REASON_TDCALL;
929 }
930 
931 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
932 {
933 	struct vcpu_tdx *tdx = to_tdx(vcpu);
934 	u32 exit_reason;
935 
936 	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
937 	case TDX_SUCCESS:
938 	case TDX_NON_RECOVERABLE_VCPU:
939 	case TDX_NON_RECOVERABLE_TD:
940 	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
941 	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
942 		break;
943 	default:
944 		return -1u;
945 	}
946 
947 	exit_reason = tdx->vp_enter_ret;
948 
949 	switch (exit_reason) {
950 	case EXIT_REASON_TDCALL:
951 		if (tdvmcall_exit_type(vcpu))
952 			return EXIT_REASON_VMCALL;
953 
954 		return tdcall_to_vmx_exit_reason(vcpu);
955 	case EXIT_REASON_EPT_MISCONFIG:
956 		/*
957 		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
958 		 * non-instrumentable code with interrupts disabled.
959 		 */
960 		return -1u;
961 	default:
962 		break;
963 	}
964 
965 	return exit_reason;
966 }
967 
/*
 * Enter the TD via TDH.VP.ENTER and capture the exit information.
 * Non-instrumentable (noinstr); everything between guest_state_enter_irqoff()
 * and guest_state_exit_irqoff() must stay in this order.
 */
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	/* Exit details are taken from the VP.ENTER output registers. */
	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}
988 
989 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
990 {
991 	return vmx_get_exit_reason(vcpu).failed_vmentry &&
992 	       vmx_get_exit_reason(vcpu).full != -1u;
993 }
994 
995 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
996 {
997 	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
998 
999 	/*
1000 	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
1001 	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
1002 	 *
1003 	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
1004 	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
1005 	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
1006 	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
1007 	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
1008 	 * requester may be blocked endlessly.
1009 	 */
1010 	if (unlikely(tdx_operand_busy(vp_enter_ret)))
1011 		return EXIT_FASTPATH_EXIT_HANDLED;
1012 
1013 	return EXIT_FASTPATH_NONE;
1014 }
1015 
/*
 * Registers that stay marked as available after a TD exit: tdx_vcpu_run()
 * masks vcpu->arch.regs_avail with this set after every TDH.VP.ENTER.
 */
#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))
1033 
1034 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
1035 {
1036 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
1037 
1038 	/*
1039 	 * All TDX hosts support PKRU; but even if they didn't,
1040 	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1041 	 * skipped.
1042 	 */
1043 	if (vcpu->arch.host_pkru != 0)
1044 		wrpkru(vcpu->arch.host_pkru);
1045 
1046 	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1047 		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1048 
1049 	/*
1050 	 * Likewise, even if a TDX hosts didn't support XSS both arms of
1051 	 * the comparison would be 0 and the wrmsrl would be skipped.
1052 	 */
1053 	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1054 		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1055 }
1056 
/*
 * DEBUGCTL bits for which tdx_vcpu_run() skips the post-exit
 * update_debugctlmsr(): the host value is rewritten only when it has bits set
 * outside this mask.
 * NOTE(review): presumably these are the bits the TDX module preserves across
 * TDH.VP.ENTER -- confirm against the TDX module specification.
 */
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)
1060 
/*
 * Run the vCPU once: enter the TD, restore the host DEBUGCTL and
 * PKRU/XCR0/XSS state, and classify the exit for the fastpath.
 *
 * Returns EXIT_FASTPATH_EXIT_HANDLED without entering the guest while a
 * SEPT-zap retry is in flight, EXIT_FASTPATH_NONE for an EPT misconfig, a
 * TDX software error or a failed VM-Entry, and otherwise the verdict of
 * tdx_exit_handlers_fastpath().
 */
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * WARN if KVM wants to force an immediate exit, as the TDX module does
	 * not guarantee entry into the guest, i.e. it's possible for KVM to
	 * _think_ it completed entry to the guest and forced an immediate exit
	 * without actually having done so.  Luckily, KVM never needs to force
	 * an immediate exit for TDX (KVM can't do direct event injection), so
	 * just WARN and continue on.
	 */
	WARN_ON_ONCE(run_flags);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
	 */
	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);

	/*
	 * The posted-interrupt descriptor has its ON bit set: send a self-IPI
	 * with the posted-interrupt vector, and if the vector programmed in
	 * LVTT is among the posted vectors, wait for the lapic timer to
	 * expire before entering.
	 */
	if (pi_test_on(&vt->pi_desc)) {
		apic->send_IPI_self(POSTED_INTR_VECTOR);

		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
			       APIC_VECTOR_MASK, &vt->pi_desc))
			kvm_wait_lapic_expire(vcpu);
	}

	tdx_vcpu_enter_exit(vcpu);

	/* Rewrite host DEBUGCTL only if it has bits outside the preserved set. */
	if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	tdx_load_host_xsave_state(vcpu);

	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

	/* The checks below return before trace_kvm_exit() is emitted. */
	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
		return EXIT_FASTPATH_NONE;

	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(tdx_failed_vmentry(vcpu)))
		return EXIT_FASTPATH_NONE;

	return tdx_exit_handlers_fastpath(vcpu);
}
1116 
1117 void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1118 {
1119 	++vcpu->stat.nmi_injections;
1120 	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1121 	/*
1122 	 * From KVM's perspective, NMI injection is completed right after
1123 	 * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
1124 	 * the TDX module or not.
1125 	 */
1126 	vcpu->arch.nmi_injected = false;
1127 	/*
1128 	 * TDX doesn't support KVM to request NMI window exit.  If there is
1129 	 * still a pending vNMI, KVM is not able to inject it along with the
1130 	 * one pending in TDX module in a back-to-back way.  Since the previous
1131 	 * vNMI is still pending in TDX module, i.e. it has not been delivered
1132 	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
1133 	 * previous one.  The guest is expected to handle all the NMI sources
1134 	 * when handling the first vNMI.
1135 	 */
1136 	vcpu->arch.nmi_pending = 0;
1137 }
1138 
1139 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1140 {
1141 	u32 intr_info = vmx_get_intr_info(vcpu);
1142 
1143 	/*
1144 	 * Machine checks are handled by handle_exception_irqoff(), or by
1145 	 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1146 	 * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
1147 	 */
1148 	if (is_nmi(intr_info) || is_machine_check(intr_info))
1149 		return 1;
1150 
1151 	vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1152 	vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1153 	vcpu->run->ex.error_code = 0;
1154 
1155 	return 0;
1156 }
1157 
1158 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1159 {
1160 	tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1161 	return 1;
1162 }
1163 
1164 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1165 {
1166 	kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1167 	kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1168 	kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1169 	kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1170 	kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1171 
1172 	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1173 }
1174 
/*
 * A TDVMCALL_MAP_GPA request is forwarded to userspace in chunks of at most
 * TDX_MAP_GPA_MAX_LEN bytes, and pending events are checked between chunks.
 * This allows for timely injection of interrupts to prevent issues with guest
 * lockup detection.
 */
#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1182 
1183 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1184 {
1185 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1186 
1187 	if (vcpu->run->hypercall.ret) {
1188 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1189 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1190 		return 1;
1191 	}
1192 
1193 	tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1194 	if (tdx->map_gpa_next >= tdx->map_gpa_end)
1195 		return 1;
1196 
1197 	/*
1198 	 * Stop processing the remaining part if there is a pending interrupt,
1199 	 * which could be qualified to deliver.  Skip checking pending RVI for
1200 	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1201 	 */
1202 	if (kvm_vcpu_has_events(vcpu)) {
1203 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1204 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1205 		return 1;
1206 	}
1207 
1208 	__tdx_map_gpa(tdx);
1209 	return 0;
1210 }
1211 
1212 static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1213 {
1214 	u64 gpa = tdx->map_gpa_next;
1215 	u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1216 
1217 	if (size > TDX_MAP_GPA_MAX_LEN)
1218 		size = TDX_MAP_GPA_MAX_LEN;
1219 
1220 	tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
1221 	tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
1222 	/*
1223 	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1224 	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1225 	 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
1226 	 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
1227 	 */
1228 	tdx->vcpu.run->hypercall.ret = 0;
1229 	tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1230 	tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1231 	tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1232 					   KVM_MAP_GPA_RANGE_ENCRYPTED :
1233 					   KVM_MAP_GPA_RANGE_DECRYPTED;
1234 	tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;
1235 
1236 	tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1237 }
1238 
1239 static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1240 {
1241 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1242 	u64 gpa = tdx->vp_enter_args.r12;
1243 	u64 size = tdx->vp_enter_args.r13;
1244 	u64 ret;
1245 
1246 	/*
1247 	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1248 	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1249 	 * bit set.  This is a base call so it should always be supported, but
1250 	 * KVM has no way to ensure that userspace implements the GHCI correctly.
1251 	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1252 	 * to the guest.
1253 	 */
1254 	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1255 		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1256 		goto error;
1257 	}
1258 
1259 	if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1260 	    !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1261 	    (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1262 	     vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1263 		ret = TDVMCALL_STATUS_INVALID_OPERAND;
1264 		goto error;
1265 	}
1266 
1267 	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1268 		ret = TDVMCALL_STATUS_ALIGN_ERROR;
1269 		goto error;
1270 	}
1271 
1272 	tdx->map_gpa_end = gpa + size;
1273 	tdx->map_gpa_next = gpa;
1274 
1275 	__tdx_map_gpa(tdx);
1276 	return 0;
1277 
1278 error:
1279 	tdvmcall_set_return_code(vcpu, ret);
1280 	tdx->vp_enter_args.r11 = gpa;
1281 	return 1;
1282 }
1283 
1284 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1285 {
1286 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1287 	u64 *regs = vcpu->run->system_event.data;
1288 	u64 *module_regs = &tdx->vp_enter_args.r8;
1289 	int index = VCPU_REGS_RAX;
1290 
1291 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1292 	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1293 	vcpu->run->system_event.ndata = 16;
1294 
1295 	/* Dump 16 general-purpose registers to userspace in ascending order. */
1296 	regs[index++] = tdx->vp_enter_ret;
1297 	regs[index++] = tdx->vp_enter_args.rcx;
1298 	regs[index++] = tdx->vp_enter_args.rdx;
1299 	regs[index++] = tdx->vp_enter_args.rbx;
1300 	regs[index++] = 0;
1301 	regs[index++] = 0;
1302 	regs[index++] = tdx->vp_enter_args.rsi;
1303 	regs[index] = tdx->vp_enter_args.rdi;
1304 	for (index = 0; index < 8; index++)
1305 		regs[VCPU_REGS_R8 + index] = module_regs[index];
1306 
1307 	return 0;
1308 }
1309 
1310 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1311 {
1312 	u32 eax, ebx, ecx, edx;
1313 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1314 
1315 	/* EAX and ECX for cpuid is stored in R12 and R13. */
1316 	eax = tdx->vp_enter_args.r12;
1317 	ecx = tdx->vp_enter_args.r13;
1318 
1319 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1320 
1321 	tdx->vp_enter_args.r12 = eax;
1322 	tdx->vp_enter_args.r13 = ebx;
1323 	tdx->vp_enter_args.r14 = ecx;
1324 	tdx->vp_enter_args.r15 = edx;
1325 
1326 	return 1;
1327 }
1328 
1329 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1330 {
1331 	vcpu->arch.pio.count = 0;
1332 	return 1;
1333 }
1334 
1335 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1336 {
1337 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1338 	unsigned long val = 0;
1339 	int ret;
1340 
1341 	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1342 					 vcpu->arch.pio.port, &val, 1);
1343 
1344 	WARN_ON_ONCE(!ret);
1345 
1346 	tdvmcall_set_return_val(vcpu, val);
1347 
1348 	return 1;
1349 }
1350 
1351 static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1352 {
1353 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1354 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1355 	unsigned long val = 0;
1356 	unsigned int port;
1357 	u64 size, write;
1358 	int ret;
1359 
1360 	++vcpu->stat.io_exits;
1361 
1362 	size = tdx->vp_enter_args.r12;
1363 	write = tdx->vp_enter_args.r13;
1364 	port = tdx->vp_enter_args.r14;
1365 
1366 	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1367 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1368 		return 1;
1369 	}
1370 
1371 	if (write) {
1372 		val = tdx->vp_enter_args.r15;
1373 		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1374 	} else {
1375 		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1376 	}
1377 
1378 	if (!ret)
1379 		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1380 							   tdx_complete_pio_in;
1381 	else if (!write)
1382 		tdvmcall_set_return_val(vcpu, val);
1383 
1384 	return ret;
1385 }
1386 
1387 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1388 {
1389 	unsigned long val = 0;
1390 	gpa_t gpa;
1391 	int size;
1392 
1393 	gpa = vcpu->mmio_fragments[0].gpa;
1394 	size = vcpu->mmio_fragments[0].len;
1395 
1396 	memcpy(&val, vcpu->run->mmio.data, size);
1397 	tdvmcall_set_return_val(vcpu, val);
1398 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1399 	return 1;
1400 }
1401 
1402 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1403 				 unsigned long val)
1404 {
1405 	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1406 		trace_kvm_fast_mmio(gpa);
1407 		return 0;
1408 	}
1409 
1410 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1411 	if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1412 		return -EOPNOTSUPP;
1413 
1414 	return 0;
1415 }
1416 
1417 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1418 {
1419 	unsigned long val;
1420 
1421 	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1422 		return -EOPNOTSUPP;
1423 
1424 	tdvmcall_set_return_val(vcpu, val);
1425 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1426 	return 0;
1427 }
1428 
1429 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1430 {
1431 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1432 	int size, write, r;
1433 	unsigned long val;
1434 	gpa_t gpa;
1435 
1436 	size = tdx->vp_enter_args.r12;
1437 	write = tdx->vp_enter_args.r13;
1438 	gpa = tdx->vp_enter_args.r14;
1439 	val = write ? tdx->vp_enter_args.r15 : 0;
1440 
1441 	if (size != 1 && size != 2 && size != 4 && size != 8)
1442 		goto error;
1443 	if (write != 0 && write != 1)
1444 		goto error;
1445 
1446 	/*
1447 	 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
1448 	 * do MMIO emulation for private GPA.
1449 	 */
1450 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1451 	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1452 		goto error;
1453 
1454 	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1455 
1456 	if (write)
1457 		r = tdx_mmio_write(vcpu, gpa, size, val);
1458 	else
1459 		r = tdx_mmio_read(vcpu, gpa, size);
1460 	if (!r)
1461 		/* Kernel completed device emulation. */
1462 		return 1;
1463 
1464 	/* Request the device emulation to userspace device model. */
1465 	vcpu->mmio_is_write = write;
1466 
1467 	__kvm_prepare_emulated_mmio_exit(vcpu, gpa, size, &val, write);
1468 
1469 	if (!write) {
1470 		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1471 		vcpu->mmio_fragments[0].gpa = gpa;
1472 		vcpu->mmio_fragments[0].len = size;
1473 		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1474 	}
1475 	return 0;
1476 
1477 error:
1478 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1479 	return 1;
1480 }
1481 
1482 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1483 {
1484 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1485 
1486 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1487 
1488 	/*
1489 	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
1490 	 * directly without the support from userspace, just set the value
1491 	 * returned from userspace.
1492 	 */
1493 	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1494 	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1495 	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1496 	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1497 
1498 	return 1;
1499 }
1500 
1501 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1502 {
1503 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1504 
1505 	switch (tdx->vp_enter_args.r12) {
1506 	case 0:
1507 		tdx->vp_enter_args.r11 = 0;
1508 		tdx->vp_enter_args.r12 = 0;
1509 		tdx->vp_enter_args.r13 = 0;
1510 		tdx->vp_enter_args.r14 = 0;
1511 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1512 		return 1;
1513 	case 1:
1514 		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1515 		vcpu->run->exit_reason = KVM_EXIT_TDX;
1516 		vcpu->run->tdx.flags = 0;
1517 		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1518 		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1519 		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1520 		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1521 		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1522 		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1523 		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1524 		return 0;
1525 	default:
1526 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1527 		return 1;
1528 	}
1529 }
1530 
1531 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1532 {
1533 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1534 	return 1;
1535 }
1536 
1537 static int tdx_get_quote(struct kvm_vcpu *vcpu)
1538 {
1539 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1540 	u64 gpa = tdx->vp_enter_args.r12;
1541 	u64 size = tdx->vp_enter_args.r13;
1542 
1543 	/* The gpa of buffer must have shared bit set. */
1544 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1545 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1546 		return 1;
1547 	}
1548 
1549 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1550 	vcpu->run->tdx.flags = 0;
1551 	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1552 	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1553 	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1554 	vcpu->run->tdx.get_quote.size = size;
1555 
1556 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1557 
1558 	return 0;
1559 }
1560 
1561 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1562 {
1563 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1564 	u64 vector = tdx->vp_enter_args.r12;
1565 
1566 	if (vector < 32 || vector > 255) {
1567 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1568 		return 1;
1569 	}
1570 
1571 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1572 	vcpu->run->tdx.flags = 0;
1573 	vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1574 	vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1575 	vcpu->run->tdx.setup_event_notify.vector = vector;
1576 
1577 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1578 
1579 	return 0;
1580 }
1581 
1582 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1583 {
1584 	switch (tdvmcall_leaf(vcpu)) {
1585 	case TDVMCALL_MAP_GPA:
1586 		return tdx_map_gpa(vcpu);
1587 	case TDVMCALL_REPORT_FATAL_ERROR:
1588 		return tdx_report_fatal_error(vcpu);
1589 	case TDVMCALL_GET_TD_VM_CALL_INFO:
1590 		return tdx_get_td_vm_call_info(vcpu);
1591 	case TDVMCALL_GET_QUOTE:
1592 		return tdx_get_quote(vcpu);
1593 	case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1594 		return tdx_setup_event_notify_interrupt(vcpu);
1595 	default:
1596 		break;
1597 	}
1598 
1599 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1600 	return 1;
1601 }
1602 
1603 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1604 {
1605 	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1606 			  TDX_SHARED_BIT_PWL_4;
1607 
1608 	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1609 		return;
1610 
1611 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1612 }
1613 
1614 static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1615 			    kvm_pfn_t pfn)
1616 {
1617 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1618 	u64 err, entry, level_state;
1619 	gpa_t gpa = gfn_to_gpa(gfn);
1620 
1621 	lockdep_assert_held(&kvm->slots_lock);
1622 
1623 	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
1624 	    KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
1625 		return -EIO;
1626 
1627 	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
1628 			       kvm_tdx->page_add_src, &entry, &level_state);
1629 	if (unlikely(tdx_operand_busy(err)))
1630 		return -EBUSY;
1631 
1632 	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
1633 		return -EIO;
1634 
1635 	return 0;
1636 }
1637 
1638 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1639 			    enum pg_level level, kvm_pfn_t pfn)
1640 {
1641 	int tdx_level = pg_level_to_tdx_sept_level(level);
1642 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1643 	struct page *page = pfn_to_page(pfn);
1644 	gpa_t gpa = gfn_to_gpa(gfn);
1645 	u64 entry, level_state;
1646 	u64 err;
1647 
1648 	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1649 	if (unlikely(tdx_operand_busy(err)))
1650 		return -EBUSY;
1651 
1652 	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
1653 		return -EIO;
1654 
1655 	return 0;
1656 }
1657 
1658 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1659 				     enum pg_level level, u64 mirror_spte)
1660 {
1661 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1662 	kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
1663 
1664 	/* TODO: handle large pages. */
1665 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1666 		return -EIO;
1667 
1668 	WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
1669 		     (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
1670 
1671 	/*
1672 	 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
1673 	 * before kvm_tdx->state.  Userspace must not be allowed to pre-fault
1674 	 * arbitrary memory until the initial memory image is finalized.  Pairs
1675 	 * with the smp_wmb() in tdx_td_finalize().
1676 	 */
1677 	smp_rmb();
1678 
1679 	/*
1680 	 * If the TD isn't finalized/runnable, then userspace is initializing
1681 	 * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
1682 	 */
1683 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1684 		return tdx_mem_page_add(kvm, gfn, level, pfn);
1685 
1686 	return tdx_mem_page_aug(kvm, gfn, level, pfn);
1687 }
1688 
1689 static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1690 				     enum pg_level level, void *private_spt)
1691 {
1692 	int tdx_level = pg_level_to_tdx_sept_level(level);
1693 	gpa_t gpa = gfn_to_gpa(gfn);
1694 	struct page *page = virt_to_page(private_spt);
1695 	u64 err, entry, level_state;
1696 
1697 	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1698 			       &level_state);
1699 	if (unlikely(tdx_operand_busy(err)))
1700 		return -EBUSY;
1701 
1702 	if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
1703 		return -EIO;
1704 
1705 	return 0;
1706 }
1707 
/*
 * Ensure that the shared and private EPTs are flushed on all vCPUs.
 *
 * tdh_mem_track() is the only caller that increases the TD epoch.  An
 * increase of the TD epoch (e.g. to value "N + 1") succeeds only if no vCPU
 * is running in guest mode with the value "N - 1".
 *
 * A successful tdh_mem_track() ensures that vCPUs can only keep running in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * has been increased to "N + 1".
 *
 * Kicking all vCPUs after that further guarantees that no vCPU runs in guest
 * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
 * the one increasing the TD epoch to "N + 2").
 *
 * The TDX module flushes the EPT on the next TD enter and makes the vCPUs
 * run in guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees that all vCPUs are out of guest mode
 * by waiting for the empty IPI handler ack_kick().
 *
 * No action is required from the vCPUs being kicked, since the kick certainly
 * occurs after the TD epoch increment and before the next tdh_mem_track().
 */
static void tdx_track(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;

	/* If the TD isn't finalized, no vCPU has run yet; nothing to track. */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return;

	/*
	 * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
	 * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
	 * tracking epoch hasn't completed.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);

	err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
	/* A failure is reported and bugs the VM; the kick below still runs. */
	TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
1753 
1754 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1755 				     enum pg_level level, void *private_spt)
1756 {
1757 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1758 
1759 	/*
1760 	 * free_external_spt() is only called after hkid is freed when TD is
1761 	 * tearing down.
1762 	 * KVM doesn't (yet) zap page table pages in mirror page table while
1763 	 * TD is active, though guest pages mapped in mirror page table could be
1764 	 * zapped during TD is active, e.g. for shared <-> private conversion
1765 	 * and slot move/deletion.
1766 	 */
1767 	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1768 		return -EIO;
1769 
1770 	/*
1771 	 * The HKID assigned to this TD was already freed and cache was
1772 	 * already flushed. We don't have to flush again.
1773 	 */
1774 	return tdx_reclaim_page(virt_to_page(private_spt));
1775 }
1776 
/*
 * Remove a private 4K mapping from the TD.  Must be called with
 * kvm->mmu_lock held for write.
 *
 * The sequence is: block the range (TDH.MEM.RANGE.BLOCK), do TLB tracking
 * (tdx_track()), remove the page (TDH.MEM.PAGE.REMOVE), write back and
 * invalidate the cache for the page under the TD's HKID, and finally reset
 * the page.  Any SEAMCALL failure marks the VM as bugged and aborts the
 * sequence at that point.
 */
static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
					 enum pg_level level, u64 mirror_spte)
{
	struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 err, entry, level_state;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * HKID is released after all private pages have been removed, and set
	 * before any might be populated. Warn if zapping is attempted when
	 * there can't be anything populated in the private EPT.
	 */
	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
		return;

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return;

	err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
			      tdx_level, &entry, &level_state);
	if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
		return;

	/*
	 * TDX requires TLB tracking before dropping private page.  Do
	 * it here, although it is also done later.
	 */
	tdx_track(kvm);

	/*
	 * The write lock is held when zapping a private page, so there is no
	 * race with SEPT operations of other vCPUs.  Races are still possible
	 * with TDH.VP.ENTER (due to 0-step mitigation) and with guest
	 * TDCALLs.
	 */
	err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
			      tdx_level, &entry, &level_state);
	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
		return;

	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
		return;

	tdx_quirk_reset_page(page);
}
1827 
1828 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1829 			   int trig_mode, int vector)
1830 {
1831 	struct kvm_vcpu *vcpu = apic->vcpu;
1832 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1833 
1834 	/* TDX supports only posted interrupt.  No lapic emulation. */
1835 	__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1836 
1837 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1838 }
1839 
1840 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1841 {
1842 	u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1843 	u64 eq = vmx_get_exit_qual(vcpu);
1844 
1845 	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1846 		return false;
1847 
1848 	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1849 }
1850 
/*
 * Handle an EPT violation exit for the GPA recorded in tdx->exit_gpa.
 *
 * Private GPAs are always handled as write faults and are retried locally
 * while the fault handler keeps returning RET_PF_RETRY; shared GPAs use the
 * reported exit qualification and are handled once.
 *
 * Returns the result of __vmx_handle_ept_violation(), or -EIO if the guest
 * touched a private page before accepting it, if a shared GPA faulted on an
 * instruction fetch, or if the VM was found dead while retrying.
 */
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qual;
	gpa_t gpa = to_tdx(vcpu)->exit_gpa;
	bool local_retry = false;
	int ret;

	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
		/* Access to a not-yet-accepted private page kills the VM. */
		if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
				gpa, vcpu->vcpu_id);
			kvm_vm_dead(vcpu->kvm);
			return -EIO;
		}
		/*
		 * Always treat SEPT violations as write faults.  Ignore the
		 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
		 * TD private pages are always RWX in the SEPT tables,
		 * i.e. they're always mapped writable.  Just as importantly,
		 * treating SEPT violations as write faults is necessary to
		 * avoid COW allocations, which will cause TDAUGPAGE failures
		 * due to aliasing a single HPA to multiple GPAs.
		 */
		exit_qual = EPT_VIOLATION_ACC_WRITE;

		/* Only private GPA triggers zero-step mitigation */
		local_retry = true;
	} else {
		exit_qual = vmx_get_exit_qual(vcpu);
		/*
		 * EPT violation due to instruction fetch should never be
		 * triggered from shared memory in TDX guest.  If such EPT
		 * violation occurs, treat it as broken hardware.
		 */
		if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
			return -EIO;
	}

	trace_kvm_page_fault(vcpu, gpa, exit_qual);

	/*
	 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
	 * mapping in TDX.
	 *
	 * KVM may return RET_PF_RETRY for private GPA due to
	 * - contentions when atomically updating SPTEs of the mirror page table
	 * - in-progress GFN invalidation or memslot removal.
	 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
	 *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
	 *   or certain TDCALLs.
	 *
	 * If TDH.VP.ENTER is invoked more times than the threshold set by the
	 * TDX module before KVM resolves the private GPA mapping, the TDX
	 * module will activate zero-step mitigation during TDH.VP.ENTER. This
	 * process acquires an SEPT tree lock in the TDX module, leading to
	 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
	 * operations on other vCPUs.
	 *
	 * Breaking out of local retries for kvm_vcpu_has_events() is for
	 * interrupt injection. kvm_vcpu_has_events() should not see pending
	 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
	 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
	 * the guest even if the IRQ/NMI can't be delivered.
	 *
	 * Note: even without breaking out of local retries, zero-step
	 * mitigation may still occur due to
	 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
	 * - a single RIP causing EPT violations for more GFNs than the
	 *   threshold count.
	 * This is safe, as triggering zero-step mitigation only introduces
	 * contentions to page installation SEAMCALLs on other vCPUs, which will
	 * handle retries locally in their EPT violation handlers.
	 */
	while (1) {
		struct kvm_memory_slot *slot;

		ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);

		/* Done unless a private-GPA fault asked for a retry. */
		if (ret != RET_PF_RETRY || !local_retry)
			break;

		/* Let pending events and signals be processed. */
		if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
			break;

		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
			ret = -EIO;
			break;
		}

		/*
		 * Bail if the memslot is invalid, i.e. is being deleted, as
		 * faulting in will never succeed and this task needs to drop
		 * SRCU in order to let memslot deletion complete.
		 */
		slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
		if (slot && slot->flags & KVM_MEMSLOT_INVALID)
			break;

		cond_resched();
	}
	return ret;
}
1953 
1954 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1955 {
1956 	if (err) {
1957 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1958 		return 1;
1959 	}
1960 
1961 	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
1962 		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
1963 
1964 	return 1;
1965 }
1966 
1967 
/*
 * Dispatch a TD exit to its handler.
 *
 * Returns 1 immediately when the exit was already handled on the fastpath.
 * Otherwise the raw TDH.VP.ENTER return value is screened for error
 * conditions before switching on the basic exit reason.
 *
 * The paths that return 0 fill in vcpu->run first (KVM_EXIT_FAIL_ENTRY,
 * KVM_EXIT_SHUTDOWN, or the "unexpected reason" exit prepared at
 * unhandled_exit).  -EIO is returned, after KVM_BUG_ON(), when the return
 * value equals EXIT_REASON_EPT_MISCONFIG.  All other return values come
 * straight from the per-reason handler.
 */
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 vp_enter_ret = tdx->vp_enter_ret;
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);

	/* Nothing left to do if the fastpath already dealt with the exit. */
	if (fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
		KVM_BUG_ON(1, vcpu->kvm);
		return -EIO;
	}

	/*
	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
	 * TDX_SEAMCALL_VMFAILINVALID.
	 */
	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
		KVM_BUG_ON(!virt_rebooting, vcpu->kvm);
		goto unhandled_exit;
	}

	if (unlikely(tdx_failed_vmentry(vcpu))) {
		/*
		 * If the guest state is protected, that means off-TD debug is
		 * not enabled, TDX_NON_RECOVERABLE must be set.
		 */
		WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
				!(vp_enter_ret & TDX_NON_RECOVERABLE));
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * Any remaining error / non-recoverable status is unexpected, except
	 * for a triple fault, which is handled in the switch below.
	 */
	if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
		exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
		kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
		goto unhandled_exit;
	}

	WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
		     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);

	switch (exit_reason.basic) {
	case EXIT_REASON_TRIPLE_FAULT:
		vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
		vcpu->mmio_needed = 0;
		return 0;
	case EXIT_REASON_EXCEPTION_NMI:
		return tdx_handle_exception_nmi(vcpu);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		++vcpu->stat.irq_exits;
		return 1;
	case EXIT_REASON_CPUID:
		return tdx_emulate_cpuid(vcpu);
	case EXIT_REASON_HLT:
		return kvm_emulate_halt_noskip(vcpu);
	case EXIT_REASON_TDCALL:
		return handle_tdvmcall(vcpu);
	case EXIT_REASON_VMCALL:
		return tdx_emulate_vmcall(vcpu);
	case EXIT_REASON_IO_INSTRUCTION:
		return tdx_emulate_io(vcpu);
	case EXIT_REASON_MSR_READ:
		/* r12 is loaded into RCX for the common RDMSR emulation. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		return kvm_emulate_rdmsr(vcpu);
	case EXIT_REASON_MSR_WRITE:
		/*
		 * r12 is loaded into RCX; r13 is split into EAX (low 32 bits)
		 * and EDX (high 32 bits) for the common WRMSR emulation.
		 */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
		return kvm_emulate_wrmsr(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		return tdx_emulate_mmio(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return tdx_handle_ept_violation(vcpu);
	case EXIT_REASON_OTHER_SMI:
		/*
		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
		 * TD guest vCPU is running) will cause VM exit to TDX module,
		 * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
		 * and handled by kernel handler right away.
		 *
		 * The Other SMI exit can also be caused by the SEAM non-root
		 * machine check delivered via Machine Check System Management
		 * Interrupt (MSMI), but it has already been handled by the
		 * kernel machine check handler, i.e., the memory page has been
		 * marked as poisoned and it won't be freed to the free list
		 * when the TDX guest is terminated (the TDX module marks the
		 * guest as dead and prevents it from running further when a
		 * machine check happens in SEAM non-root).
		 *
		 * - A MSMI will not reach here, it's handled as non_recoverable
		 *   case above.
		 * - If it's not an MSMI, no need to do anything here.
		 */
		return 1;
	default:
		break;
	}

unhandled_exit:
	kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
	return 0;
}
2074 
2075 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2076 		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2077 {
2078 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2079 
2080 	*reason = tdx->vt.exit_reason.full;
2081 	if (*reason != -1u) {
2082 		*info1 = vmx_get_exit_qual(vcpu);
2083 		*info2 = tdx->ext_exit_qualification;
2084 		*intr_info = vmx_get_intr_info(vcpu);
2085 	} else {
2086 		*info1 = 0;
2087 		*info2 = 0;
2088 		*intr_info = 0;
2089 	}
2090 
2091 	*error_code = 0;
2092 }
2093 
2094 bool tdx_has_emulated_msr(u32 index)
2095 {
2096 	switch (index) {
2097 	case MSR_IA32_UCODE_REV:
2098 	case MSR_IA32_ARCH_CAPABILITIES:
2099 	case MSR_IA32_POWER_CTL:
2100 	case MSR_IA32_CR_PAT:
2101 	case MSR_MTRRcap:
2102 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2103 	case MSR_MTRRdefType:
2104 	case MSR_IA32_TSC_DEADLINE:
2105 	case MSR_IA32_MISC_ENABLE:
2106 	case MSR_PLATFORM_INFO:
2107 	case MSR_MISC_FEATURES_ENABLES:
2108 	case MSR_IA32_APICBASE:
2109 	case MSR_EFER:
2110 	case MSR_IA32_FEAT_CTL:
2111 	case MSR_IA32_MCG_CAP:
2112 	case MSR_IA32_MCG_STATUS:
2113 	case MSR_IA32_MCG_CTL:
2114 	case MSR_IA32_MCG_EXT_CTL:
2115 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2116 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2117 		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2118 	case MSR_KVM_POLL_CONTROL:
2119 		return true;
2120 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2121 		/*
2122 		 * x2APIC registers that are virtualized by the CPU can't be
2123 		 * emulated, KVM doesn't have access to the virtual APIC page.
2124 		 */
2125 		switch (index) {
2126 		case X2APIC_MSR(APIC_TASKPRI):
2127 		case X2APIC_MSR(APIC_PROCPRI):
2128 		case X2APIC_MSR(APIC_EOI):
2129 		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2130 		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2131 		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2132 			return false;
2133 		default:
2134 			return true;
2135 		}
2136 	default:
2137 		return false;
2138 	}
2139 }
2140 
2141 static bool tdx_is_read_only_msr(u32 index)
2142 {
2143 	return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
2144 		index == MSR_IA32_FEAT_CTL;
2145 }
2146 
2147 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2148 {
2149 	switch (msr->index) {
2150 	case MSR_IA32_FEAT_CTL:
2151 		/*
2152 		 * MCE and MCA are advertised via cpuid. Guest kernel could
2153 		 * check if LMCE is enabled or not.
2154 		 */
2155 		msr->data = FEAT_CTL_LOCKED;
2156 		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2157 			msr->data |= FEAT_CTL_LMCE_ENABLED;
2158 		return 0;
2159 	case MSR_IA32_MCG_EXT_CTL:
2160 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2161 			return 1;
2162 		msr->data = vcpu->arch.mcg_ext_ctl;
2163 		return 0;
2164 	default:
2165 		if (!tdx_has_emulated_msr(msr->index))
2166 			return 1;
2167 
2168 		return kvm_get_msr_common(vcpu, msr);
2169 	}
2170 }
2171 
2172 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2173 {
2174 	switch (msr->index) {
2175 	case MSR_IA32_MCG_EXT_CTL:
2176 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2177 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2178 			return 1;
2179 		vcpu->arch.mcg_ext_ctl = msr->data;
2180 		return 0;
2181 	default:
2182 		if (tdx_is_read_only_msr(msr->index))
2183 			return 1;
2184 
2185 		if (!tdx_has_emulated_msr(msr->index))
2186 			return 1;
2187 
2188 		return kvm_set_msr_common(vcpu, msr);
2189 	}
2190 }
2191 
2192 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2193 {
2194 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2195 	struct kvm_tdx_capabilities __user *user_caps;
2196 	struct kvm_tdx_capabilities *caps = NULL;
2197 	u32 nr_user_entries;
2198 	int ret = 0;
2199 
2200 	/* flags is reserved for future use */
2201 	if (cmd->flags)
2202 		return -EINVAL;
2203 
2204 	user_caps = u64_to_user_ptr(cmd->data);
2205 	if (get_user(nr_user_entries, &user_caps->cpuid.nent))
2206 		return -EFAULT;
2207 
2208 	if (nr_user_entries < td_conf->num_cpuid_config)
2209 		return -E2BIG;
2210 
2211 	caps = kzalloc_flex(*caps, cpuid.entries, td_conf->num_cpuid_config);
2212 	if (!caps)
2213 		return -ENOMEM;
2214 
2215 	ret = init_kvm_tdx_caps(td_conf, caps);
2216 	if (ret)
2217 		goto out;
2218 
2219 	if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
2220 						      caps->cpuid.nent))) {
2221 		ret = -EFAULT;
2222 		goto out;
2223 	}
2224 
2225 out:
2226 	/* kfree() accepts NULL. */
2227 	kfree(caps);
2228 	return ret;
2229 }
2230 
2231 /*
2232  * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2233  * similar to TDX's GPAW. Use this field as the interface for userspace to
2234  * configure the GPAW and EPT level for TDs.
2235  *
2236  * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2237  * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2238  * supported. Value 52 is only supported when the platform supports 5 level
2239  * EPT.
2240  */
2241 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2242 					struct td_params *td_params)
2243 {
2244 	const struct kvm_cpuid_entry2 *entry;
2245 	int guest_pa;
2246 
2247 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2248 	if (!entry)
2249 		return -EINVAL;
2250 
2251 	guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2252 
2253 	if (guest_pa != 48 && guest_pa != 52)
2254 		return -EINVAL;
2255 
2256 	if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2257 		return -EINVAL;
2258 
2259 	td_params->eptp_controls = VMX_EPTP_MT_WB;
2260 	if (guest_pa == 52) {
2261 		td_params->eptp_controls |= VMX_EPTP_PWL_5;
2262 		td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2263 	} else {
2264 		td_params->eptp_controls |= VMX_EPTP_PWL_4;
2265 	}
2266 
2267 	return 0;
2268 }
2269 
2270 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2271 				 struct td_params *td_params)
2272 {
2273 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2274 	const struct kvm_cpuid_entry2 *entry;
2275 	struct tdx_cpuid_value *value;
2276 	int i, copy_cnt = 0;
2277 
2278 	/*
2279 	 * td_params.cpuid_values: The number and the order of cpuid_value must
2280 	 * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
2281 	 * It's assumed that td_params was zeroed.
2282 	 */
2283 	for (i = 0; i < td_conf->num_cpuid_config; i++) {
2284 		struct kvm_cpuid_entry2 tmp;
2285 
2286 		td_init_cpuid_entry2(&tmp, i);
2287 
2288 		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2289 					      tmp.function, tmp.index);
2290 		if (!entry)
2291 			continue;
2292 
2293 		if (tdx_unsupported_cpuid(entry))
2294 			return -EINVAL;
2295 
2296 		copy_cnt++;
2297 
2298 		value = &td_params->cpuid_values[i];
2299 		value->eax = entry->eax;
2300 		value->ebx = entry->ebx;
2301 		value->ecx = entry->ecx;
2302 		value->edx = entry->edx;
2303 
2304 		/*
2305 		 * TDX module does not accept nonzero bits 16..23 for the
2306 		 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2307 		 */
2308 		if (tmp.function == 0x80000008)
2309 			value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2310 	}
2311 
2312 	/*
2313 	 * Rely on the TDX module to reject invalid configuration, but it can't
2314 	 * check of leafs that don't have a proper slot in td_params->cpuid_values
2315 	 * to stick then. So fail if there were entries that didn't get copied to
2316 	 * td_params.
2317 	 */
2318 	if (copy_cnt != cpuid->nent)
2319 		return -EINVAL;
2320 
2321 	return 0;
2322 }
2323 
2324 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2325 			struct kvm_tdx_init_vm *init_vm)
2326 {
2327 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2328 	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2329 	int ret;
2330 
2331 	if (kvm->created_vcpus)
2332 		return -EBUSY;
2333 
2334 	if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2335 		return -EINVAL;
2336 
2337 	if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2338 		return -EINVAL;
2339 
2340 	td_params->max_vcpus = kvm->max_vcpus;
2341 	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2342 	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2343 
2344 	td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2345 	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2346 
2347 	ret = setup_tdparams_eptp_controls(cpuid, td_params);
2348 	if (ret)
2349 		return ret;
2350 
2351 	ret = setup_tdparams_cpuids(cpuid, td_params);
2352 	if (ret)
2353 		return ret;
2354 
2355 #define MEMCPY_SAME_SIZE(dst, src)				\
2356 	do {							\
2357 		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
2358 		memcpy((dst), (src), sizeof(dst));		\
2359 	} while (0)
2360 
2361 	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2362 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2363 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2364 
2365 	return 0;
2366 }
2367 
/*
 * Create and initialize the TD backing @kvm.
 *
 * The steps are, in order:
 *  1. allocate a guest key ID and charge the misc cgroup,
 *  2. allocate the TDR page and the TDCS pages,
 *  3. verify that every present package has an online CPU,
 *  4. TDH.MNG.CREATE,
 *  5. configure the key on one CPU per package,
 *  6. TDH.MNG.ADDCX for each TDCS page,
 *  7. TDH.MNG.INIT with @td_params.
 *
 * Failures up to and including step 4 unwind through the free_* labels.
 * Failures after that go through 'teardown', which frees the TDCS pages that
 * were not yet added and then calls tdx_mmu_release_hkid() and
 * tdx_reclaim_td_control_pages().
 *
 * @seamcall_err is zeroed on entry and only set when TDH.MNG.INIT reports an
 * invalid operand, as a hint for userspace.
 *
 * Returns 0 on success or a negative errno.
 */
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
			 u64 *seamcall_err)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages;
	struct page **tdcs_pages = NULL;
	struct page *tdr_page;
	int ret, i;
	u64 err, rcx;

	*seamcall_err = 0;
	ret = tdx_guest_keyid_alloc();
	if (ret < 0)
		return ret;
	kvm_tdx->hkid = ret;
	kvm_tdx->misc_cg = get_current_misc_cg();
	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	if (ret)
		goto free_hkid;

	/* Default error for the allocation failures below. */
	ret = -ENOMEM;

	tdr_page = alloc_page(GFP_KERNEL);
	if (!tdr_page)
		goto free_hkid;

	kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
	/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
	kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
	tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages,
				  kvm_tdx->td.tdcs_nr_pages);
	if (!tdcs_pages)
		goto free_tdr;

	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		tdcs_pages[i] = alloc_page(GFP_KERNEL);
		if (!tdcs_pages[i])
			goto free_tdcs;
	}

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		goto free_tdcs;

	cpus_read_lock();

	/*
	 * Need at least one CPU of the package to be online in order to
	 * program all packages for host key id.  Check it.
	 */
	for_each_present_cpu(i)
		cpumask_set_cpu(topology_physical_package_id(i), packages);
	for_each_online_cpu(i)
		cpumask_clear_cpu(topology_physical_package_id(i), packages);
	if (!cpumask_empty(packages)) {
		ret = -EIO;
		/*
		 * Because it's hard for human operator to figure out the
		 * reason, warn it.
		 */
#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
		pr_warn_ratelimited(MSG_ALLPKG);
		goto free_packages;
	}

	/*
	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
	 * lock to prevent it from failure.
	 */
	mutex_lock(&tdx_lock);
	kvm_tdx->td.tdr_page = tdr_page;
	err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
	mutex_unlock(&tdx_lock);

	if (err == TDX_RND_NO_ENTROPY) {
		ret = -EAGAIN;
		goto free_packages;
	}

	if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
		ret = -EIO;
		goto free_packages;
	}

	/*
	 * 'packages' is empty at this point (checked above), so it is reused
	 * to track which packages have already had their key configured.
	 */
	for_each_online_cpu(i) {
		int pkg = topology_physical_package_id(i);

		if (cpumask_test_and_set_cpu(pkg, packages))
			continue;

		/*
		 * Program the memory controller in the package with an
		 * encryption key associated to a TDX private host key id
		 * assigned to this TDR.  Concurrent operations on same memory
		 * controller results in TDX_OPERAND_BUSY. No locking needed
		 * beyond the cpus_read_lock() above as it serializes against
		 * hotplug and the first online CPU of the package is always
		 * used. We never have two CPUs in the same socket trying to
		 * program the key.
		 */
		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
				      kvm_tdx, true);
		if (ret)
			break;
	}
	cpus_read_unlock();
	free_cpumask_var(packages);
	if (ret) {
		/* No TDCS page has been added yet, free all of them. */
		i = 0;
		goto teardown;
	}

	kvm_tdx->td.tdcs_pages = tdcs_pages;
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
		if (err == TDX_RND_NO_ENTROPY) {
			/* Here it's hard to allow userspace to retry. */
			ret = -EAGAIN;
			goto teardown;
		}
		if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
			ret = -EIO;
			goto teardown;
		}
	}

	err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
		/*
		 * Because a user gives operands, don't warn.
		 * Return a hint to the user because it's sometimes hard for the
		 * user to figure out which operand is invalid.  SEAMCALL status
		 * code includes which operand caused invalid operand error.
		 */
		*seamcall_err = err;
		ret = -EINVAL;
		goto teardown;
	} else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
		ret = -EIO;
		goto teardown;
	}

	return 0;

	/*
	 * The sequence for freeing resources from a partially initialized TD
	 * varies based on where in the initialization flow failure occurred.
	 * Simply use the full teardown and destroy, which naturally play nice
	 * with partial initialization.
	 */
teardown:
	/* Only free pages not yet added, so start at 'i' */
	for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i]) {
			__free_page(tdcs_pages[i]);
			tdcs_pages[i] = NULL;
		}
	}
	/*
	 * If the array was never published in kvm_tdx->td it is still owned
	 * by this function and must be freed here.
	 */
	if (!kvm_tdx->td.tdcs_pages)
		kfree(tdcs_pages);

	tdx_mmu_release_hkid(kvm);
	tdx_reclaim_td_control_pages(kvm);

	return ret;

free_packages:
	cpus_read_unlock();
	free_cpumask_var(packages);

free_tdcs:
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i])
			__free_page(tdcs_pages[i]);
	}
	kfree(tdcs_pages);
	kvm_tdx->td.tdcs_pages = NULL;

free_tdr:
	if (tdr_page)
		__free_page(tdr_page);
	kvm_tdx->td.tdr_page = NULL;

free_hkid:
	tdx_hkid_free(kvm_tdx);

	return ret;
}
2556 
2557 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2558 				      u64 *data)
2559 {
2560 	u64 err;
2561 
2562 	err = tdh_mng_rd(&tdx->td, field_id, data);
2563 
2564 	return err;
2565 }
2566 
/*
 * Leaf bits 30:7 and sub-leaf bits 31:7 have no representation in the CPUID
 * metadata field ID built by tdx_read_cpuid(); a request with any of these
 * bits set is rejected.
 */
#define TDX_MD_UNREADABLE_LEAF_MASK	GENMASK(30, 7)
#define TDX_MD_UNREADABLE_SUBLEAF_MASK	GENMASK(31, 7)

/*
 * Read one CPUID leaf (and optionally sub-leaf) of the TD from the TDX
 * module's metadata and store it in @out.
 *
 * @sub_leaf_set selects whether @sub_leaf is encoded in the field ID or the
 * "sub-leaf not applicable" encoding is used.  On success *@entry_index is
 * incremented.
 *
 * Returns 0 on success, -EINVAL if the leaf/sub-leaf cannot be encoded or
 * *@entry_index has reached KVM_MAX_CPUID_ENTRIES, and -EIO if reading the
 * metadata fails, in which case the eax/ebx/ecx/edx members of @out are
 * zeroed.
 *
 * NOTE(review): out->flags is only OR'ed into, never assigned; presumably the
 * caller passes a zeroed entry -- confirm against the callers.
 */
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
			  bool sub_leaf_set, int *entry_index,
			  struct kvm_cpuid_entry2 *out)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
	u64 ebx_eax, edx_ecx;
	u64 err = 0;

	if (sub_leaf > 0b1111111)
		return -EINVAL;

	if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
		return -EINVAL;

	if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
	    sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
		return -EINVAL;

	/*
	 * bit 23:17, RESERVED: reserved, must be 0;
	 * bit 16,    LEAF_31: leaf number bit 31;
	 * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
	 *                      implicitly 0;
	 * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
	 * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
	 *                         the SUBLEAF_6_0 is all-1.
	 *                         sub-leaf bits 31:7 are implicitly 0;
	 * bit 0,     ELEMENT_I: Element index within field;
	 */
	field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
	field_id |= (leaf & 0x7f) << 9;
	if (sub_leaf_set)
		field_id |= (sub_leaf & 0x7f) << 1;
	else
		/* SUBLEAF_NA (bit 8) plus all-ones SUBLEAF_6_0 (bits 7:1). */
		field_id |= 0x1fe;

	/* Element 0 of the field holds EBX:EAX. */
	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
	if (err) //TODO check for specific errors
		goto err_out;

	out->eax = (u32) ebx_eax;
	out->ebx = (u32) (ebx_eax >> 32);

	/* Element 1 (ELEMENT_I = 1) holds EDX:ECX. */
	field_id++;
	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
	/*
	 * It's weird that reading edx_ecx fails while reading ebx_eax
	 * succeeded.
	 */
	if (WARN_ON_ONCE(err))
		goto err_out;

	out->ecx = (u32) edx_ecx;
	out->edx = (u32) (edx_ecx >> 32);

	out->function = leaf;
	out->index = sub_leaf;
	out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;

	/*
	 * Work around missing support on old TDX modules, fetch
	 * guest maxpa from gfn_direct_bits.
	 */
	if (leaf == 0x80000008) {
		gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
		unsigned int g_maxpa = __ffs(gpa_bits) + 1;

		out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
	}

	(*entry_index)++;

	return 0;

err_out:
	out->eax = 0;
	out->ebx = 0;
	out->ecx = 0;
	out->edx = 0;

	return -EIO;
}
2653 
2654 typedef void *tdx_vm_state_guard_t;
2655 
2656 static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
2657 {
2658 	int r;
2659 
2660 	mutex_lock(&kvm->lock);
2661 
2662 	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
2663 		r = -EBUSY;
2664 		goto out_err;
2665 	}
2666 
2667 	r = kvm_lock_all_vcpus(kvm);
2668 	if (r)
2669 		goto out_err;
2670 
2671 	/*
2672 	 * Note the unintuitive ordering!  vcpu->mutex must be taken outside
2673 	 * kvm->slots_lock!
2674 	 */
2675 	mutex_lock(&kvm->slots_lock);
2676 	return kvm;
2677 
2678 out_err:
2679 	mutex_unlock(&kvm->lock);
2680 	return ERR_PTR(r);
2681 }
2682 
/*
 * Drop the locks taken by tdx_acquire_vm_state_locks(), in the reverse order
 * of acquisition: kvm->slots_lock, all vCPU mutexes, then kvm->lock.
 */
static void tdx_release_vm_state_locks(struct kvm *kvm)
{
	mutex_unlock(&kvm->slots_lock);
	kvm_unlock_all_vcpus(kvm);
	mutex_unlock(&kvm->lock);
}
2689 
/*
 * Guard class wrapping tdx_acquire_vm_state_locks().  The destructor releases
 * the locks via tdx_release_vm_state_locks() unless acquisition failed, i.e.
 * unless the guard holds an ERR_PTR().
 */
DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
	     if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
	     tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
2693 
2694 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2695 {
2696 	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
2697 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2698 	struct kvm_tdx_init_vm *init_vm;
2699 	struct td_params *td_params = NULL;
2700 	u32 nr_user_entries;
2701 	int ret;
2702 
2703 	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2704 	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2705 
2706 	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2707 		return -EINVAL;
2708 
2709 	if (cmd->flags)
2710 		return -EINVAL;
2711 
2712 	if (get_user(nr_user_entries, &user_data->cpuid.nent))
2713 		return -EFAULT;
2714 
2715 	if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
2716 		return -E2BIG;
2717 
2718 	init_vm = memdup_user(user_data,
2719 			      struct_size(user_data, cpuid.entries, nr_user_entries));
2720 	if (IS_ERR(init_vm))
2721 		return PTR_ERR(init_vm);
2722 
2723 	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2724 		ret = -EINVAL;
2725 		goto out;
2726 	}
2727 
2728 	if (init_vm->cpuid.padding) {
2729 		ret = -EINVAL;
2730 		goto out;
2731 	}
2732 
2733 	td_params = kzalloc_obj(struct td_params);
2734 	if (!td_params) {
2735 		ret = -ENOMEM;
2736 		goto out;
2737 	}
2738 
2739 	ret = setup_tdparams(kvm, td_params, init_vm);
2740 	if (ret)
2741 		goto out;
2742 
2743 	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2744 	if (ret)
2745 		goto out;
2746 
2747 	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2748 	kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2749 	kvm_tdx->attributes = td_params->attributes;
2750 	kvm_tdx->xfam = td_params->xfam;
2751 
2752 	if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2753 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2754 	else
2755 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2756 
2757 	kvm_tdx->state = TD_STATE_INITIALIZED;
2758 out:
2759 	/* kfree() accepts NULL. */
2760 	kfree(init_vm);
2761 	kfree(td_params);
2762 
2763 	return ret;
2764 }
2765 
/* Flush the TLB for @vcpu by performing a global EPT invalidation. */
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	/*
	 * flush_tlb_current() is invoked the first time the vcpu runs or when
	 * the root of the shared EPT is invalidated.
	 * KVM only needs to flush the shared EPT because the TDX module handles
	 * TLB invalidation for the private EPT in tdh_vp_enter().
	 *
	 * A single context invalidation for shared EPT can be performed here.
	 * However, this single context invalidation requires the private EPTP
	 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
	 * private EPTP as its ASID for TLB invalidation.
	 *
	 * To avoid reading back private EPTP, perform a global invalidation for
	 * shared EPT instead to keep this function simple.
	 */
	ept_sync_global();
}
2784 
/* Flush all TLB entries for @vcpu by performing a global EPT invalidation. */
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
	 * ensure that private EPT will be flushed on the next TD enter. No need
	 * to call tdx_track() here again even when this callback is a result of
	 * zapping private EPT.
	 *
	 * Due to the lack of the context to determine which EPT has been
	 * affected by zapping, invoke invept() directly here for both shared
	 * EPT and private EPT for simplicity, though it's not necessary for
	 * private EPT.
	 */
	ept_sync_global();
}
2800 
2801 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2802 {
2803 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2804 
2805 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2806 		return -EINVAL;
2807 
2808 	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2809 	if (tdx_operand_busy(cmd->hw_error))
2810 		return -EBUSY;
2811 	if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
2812 		return -EIO;
2813 
2814 	kvm_tdx->state = TD_STATE_RUNNABLE;
2815 	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2816 	smp_wmb();
2817 	kvm->arch.pre_fault_allowed = true;
2818 	return 0;
2819 }
2820 
2821 static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
2822 {
2823 	if (copy_from_user(cmd, argp, sizeof(*cmd)))
2824 		return -EFAULT;
2825 
2826 	/*
2827 	 * Userspace should never set hw_error.  KVM writes hw_error to report
2828 	 * hardware-defined error back to userspace.
2829 	 */
2830 	if (cmd->hw_error)
2831 		return -EINVAL;
2832 
2833 	return 0;
2834 }
2835 
/*
 * Handle the VM-scoped TDX commands.
 *
 * KVM_TDX_CAPABILITIES is served without taking any VM lock.  All other
 * commands run with the VM state locks held (see tdx_vm_state_guard); the
 * locks are dropped automatically on every return path after the guard
 * declaration.  After KVM_TDX_INIT_VM and KVM_TDX_FINALIZE_VM the command
 * structure is copied back to userspace, even when the command failed.
 *
 * Returns the command's result, -EINVAL for an unknown command, -EFAULT if
 * the command cannot be copied from/to userspace, or the error from taking
 * the locks.
 */
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
	struct kvm_tdx_cmd tdx_cmd;
	int r;

	r = tdx_get_cmd(argp, &tdx_cmd);
	if (r)
		return r;

	if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
		return tdx_get_capabilities(&tdx_cmd);

	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);

	switch (tdx_cmd.id) {
	case KVM_TDX_INIT_VM:
		r = tdx_td_init(kvm, &tdx_cmd);
		break;
	case KVM_TDX_FINALIZE_VM:
		r = tdx_td_finalize(kvm, &tdx_cmd);
		break;
	default:
		return -EINVAL;
	}

	/* Copy the command back so that userspace can observe hw_error. */
	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
		return -EFAULT;

	return r;
}
2868 
/*
 * Allocate the TDVPR and TDCX pages of a vCPU, create the vCPU in the TDX
 * module (TDH.VP.CREATE, TDH.VP.ADDCX) and initialize it (TDH.VP.INIT).
 *
 * VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS; it is
 * supplied in @vcpu_rcx.
 *
 * Returns 0 on success, -ENOMEM on allocation failure and -EIO if a SEAMCALL
 * fails.  Failures up to and including TDH.VP.CREATE free everything that was
 * allocated here.  Later failures only free the TDCX pages that were not yet
 * added and leave the rest in place.
 */
static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct page *page;
	int ret, i;
	u64 err;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	tdx->vp.tdvpr_page = page;

	/*
	 * page_to_phys() does not work in 'noinstr' code, like guest
	 * entry via tdh_vp_enter(). Precalculate and store it instead
	 * of doing it at runtime later.
	 */
	tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);

	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
				     GFP_KERNEL);
	if (!tdx->vp.tdcx_pages) {
		ret = -ENOMEM;
		goto free_tdvpr;
	}

	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
		page = alloc_page(GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			goto free_tdcx;
		}
		tdx->vp.tdcx_pages[i] = page;
	}

	err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
	if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
		ret = -EIO;
		goto free_tdcx;
	}

	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
		err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
		if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
			/*
			 * Pages already added are reclaimed by the vcpu_free
			 * method, but the rest are freed here.
			 */
			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
				__free_page(tdx->vp.tdcx_pages[i]);
				tdx->vp.tdcx_pages[i] = NULL;
			}
			return -EIO;
		}
	}

	/*
	 * tdh_vp_init() can take an exclusive lock of the TDR resource inside
	 * the TDX-Module.  The TDR resource is also taken as shared in several
	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
	 * (TDX-Module locks are try-lock implementations with no slow path).
	 * Take mmu_lock for write to reflect the nature of the lock taken by
	 * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
	 * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
	 */
	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
			return -EIO;
	}

	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	return 0;

	/* Nothing has been handed to the TDX module yet, free everything. */
free_tdcx:
	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
		if (tdx->vp.tdcx_pages[i])
			__free_page(tdx->vp.tdcx_pages[i]);
		tdx->vp.tdcx_pages[i] = NULL;
	}
	kfree(tdx->vp.tdcx_pages);
	tdx->vp.tdcx_pages = NULL;

free_tdvpr:
	if (tdx->vp.tdvpr_page)
		__free_page(tdx->vp.tdvpr_page);
	tdx->vp.tdvpr_page = NULL;
	tdx->vp.tdvpr_pa = 0;

	return ret;
}
2963 
2964 /* Sometimes reads multipple subleafs. Return how many enties were written. */
2965 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2966 				   struct kvm_cpuid_entry2 *output_e)
2967 {
2968 	int sub_leaf = 0;
2969 	int ret;
2970 
2971 	/* First try without a subleaf */
2972 	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2973 
2974 	/* If success, or invalid leaf, just give up */
2975 	if (ret != -EIO)
2976 		return ret;
2977 
2978 	/*
2979 	 * If the try without a subleaf failed, try reading subleafs until
2980 	 * failure. The TDX module only supports 6 bits of subleaf index.
2981 	 */
2982 	while (1) {
2983 		/* Keep reading subleafs until there is a failure. */
2984 		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
2985 			return !sub_leaf;
2986 
2987 		sub_leaf++;
2988 		output_e++;
2989 	}
2990 
2991 	return 0;
2992 }
2993 
2994 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
2995 {
2996 	struct kvm_cpuid2 __user *output;
2997 	struct kvm_cpuid2 *td_cpuid;
2998 	int r = 0, i = 0, leaf;
2999 	u32 level;
3000 
3001 	output = u64_to_user_ptr(cmd->data);
3002 	td_cpuid = kzalloc(sizeof(*td_cpuid) +
3003 			sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3004 			GFP_KERNEL);
3005 	if (!td_cpuid)
3006 		return -ENOMEM;
3007 
3008 	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3009 		r = -EFAULT;
3010 		goto out;
3011 	}
3012 
3013 	/* Read max CPUID for normal range */
3014 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3015 		r = -EIO;
3016 		goto out;
3017 	}
3018 	level = td_cpuid->entries[0].eax;
3019 
3020 	for (leaf = 1; leaf <= level; leaf++)
3021 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3022 
3023 	/* Read max CPUID for extended range */
3024 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3025 		r = -EIO;
3026 		goto out;
3027 	}
3028 	level = td_cpuid->entries[i - 1].eax;
3029 
3030 	for (leaf = 0x80000001; leaf <= level; leaf++)
3031 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3032 
3033 	if (td_cpuid->nent < i)
3034 		r = -E2BIG;
3035 	td_cpuid->nent = i;
3036 
3037 	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3038 		r = -EFAULT;
3039 		goto out;
3040 	}
3041 
3042 	if (r == -E2BIG)
3043 		goto out;
3044 
3045 	if (copy_to_user(output->entries, td_cpuid->entries,
3046 			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3047 		r = -EFAULT;
3048 
3049 out:
3050 	kfree(td_cpuid);
3051 
3052 	return r;
3053 }
3054 
3055 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3056 {
3057 	u64 apic_base;
3058 	struct vcpu_tdx *tdx = to_tdx(vcpu);
3059 	int ret;
3060 
3061 	if (cmd->flags)
3062 		return -EINVAL;
3063 
3064 	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3065 		return -EINVAL;
3066 
3067 	/*
3068 	 * TDX requires X2APIC, userspace is responsible for configuring guest
3069 	 * CPUID accordingly.
3070 	 */
3071 	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3072 		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3073 	if (kvm_apic_set_base(vcpu, apic_base, true))
3074 		return -EINVAL;
3075 
3076 	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3077 	if (ret)
3078 		return ret;
3079 
3080 	td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3081 	td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3082 	td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3083 
3084 	tdx->state = VCPU_TD_STATE_INITIALIZED;
3085 
3086 	return 0;
3087 }
3088 
3089 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3090 {
3091 	/*
3092 	 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3093 	 * INIT events.
3094 	 *
3095 	 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3096 	 * userspace needs to define the vCPU model before KVM can initialize
3097 	 * vCPU state, e.g. to enable x2APIC.
3098 	 */
3099 	WARN_ON_ONCE(init_event);
3100 }
3101 
3102 struct tdx_gmem_post_populate_arg {
3103 	struct kvm_vcpu *vcpu;
3104 	__u32 flags;
3105 };
3106 
3107 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3108 				  struct page *src_page, void *_arg)
3109 {
3110 	struct tdx_gmem_post_populate_arg *arg = _arg;
3111 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3112 	u64 err, entry, level_state;
3113 	gpa_t gpa = gfn_to_gpa(gfn);
3114 	int ret, i;
3115 
3116 	if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
3117 		return -EIO;
3118 
3119 	if (!src_page)
3120 		return -EOPNOTSUPP;
3121 
3122 	kvm_tdx->page_add_src = src_page;
3123 	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
3124 	kvm_tdx->page_add_src = NULL;
3125 
3126 	if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
3127 		return ret;
3128 
3129 	/*
3130 	 * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
3131 	 * between mapping the pfn and now, but slots_lock prevents memslot
3132 	 * updates, filemap_invalidate_lock() prevents guest_memfd updates,
3133 	 * mmu_notifier events can't reach S-EPT entries, and KVM's internal
3134 	 * zapping flows are mutually exclusive with S-EPT mappings.
3135 	 */
3136 	for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3137 		err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
3138 		if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
3139 			return -EIO;
3140 	}
3141 
3142 	return 0;
3143 }
3144 
3145 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3146 {
3147 	struct vcpu_tdx *tdx = to_tdx(vcpu);
3148 	struct kvm *kvm = vcpu->kvm;
3149 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3150 	struct kvm_tdx_init_mem_region region;
3151 	struct tdx_gmem_post_populate_arg arg;
3152 	long gmem_ret;
3153 	int ret;
3154 
3155 	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3156 		return -EINVAL;
3157 
3158 	/* Once TD is finalized, the initial guest memory is fixed. */
3159 	if (kvm_tdx->state == TD_STATE_RUNNABLE)
3160 		return -EINVAL;
3161 
3162 	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3163 		return -EINVAL;
3164 
3165 	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3166 		return -EFAULT;
3167 
3168 	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3169 	    !region.nr_pages ||
3170 	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3171 	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3172 	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3173 		return -EINVAL;
3174 
3175 	ret = 0;
3176 	while (region.nr_pages) {
3177 		if (signal_pending(current)) {
3178 			ret = -EINTR;
3179 			break;
3180 		}
3181 
3182 		arg = (struct tdx_gmem_post_populate_arg) {
3183 			.vcpu = vcpu,
3184 			.flags = cmd->flags,
3185 		};
3186 		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3187 					     u64_to_user_ptr(region.source_addr),
3188 					     1, tdx_gmem_post_populate, &arg);
3189 		if (gmem_ret < 0) {
3190 			ret = gmem_ret;
3191 			break;
3192 		}
3193 
3194 		if (gmem_ret != 1) {
3195 			ret = -EIO;
3196 			break;
3197 		}
3198 
3199 		region.source_addr += PAGE_SIZE;
3200 		region.gpa += PAGE_SIZE;
3201 		region.nr_pages--;
3202 
3203 		cond_resched();
3204 	}
3205 
3206 	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3207 		ret = -EFAULT;
3208 	return ret;
3209 }
3210 
3211 int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3212 {
3213 	struct kvm *kvm = vcpu->kvm;
3214 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3215 	struct kvm_tdx_cmd cmd;
3216 	int r;
3217 
3218 	r = tdx_get_cmd(argp, &cmd);
3219 	if (r)
3220 		return r;
3221 
3222 	CLASS(tdx_vm_state_guard, guard)(kvm);
3223 	if (IS_ERR(guard))
3224 		return PTR_ERR(guard);
3225 
3226 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3227 		return -EINVAL;
3228 
3229 	vcpu_load(vcpu);
3230 
3231 	switch (cmd.id) {
3232 	case KVM_TDX_INIT_MEM_REGION:
3233 		r = tdx_vcpu_init_mem_region(vcpu, &cmd);
3234 		break;
3235 	case KVM_TDX_INIT_VCPU:
3236 		r = tdx_vcpu_init(vcpu, &cmd);
3237 		break;
3238 	default:
3239 		r = -ENOIOCTLCMD;
3240 		break;
3241 	}
3242 
3243 	vcpu_put(vcpu);
3244 
3245 	return r;
3246 }
3247 
3248 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3249 {
3250 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3251 	struct kvm_tdx_cmd cmd;
3252 	int ret;
3253 
3254 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3255 		return -EINVAL;
3256 
3257 	ret = tdx_get_cmd(argp, &cmd);
3258 	if (ret)
3259 		return ret;
3260 
3261 	switch (cmd.id) {
3262 	case KVM_TDX_GET_CPUID:
3263 		ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3264 		break;
3265 	default:
3266 		ret = -EINVAL;
3267 		break;
3268 	}
3269 
3270 	return ret;
3271 }
3272 
3273 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3274 {
3275 	if (!is_private)
3276 		return 0;
3277 
3278 	return PG_LEVEL_4K;
3279 }
3280 
3281 void tdx_hardware_unsetup(void)
3282 {
3283 	misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3284 }
3285 
3286 static int __init __tdx_hardware_setup(void)
3287 {
3288 	const struct tdx_sys_info_td_conf *td_conf;
3289 	int i;
3290 
3291 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3292 		/*
3293 		 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3294 		 * before returning to user space.
3295 		 */
3296 		tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3297 		if (tdx_uret_msrs[i].slot == -1) {
3298 			/* If any MSR isn't supported, it is a KVM bug */
3299 			pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3300 				tdx_uret_msrs[i].msr);
3301 			return -EIO;
3302 		}
3303 	}
3304 
3305 	/* Get TDX global information for later use */
3306 	tdx_sysinfo = tdx_get_sysinfo();
3307 	if (!tdx_sysinfo)
3308 		return -ENODEV;
3309 
3310 	/* Check TDX module and KVM capabilities */
3311 	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3312 	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3313 		return -EINVAL;
3314 
3315 	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3316 		return -EINVAL;
3317 
3318 	/*
3319 	 * TDX has its own limit of maximum vCPUs it can support for all
3320 	 * TDX guests in addition to KVM_MAX_VCPUS.  Userspace needs to
3321 	 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3322 	 * extension on per-VM basis.
3323 	 *
3324 	 * TDX module reports such limit via the MAX_VCPU_PER_TD global
3325 	 * metadata.  Different modules may report different values.
3326 	 * Some old module may also not support this metadata (in which
3327 	 * case this limit is U16_MAX).
3328 	 *
3329 	 * In practice, the reported value reflects the maximum logical
3330 	 * CPUs that ALL the platforms that the module supports can
3331 	 * possibly have.
3332 	 *
3333 	 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3334 	 * result in an unpredictable ABI.  KVM instead always advertise
3335 	 * the number of logical CPUs the platform has as the maximum
3336 	 * vCPUs for TDX guests.
3337 	 *
3338 	 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3339 	 * smaller than the number of logical CPUs, otherwise KVM will
3340 	 * report an unsupported value to userspace.
3341 	 *
3342 	 * Note, a platform with TDX enabled in the BIOS cannot support
3343 	 * physical CPU hotplug, and TDX requires the BIOS has marked
3344 	 * all logical CPUs in MADT table as enabled.  Just use
3345 	 * num_present_cpus() for the number of logical CPUs.
3346 	 */
3347 	td_conf = &tdx_sysinfo->td_conf;
3348 	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3349 		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3350 				td_conf->max_vcpus_per_td, num_present_cpus());
3351 		return -EINVAL;
3352 	}
3353 
3354 	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
3355 		return -EINVAL;
3356 
3357 	return 0;
3358 }
3359 
3360 int __init tdx_hardware_setup(void)
3361 {
3362 	int r, i;
3363 
3364 	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3365 	for_each_possible_cpu(i)
3366 		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3367 
3368 	if (!enable_tdx)
3369 		return 0;
3370 
3371 	if (!enable_ept) {
3372 		pr_err("EPT is required for TDX\n");
3373 		goto success_disable_tdx;
3374 	}
3375 
3376 	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3377 		pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3378 		goto success_disable_tdx;
3379 	}
3380 
3381 	if (!enable_apicv) {
3382 		pr_err("APICv is required for TDX\n");
3383 		goto success_disable_tdx;
3384 	}
3385 
3386 	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3387 		pr_err("tdx: OSXSAVE is required for TDX\n");
3388 		goto success_disable_tdx;
3389 	}
3390 
3391 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3392 		pr_err("TDX not supported by the host platform\n");
3393 		goto success_disable_tdx;
3394 	}
3395 
3396 	r = __tdx_hardware_setup();
3397 	if (r) {
3398 		/*
3399 		 * Disable TDX only but don't fail to load module if the TDX
3400 		 * module could not be loaded.  No need to print message saying
3401 		 * "module is not loaded" because it was printed when the first
3402 		 * SEAMCALL failed.  Don't bother unwinding the S-EPT hooks or
3403 		 * vm_size, as kvm_x86_ops have already been finalized (and are
3404 		 * intentionally not exported).  The S-EPT code is unreachable,
3405 		 * and allocating a few more bytes per VM in a should-be-rare
3406 		 * failure scenario is a non-issue.
3407 		 */
3408 		if (r == -ENODEV)
3409 			goto success_disable_tdx;
3410 
3411 		return r;
3412 	}
3413 
3414 	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
3415 
3416 	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
3417 
3418 	vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
3419 	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
3420 	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
3421 	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
3422 	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
3423 	return 0;
3424 
3425 success_disable_tdx:
3426 	enable_tdx = 0;
3427 	return 0;
3428 }
3429