xref: /linux/arch/x86/kvm/vmx/tdx.c (revision 189f164e573e18d9f8876dbd3ad8fcbe11f93037)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21 
22 #pragma GCC poison to_vmx
23 
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26 
/*
 * Assert that a SEAMCALL succeeded.  On failure (non-zero @__err), mark the
 * VM as bugged (when a VM is provided) and log the SEAMCALL name, the error
 * code and any extra operands formatted by @__fmt/@__args.  The WARN fires
 * only if the VM isn't already bugged (or no VM was given), to avoid log spam
 * during teardown of an already-dead VM.  Evaluates to true iff @__err != 0.
 */
#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...)			\
({										\
	struct kvm *_kvm = (__kvm);						\
	bool __ret = !!(__err);							\
										\
	if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) {		\
		if (_kvm)							\
			kvm_vm_bugged(_kvm);					\
		pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
				   __err,  __args);				\
	}									\
	unlikely(__ret);							\
})
40 
/*
 * Wrappers around __TDX_BUG_ON() that additionally log zero to three u64
 * operands (typically output registers of the failed SEAMCALL), each printed
 * as "<name> 0x<value>".
 */
#define TDX_BUG_ON(__err, __fn, __kvm)				\
	__TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")

#define TDX_BUG_ON_1(__err, __fn, a1, __kvm)			\
	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)

#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm)	\
	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)

/*
 * Fix: the second operand's format fragment was ", " #a2 ", 0x%llx, " (stray
 * comma between name and value), rendering e.g. "rdx, 0x1"; make it match
 * TDX_BUG_ON_2's "<name> 0x<value>" form.
 */
#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm)	\
	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
		     a1, a2, a3)
53 
54 
/* Module parameter "tdx": enable TDX support; read-only after module init. */
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);

/* GFN of the shared bit for 5-level (GPA bit 51) and 4-level (bit 47) paging. */
#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))

/* cpuhp state slot for TDX's CPU hotplug callback (registered elsewhere). */
static enum cpuhp_state tdx_cpuhp_state;

/* Cached pointer to the TDX module's system metadata. */
static const struct tdx_sys_info *tdx_sysinfo;
64 
65 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
66 {
67 	KVM_BUG_ON(1, tdx->vcpu.kvm);
68 	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
69 }
70 
71 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
72 		      u64 val, u64 err)
73 {
74 	KVM_BUG_ON(1, tdx->vcpu.kvm);
75 	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
76 }
77 
78 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
79 
/* Convert a generic struct kvm to its enclosing TDX-specific wrapper. */
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_tdx, kvm);
}
84 
/* Convert a generic struct kvm_vcpu to its enclosing TDX-specific wrapper. */
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_tdx, vcpu);
}
89 
90 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
91 {
92 	u64 val = KVM_SUPPORTED_TD_ATTRS;
93 
94 	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
95 		return 0;
96 
97 	val &= td_conf->attributes_fixed0;
98 
99 	return val;
100 }
101 
102 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
103 {
104 	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
105 
106 	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
107 		return 0;
108 
109 	val &= td_conf->xfam_fixed0;
110 
111 	return val;
112 }
113 
114 static int tdx_get_guest_phys_addr_bits(const u32 eax)
115 {
116 	return (eax & GENMASK(23, 16)) >> 16;
117 }
118 
119 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
120 {
121 	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
122 }
123 
124 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
125 
126 static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
127 {
128 	return entry->function == 7 && entry->index == 0 &&
129 	       (entry->ebx & TDX_FEATURE_TSX);
130 }
131 
/* Strip the HLE/RTM (TSX) feature bits from a CPUID entry. */
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
{
	entry->ebx &= ~TDX_FEATURE_TSX;
}
136 
137 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
138 {
139 	return entry->function == 7 && entry->index == 0 &&
140 	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
141 }
142 
/* Strip the WAITPKG feature bit from a CPUID entry. */
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
{
	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
}
147 
/* Clear CPUID feature bits not supported for TDX guests (TSX, WAITPKG). */
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
{
	if (has_tsx(entry))
		clear_tsx(entry);

	if (has_waitpkg(entry))
		clear_waitpkg(entry);
}
156 
/* True if @entry advertises any feature unsupported for TDX guests. */
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
{
	return has_tsx(entry) || has_waitpkg(entry);
}
161 
162 #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
163 
164 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
165 {
166 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
167 
168 	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
169 	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
170 	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
171 	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
172 	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
173 	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
174 
175 	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
176 		entry->index = 0;
177 
178 	/*
179 	 * The TDX module doesn't allow configuring the guest phys addr bits
180 	 * (EAX[23:16]).  However, KVM uses it as an interface to the userspace
181 	 * to configure the GPAW.  Report these bits as configurable.
182 	 */
183 	if (entry->function == 0x80000008)
184 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
185 
186 	tdx_clear_unsupported_cpuid(entry);
187 }
188 
189 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT	BIT(1)
190 
191 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
192 			     struct kvm_tdx_capabilities *caps)
193 {
194 	int i;
195 
196 	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
197 	if (!caps->supported_attrs)
198 		return -EIO;
199 
200 	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
201 	if (!caps->supported_xfam)
202 		return -EIO;
203 
204 	caps->cpuid.nent = td_conf->num_cpuid_config;
205 
206 	caps->user_tdvmcallinfo_1_r11 =
207 		TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
208 
209 	for (i = 0; i < td_conf->num_cpuid_config; i++)
210 		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
211 
212 	return 0;
213 }
214 
215 /*
216  * Some SEAMCALLs acquire the TDX module globally, and can fail with
217  * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
218  */
219 static DEFINE_MUTEX(tdx_lock);
220 
/* Number of TDs currently holding a configured HKID (see tdx_hkid_free()). */
static atomic_t nr_configured_hkid;

/* True if @err is TDX_OPERAND_BUSY, ignoring the non-status detail bits. */
static bool tdx_operand_busy(u64 err)
{
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}
227 
228 
229 /*
230  * A per-CPU list of TD vCPUs associated with a given CPU.
231  * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
232  * list.
233  * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
234  *   the old CPU during the IPI callback running on the old CPU, and then added
235  *   to the per-CPU list of the new CPU.
236  * - When a TD is tearing down, all vCPUs are disassociated from their current
237  *   running CPUs and removed from the per-CPU list during the IPI callback
238  *   running on those CPUs.
239  * - When a CPU is brought down, traverse the per-CPU list to disassociate all
240  *   associated TD vCPUs and remove them from the per-CPU list.
241  */
242 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
243 
244 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
245 {
246 	return to_tdx(vcpu)->vp_enter_args.r10;
247 }
248 
249 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
250 {
251 	return to_tdx(vcpu)->vp_enter_args.r11;
252 }
253 
254 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
255 						     long val)
256 {
257 	to_tdx(vcpu)->vp_enter_args.r10 = val;
258 }
259 
260 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
261 						    unsigned long val)
262 {
263 	to_tdx(vcpu)->vp_enter_args.r11 = val;
264 }
265 
/*
 * Return the guest keyID to the allocator, drop the misc-cgroup charge, and
 * mark the TD as having no HKID (-1).
 */
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
	tdx_guest_keyid_free(kvm_tdx->hkid);
	kvm_tdx->hkid = -1;
	atomic_dec(&nr_configured_hkid);
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}
275 
/* hkid is set to -1 when freed; treat any non-positive value as unassigned. */
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
{
	return kvm_tdx->hkid > 0;
}
280 
/*
 * Remove @vcpu from the per-CPU associated-vCPU list and mark it as not
 * loaded on any CPU.  Per the list's locking rules, must run on the CPU
 * owning the list with IRQs disabled.
 */
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
	lockdep_assert_irqs_disabled();

	list_del(&to_tdx(vcpu)->cpu_list);

	/*
	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
	 * to its list before it's deleted from this CPU's list.
	 */
	smp_wmb();

	vcpu->cpu = -1;
}
296 
297 /*
298  * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
299  * retry (if necessary) after forcing vCPUs to exit and wait for the operation
300  * to complete.  All flows that remove/block S-EPT entries run with mmu_lock
301  * held for write, i.e. are mutually exclusive with each other, but they aren't
302  * mutually exclusive with running vCPUs, and so can fail with "operand busy"
303  * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
304  *
305  * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
306  */
#define tdh_do_no_vcpus(tdh_func, kvm, args...)					\
({										\
	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);				\
	u64 __err;								\
										\
	lockdep_assert_held_write(&kvm->mmu_lock);				\
										\
	__err = tdh_func(args);							\
	if (unlikely(tdx_operand_busy(__err))) {				\
		/* NOTE(review): wait_for_sept_zap appears to gate vCPU	*/	\
		/* re-entry while the retry runs; confirm vs. its readers. */	\
		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);			\
		/* Force all vCPUs out of the guest to drop module locks. */	\
		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);	\
										\
		__err = tdh_func(args);						\
										\
		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);		\
	}									\
	__err;									\
})
325 
326 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
/*
 * Reclaim a TDX-owned page via TDH.PHYMEM.PAGE.RECLAIM so the host can reuse
 * it.  Returns 0 on success, -EIO (after WARNing) on SEAMCALL failure.
 */
static int __tdx_reclaim_page(struct page *page)
{
	u64 err, rcx, rdx, r8;

	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);

	/*
	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
	 * before the HKID is released and control pages have also been
	 * released at this point, so there is no possibility of contention.
	 */
	if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
		return -EIO;

	return 0;
}
343 
/*
 * Reclaim @page and, on success, apply the post-reclaim page reset quirk so
 * the host can safely reuse the page.
 */
static int tdx_reclaim_page(struct page *page)
{
	int ret = __tdx_reclaim_page(page);

	if (!ret)
		tdx_quirk_reset_page(page);

	return ret;
}
353 
354 
355 /*
356  * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
357  * private KeyID.  Assume the cache associated with the TDX private KeyID has
358  * been flushed.
359  */
static void tdx_reclaim_control_page(struct page *ctrl_page)
{
	/*
	 * Free the page only if reclaiming succeeded; otherwise deliberately
	 * leak it, as the kernel cannot use the page safely anymore.
	 */
	if (!tdx_reclaim_page(ctrl_page))
		__free_page(ctrl_page);
}
371 
/* Argument/result pair for the tdx_flush_vp() IPI callback. */
struct tdx_flush_vp_arg {
	struct kvm_vcpu *vcpu;	/* vCPU to flush and disassociate */
	u64 err;		/* unexpected TDH.VP.FLUSH error, 0 otherwise */
};
376 
377 static void tdx_flush_vp(void *_arg)
378 {
379 	struct tdx_flush_vp_arg *arg = _arg;
380 	struct kvm_vcpu *vcpu = arg->vcpu;
381 	u64 err;
382 
383 	arg->err = 0;
384 	lockdep_assert_irqs_disabled();
385 
386 	/* Task migration can race with CPU offlining. */
387 	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
388 		return;
389 
390 	/*
391 	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
392 	 * list tracking still needs to be updated so that it's correct if/when
393 	 * the vCPU does get initialized.
394 	 */
395 	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
396 		/*
397 		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are:
398 		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
399 		 * vp flush function is called when destructing vCPU/TD or vCPU
400 		 * migration.  No other thread uses TDVPR in those cases.
401 		 */
402 		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
403 		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
404 			/*
405 			 * This function is called in IPI context. Do not use
406 			 * printk to avoid console semaphore.
407 			 * The caller prints out the error message, instead.
408 			 */
409 			if (err)
410 				arg->err = err;
411 		}
412 	}
413 
414 	tdx_disassociate_vp(vcpu);
415 }
416 
/*
 * Flush and disassociate @vcpu from the pCPU it's currently associated with,
 * via an IPI to that CPU.  No-op if the vCPU isn't associated (cpu == -1).
 */
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	struct tdx_flush_vp_arg arg = {
		.vcpu = vcpu,
	};
	int cpu = vcpu->cpu;

	if (unlikely(cpu == -1))
		return;

	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);

	/* Now out of IPI context, it's safe to log the error (if any). */
	TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}
431 
/*
 * CPU-offline/disable hook: flush and disassociate every TD vCPU still on
 * this CPU's associated-vCPU list, then write back caches in case of kexec.
 */
void tdx_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
	struct tdx_flush_vp_arg arg;
	struct vcpu_tdx *tdx, *tmp;
	unsigned long flags;

	/* The per-CPU list is protected by disabling interrupts. */
	local_irq_save(flags);
	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
		arg.vcpu = &tdx->vcpu;
		tdx_flush_vp(&arg);
	}
	local_irq_restore(flags);

	/*
	 * Flush cache now if kexec is possible: this is necessary to avoid
	 * having dirty private memory cachelines when the new kernel boots,
	 * but WBINVD is a relatively expensive operation and doing it during
	 * kexec can exacerbate races in native_stop_other_cpus().  Do it
	 * now, since this is a safe moment and there is going to be no more
	 * TDX activity on this CPU from this point on.
	 */
	tdx_cpu_flush_cache_for_kexec();
}
458 
459 #define TDX_SEAMCALL_RETRIES 10000
460 
/*
 * Run TDH.PHYMEM.CACHE.WB on this CPU, resuming the partial write-back until
 * it completes, another thread already finished it, or the retry budget is
 * exhausted (in which case the trailing TDX_BUG_ON fires).
 */
static void smp_func_do_phymem_cache_wb(void *unused)
{
	u64 err = 0;
	bool resume;
	int i;

	/*
	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
	 * KeyID on the package or core.  The TDX module may not finish the
	 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
	 * kernel should retry it until it returns success w/o rescheduling.
	 */
	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
		/* First iteration starts fresh; later ones resume. */
		resume = !!err;
		err = tdh_phymem_cache_wb(resume);
		switch (err) {
		case TDX_INTERRUPTED_RESUMABLE:
			continue;
		case TDX_NO_HKID_READY_TO_WBCACHE:
			err = TDX_SUCCESS; /* Already done by other thread */
			fallthrough;
		default:
			goto out;
		}
	}

out:
	TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}
490 
/*
 * Release the TD's private keyID: flush all vCPU contexts (TDH.VP.FLUSH and
 * TDH.MNG.VPFLUSHDONE), write back caches on one CPU per package — or every
 * CPU if cpumask allocation failed — (TDH.PHYMEM.CACHE.WB), then free the
 * HKID (TDH.MNG.KEY.FREEID).  On failure the HKID is leaked and remains
 * "assigned", which blocks later page reclaim.
 */
void tdx_mmu_release_hkid(struct kvm *kvm)
{
	bool packages_allocated, targets_allocated;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages, targets;
	struct kvm_vcpu *vcpu;
	unsigned long j;
	int i;
	u64 err;

	if (!is_hkid_assigned(kvm_tdx))
		return;

	/* Allocation failure is tolerated; the fallback hits every CPU. */
	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
	cpus_read_lock();

	kvm_for_each_vcpu(j, vcpu, kvm)
		tdx_flush_vp_on_cpu(vcpu);

	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
	 * Multiple TDX guests can be destroyed simultaneously. Take the
	 * mutex to prevent it from getting error.
	 */
	mutex_lock(&tdx_lock);

	/*
	 * Releasing HKID is in vm_destroy().
	 * After the above flushing vps, there should be no more vCPU
	 * associations, as all vCPU fds have been released at this stage.
	 */
	err = tdh_mng_vpflushdone(&kvm_tdx->td);
	if (err == TDX_FLUSHVP_NOT_DONE)
		goto out;
	if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		goto out;
	}

	/* Pick one CPU per physical package to do the cache write-back. */
	for_each_online_cpu(i) {
		if (packages_allocated &&
		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
					     packages))
			continue;
		if (targets_allocated)
			cpumask_set_cpu(i, targets);
	}
	if (targets_allocated)
		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
	else
		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
	/*
	 * In the case of error in smp_func_do_phymem_cache_wb(), the following
	 * tdh_mng_key_freeid() will fail.
	 */
	err = tdh_mng_key_freeid(&kvm_tdx->td);
	if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
	} else {
		tdx_hkid_free(kvm_tdx);
	}

out:
	mutex_unlock(&tdx_lock);
	cpus_read_unlock();
	free_cpumask_var(targets);
	free_cpumask_var(packages);
}
563 
/*
 * Reclaim and free the TD's control structures: the TDCS pages first, then
 * the TDR page.  Requires the HKID to have been released; otherwise the
 * pages are deliberately leaked.
 */
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;
	int i;

	/*
	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
	 * heavily with TDX module.  Give up freeing TD pages.  As the function
	 * already warned, don't warn it again.
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (kvm_tdx->td.tdcs_pages) {
		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
			if (!kvm_tdx->td.tdcs_pages[i])
				continue;

			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
		}
		kfree(kvm_tdx->td.tdcs_pages);
		kvm_tdx->td.tdcs_pages = NULL;
	}

	if (!kvm_tdx->td.tdr_page)
		return;

	/* On failure, leak the TDR page (same policy as control pages). */
	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
		return;

	/*
	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
	 * KeyID. TDX module may access TDR while operating on TD (Especially
	 * when it is reclaiming TDCS).
	 */
	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
		return;

	tdx_quirk_reset_page(kvm_tdx->td.tdr_page);

	__free_page(kvm_tdx->td.tdr_page);
	kvm_tdx->td.tdr_page = NULL;
}
609 
610 void tdx_vm_destroy(struct kvm *kvm)
611 {
612 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
613 
614 	tdx_reclaim_td_control_pages(kvm);
615 
616 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
617 }
618 
619 static int tdx_do_tdh_mng_key_config(void *param)
620 {
621 	struct kvm_tdx *kvm_tdx = param;
622 	u64 err;
623 
624 	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
625 	err = tdh_mng_key_config(&kvm_tdx->td);
626 	if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
627 		return -EIO;
628 
629 	return 0;
630 }
631 
/*
 * VM-scope TDX setup at KVM_CREATE_VM: flag the protected-state/EOI/private
 * memory properties, configure the MMIO SPTE value to inject #VE, and clamp
 * max_vcpus.  SEAMCALL-level TD initialization happens later, at
 * KVM_TDX_INIT_VM (see the life cycle comment further down).
 */
int tdx_vm_init(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	kvm->arch.has_protected_state = true;
	/*
	 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
	 * i.e. all EOIs are accelerated and never trigger exits.
	 */
	kvm->arch.has_protected_eoi = true;
	kvm->arch.has_private_mem = true;
	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	/*
	 * Because guest TD is protected, VMM can't parse the instruction in TD.
	 * Instead, guest uses MMIO hypercall.  For unmodified device driver,
	 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
	 * instruction into MMIO hypercall.
	 *
	 * SPTE value for MMIO needs to be setup so that #VE is injected into
	 * TD instead of triggering EPT MISCONFIG.
	 * - RWX=0 so that EPT violation is triggered.
	 * - suppress #VE bit is cleared to inject #VE.
	 */
	kvm_mmu_set_mmio_spte_value(kvm, 0);

	/*
	 * TDX has its own limit of maximum vCPUs it can support for all
	 * TDX guests in addition to KVM_MAX_VCPUS.  TDX module reports
	 * such limit via the MAX_VCPU_PER_TD global metadata.  In
	 * practice, it reflects the number of logical CPUs that ALL
	 * platforms that the TDX module supports can possibly have.
	 *
	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
	 * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
	 * userspace would result in an unpredictable ABI.
	 */
	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());

	kvm_tdx->state = TD_STATE_UNINITIALIZED;

	return 0;
}
675 
/*
 * KVM_CREATE_VCPU for TDX: validate the VM and irqchip configuration, then
 * initialize the vCPU fields that are fixed for TDX or managed by the TDX
 * module (EFER, TSC, guest-owned CR bits, posted-interrupt descriptor).
 * The vCPU stays VCPU_TD_STATE_UNINITIALIZED until KVM_TDX_INIT_VCPU.
 */
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	/* vCPUs may only be created once the TD itself is initialized. */
	if (kvm_tdx->state != TD_STATE_INITIALIZED)
		return -EIO;

	/*
	 * TDX module mandates APICv, which requires an in-kernel local APIC.
	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
	 */
	if (!irqchip_split(vcpu->kvm))
		return -EINVAL;

	fpstate_set_confidential(&vcpu->arch.guest_fpu);
	vcpu->arch.apic->guest_apic_protected = true;
	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);

	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;

	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
	vcpu->arch.cr0_guest_owned_bits = -1ul;
	vcpu->arch.cr4_guest_owned_bits = -1ul;

	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
	vcpu->arch.guest_tsc_protected = true;
	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

	/* A debuggable TD (ATTR_DEBUG) exposes its state to the host. */
	vcpu->arch.guest_state_protected =
		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);

	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
		vcpu->arch.xfd_no_write_intercept = true;

	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&tdx->vt.pi_desc);

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;

	return 0;
}
722 
/*
 * Load @vcpu on @cpu: if the vCPU was last associated with a different CPU,
 * flush it there first, then add it to this CPU's associated-vCPU list.
 */
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	vmx_vcpu_pi_load(vcpu, cpu);
	/* Nothing to migrate if already here, or if the TD has no HKID. */
	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
		return;

	tdx_flush_vp_on_cpu(vcpu);

	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
	local_irq_disable();
	/*
	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
	 * vcpu->cpu is read before tdx->cpu_list.
	 */
	smp_rmb();

	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();
}
744 
745 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
746 {
747 	/*
748 	 * KVM can't get the interrupt status of TDX guest and it assumes
749 	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
750 	 * which passes the interrupt blocked flag.
751 	 */
752 	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
753 	       !to_tdx(vcpu)->vp_enter_args.r12;
754 }
755 
/*
 * With a protected APIC, KVM can't inspect guest interrupt state directly:
 * check the posted-interrupt descriptor first and, for an interruptible HLT
 * exit, query the TDX module's non-architectural vCPU state details.
 */
static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	u64 vcpu_state_details;

	if (pi_has_pending_interrupt(vcpu))
		return true;

	/*
	 * Only check RVI pending for HALTED case with IRQ enabled.
	 * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
	 * interrupt was pending before TD exit, then it _must_ be blocked,
	 * otherwise the interrupt would have been serviced at the instruction
	 * boundary.
	 */
	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	    to_tdx(vcpu)->vp_enter_args.r12)
		return false;

	vcpu_state_details =
		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);

	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}
779 
/* A host user-return MSR whose value the TDX-Module changes on TD-Exit. */
struct tdx_uret_msr {
	u32 msr;		/* MSR index */
	unsigned int slot;	/* user-return MSR slot (initialized elsewhere) */
	u64 defval;		/* value the TDX-Module writes on TD-Exit */
};

/*
 * MSRs to reconcile with KVM's user-return MSR framework after VP.ENTER;
 * see tdx_prepare_switch_to_guest().
 */
static struct tdx_uret_msr tdx_uret_msrs[] = {
	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
	{.msr = MSR_STAR,},
	{.msr = MSR_LSTAR,},
	{.msr = MSR_TSC_AUX,},
};
792 
/*
 * Lazily save host state before entering the TD vCPU.  Only
 * MSR_KERNEL_GS_BASE needs an explicit save; the user-return MSRs are
 * instead pre-registered with their post-TD-Exit values (see below).
 */
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	int i;

	if (vt->guest_state_loaded)
		return;

	/* For a 64-bit task the kernel caches gsbase; otherwise read the MSR. */
	if (likely(is_64bit_mm(current->mm)))
		vt->msr_host_kernel_gs_base = current->thread.gsbase;
	else
		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);

	vt->guest_state_loaded = true;

	/*
	 * Explicitly set user-return MSRs that are clobbered by the TDX-Module
	 * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
	 * written by the TDX-Module.  Don't rely on the TDX-Module to actually
	 * clobber the MSRs, as the contract is poorly defined and not upheld.
	 * E.g. the TDX-Module will synthesize an EPT Violation without doing
	 * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
	 * state.
	 */
	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
		kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
					tdx_uret_msrs[i].defval, -1ull);
}
821 
/* Restore the host state saved by tdx_prepare_switch_to_guest(). */
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);

	if (!vt->guest_state_loaded)
		return;

	++vcpu->stat.host_state_reload;
	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);

	vt->guest_state_loaded = false;
}
834 
/* vCPU put: update posted-interrupt bookkeeping and restore host state. */
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);
	tdx_prepare_switch_to_host(vcpu);
}
840 
841 /*
842  * Life cycles for a TD and a vCPU:
843  * 1. KVM_CREATE_VM ioctl.
844  *    TD state is TD_STATE_UNINITIALIZED.
845  *    hkid is not assigned at this stage.
846  * 2. KVM_TDX_INIT_VM ioctl.
847  *    TD transitions to TD_STATE_INITIALIZED.
848  *    hkid is assigned after this stage.
849  * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
850  *    3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
851  *    3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
852  *    3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
853  *        kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
854  * 4. KVM_TDX_INIT_VCPU ioctl.
855  *    tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
856  *    vCPU control structures are allocated at this stage.
857  * 5. kvm_destroy_vm().
858  *    5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
859  *                                (2) puts hkid to !assigned state.
860  *    5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
861  *        transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
862  *    5.3 tdx_vm_destroy()
863  *        transitions TD to TD_STATE_UNINITIALIZED state.
864  *
865  * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
866  * - If at 3.3, hkid is still assigned, but the vCPU must be in
867  *   VCPU_TD_STATE_UNINITIALIZED state.
868  * - if at 5.2, hkid must be !assigned and all vCPUs must be in
869  *   VCPU_TD_STATE_INITIALIZED state and have been dissociated.
870  */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int i;

	/*
	 * Still associated with a pCPU: this is the early-error path (3.3 in
	 * the life cycle comment above); the vCPU must not have reached
	 * VCPU_TD_STATE_INITIALIZED, and only needs to be disassociated.
	 */
	if (vcpu->cpu != -1) {
		KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
		tdx_flush_vp_on_cpu(vcpu);
		return;
	}

	/*
	 * It is not possible to reclaim pages while hkid is assigned. It might
	 * be assigned if the TD VM is being destroyed but freeing hkid failed,
	 * in which case the pages are leaked.
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	/* Reclaim and free the TDCX pages, then the TDVPR page. */
	if (tdx->vp.tdcx_pages) {
		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
			if (tdx->vp.tdcx_pages[i])
				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
		}
		kfree(tdx->vp.tdcx_pages);
		tdx->vp.tdcx_pages = NULL;
	}
	if (tdx->vp.tdvpr_page) {
		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
		tdx->vp.tdvpr_page = NULL;
		tdx->vp.tdvpr_pa = 0;
	}

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}
907 
908 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
909 {
910 	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
911 		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
912 		return -EINVAL;
913 
914 	return 1;
915 }
916 
917 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
918 {
919 	switch (tdvmcall_leaf(vcpu)) {
920 	case EXIT_REASON_CPUID:
921 	case EXIT_REASON_HLT:
922 	case EXIT_REASON_IO_INSTRUCTION:
923 	case EXIT_REASON_MSR_READ:
924 	case EXIT_REASON_MSR_WRITE:
925 		return tdvmcall_leaf(vcpu);
926 	case EXIT_REASON_EPT_VIOLATION:
927 		return EXIT_REASON_EPT_MISCONFIG;
928 	default:
929 		break;
930 	}
931 
932 	return EXIT_REASON_TDCALL;
933 }
934 
/*
 * Derive the synthetic VMX exit reason from the TDH.VP.ENTER return value.
 * Returns -1u for statuses KVM can't translate here; such exits are dealt
 * with later, outside the noinstr entry path.
 */
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u32 exit_reason;

	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
	case TDX_SUCCESS:
	case TDX_NON_RECOVERABLE_VCPU:
	case TDX_NON_RECOVERABLE_TD:
	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
		break;
	default:
		return -1u;
	}

	/* The low bits of vp_enter_ret carry the basic exit reason. */
	exit_reason = tdx->vp_enter_ret;

	switch (exit_reason) {
	case EXIT_REASON_TDCALL:
		if (tdvmcall_exit_type(vcpu))
			return EXIT_REASON_VMCALL;

		return tdcall_to_vmx_exit_reason(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
		 * non-instrumentable code with interrupts disabled.
		 */
		return -1u;
	default:
		break;
	}

	return exit_reason;
}
971 
/*
 * Perform TDH.VP.ENTER and unpack the exit information from vp_enter_args.
 * Runs in the noinstr guest-state transition window with IRQs disabled; no
 * instrumentable code may be called here.
 */
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	/* Exit info is returned by the TDX-Module in rcx/rdx/r8/r9. */
	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	/* Handle an NMI that arrived while in the guest before enabling IRQs. */
	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}
992 
993 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
994 {
995 	return vmx_get_exit_reason(vcpu).failed_vmentry &&
996 	       vmx_get_exit_reason(vcpu).full != -1u;
997 }
998 
999 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
1000 {
1001 	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
1002 
1003 	/*
1004 	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
1005 	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
1006 	 *
1007 	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
1008 	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
1009 	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
1010 	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
1011 	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
1012 	 * requester may be blocked endlessly.
1013 	 */
1014 	if (unlikely(tdx_operand_busy(vp_enter_ret)))
1015 		return EXIT_FASTPATH_EXIT_HANDLED;
1016 
1017 	return EXIT_FASTPATH_NONE;
1018 }
1019 
1020 #define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
1021 				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
1022 				 BIT_ULL(VCPU_REGS_RAX) | \
1023 				 BIT_ULL(VCPU_REGS_RBX) | \
1024 				 BIT_ULL(VCPU_REGS_RCX) | \
1025 				 BIT_ULL(VCPU_REGS_RDX) | \
1026 				 BIT_ULL(VCPU_REGS_RBP) | \
1027 				 BIT_ULL(VCPU_REGS_RSI) | \
1028 				 BIT_ULL(VCPU_REGS_RDI) | \
1029 				 BIT_ULL(VCPU_REGS_R8) | \
1030 				 BIT_ULL(VCPU_REGS_R9) | \
1031 				 BIT_ULL(VCPU_REGS_R10) | \
1032 				 BIT_ULL(VCPU_REGS_R11) | \
1033 				 BIT_ULL(VCPU_REGS_R12) | \
1034 				 BIT_ULL(VCPU_REGS_R13) | \
1035 				 BIT_ULL(VCPU_REGS_R14) | \
1036 				 BIT_ULL(VCPU_REGS_R15))
1037 
/*
 * Restore host PKRU, XCR0, and XSS after a TD exit.  The TDX module loads
 * guest values (derived from the TD's XFAM) on TDH.VP.ENTER, so only write
 * back the host values that can actually differ.
 */
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	/* The guest's effective XCR0 is its XFAM clamped to KVM support. */
	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX host didn't support XSS, both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
1060 
1061 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1062 				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1063 				DEBUGCTLMSR_FREEZE_IN_SMM)
1064 
/*
 * Run the TD vCPU once: do pre-entry posted-interrupt bookkeeping, enter the
 * guest via tdx_vcpu_enter_exit(), restore host state, and classify the exit
 * for fastpath handling.
 */
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * WARN if KVM wants to force an immediate exit, as the TDX module does
	 * not guarantee entry into the guest, i.e. it's possible for KVM to
	 * _think_ it completed entry to the guest and forced an immediate exit
	 * without actually having done so.  Luckily, KVM never needs to force
	 * an immediate exit for TDX (KVM can't do direct event injection), so
	 * just WARN and continue on.
	 */
	WARN_ON_ONCE(run_flags);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
	 */
	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);

	if (pi_test_on(&vt->pi_desc)) {
		apic->send_IPI_self(POSTED_INTR_VECTOR);

		/* If the LVTT (timer) vector is pending, honor timer advance. */
		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
			       APIC_VECTOR_MASK, &vt->pi_desc))
			kvm_wait_lapic_expire(vcpu);
	}

	tdx_vcpu_enter_exit(vcpu);

	/* Restore host DEBUGCTL bits the TDX module doesn't preserve. */
	if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	tdx_load_host_xsave_state(vcpu);

	/* Only the registers captured from vp_enter_args are valid. */
	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

	/* EPT misconfig is KVM_BUG'ed in tdx_handle_exit(); skip tracing. */
	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
		return EXIT_FASTPATH_NONE;

	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
		return EXIT_FASTPATH_NONE;

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(tdx_failed_vmentry(vcpu)))
		return EXIT_FASTPATH_NONE;

	return tdx_exit_handlers_fastpath(vcpu);
}
1120 
/*
 * Inject an NMI into the TD by asking the TDX module to pend one via the
 * TD_VCPU_PEND_NMI management field; actual delivery is done by the module.
 */
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.nmi_injections;
	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
	/*
	 * From KVM's perspective, NMI injection is completed right after
	 * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
	 * the TDX module or not.
	 */
	vcpu->arch.nmi_injected = false;
	/*
	 * TDX doesn't support KVM to request NMI window exit.  If there is
	 * still a pending vNMI, KVM is not able to inject it along with the
	 * one pending in TDX module in a back-to-back way.  Since the previous
	 * vNMI is still pending in TDX module, i.e. it has not been delivered
	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
	 * previous one.  The guest is expected to handle all the NMI sources
	 * when handling the first vNMI.
	 */
	vcpu->arch.nmi_pending = 0;
}
1142 
1143 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1144 {
1145 	u32 intr_info = vmx_get_intr_info(vcpu);
1146 
1147 	/*
1148 	 * Machine checks are handled by handle_exception_irqoff(), or by
1149 	 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1150 	 * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
1151 	 */
1152 	if (is_nmi(intr_info) || is_machine_check(intr_info))
1153 		return 1;
1154 
1155 	vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1156 	vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1157 	vcpu->run->ex.error_code = 0;
1158 
1159 	return 0;
1160 }
1161 
1162 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1163 {
1164 	tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1165 	return 1;
1166 }
1167 
1168 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1169 {
1170 	kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1171 	kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1172 	kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1173 	kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1174 	kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1175 
1176 	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1177 }
1178 
1179 /*
1180  * Split into chunks and check interrupt pending between chunks.  This allows
1181  * for timely injection of interrupts to prevent issues with guest lockup
1182  * detection.
1183  */
1184 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1185 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1186 
/*
 * Userspace completion callback for one KVM_HC_MAP_GPA_RANGE chunk of a
 * TDVMCALL<MapGPA>: on success, advance to the next chunk or finish; on
 * failure or a pending event, return control to the guest with the GPA at
 * which to resume in R11.
 */
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (vcpu->run->hypercall.ret) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
		return 1;
	}

	tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
	if (tdx->map_gpa_next >= tdx->map_gpa_end)
		return 1;

	/*
	 * Stop processing the remaining part if there is a pending interrupt,
	 * which could be qualified to deliver.  Skip checking pending RVI for
	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
	 */
	if (kvm_vcpu_has_events(vcpu)) {
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
		return 1;
	}

	/* Exit to userspace again for the next chunk. */
	__tdx_map_gpa(tdx);
	return 0;
}
1215 
/*
 * Set up a KVM_EXIT_HYPERCALL (KVM_HC_MAP_GPA_RANGE) exit to userspace for
 * the next chunk (at most TDX_MAP_GPA_MAX_LEN) of the in-progress MapGPA.
 */
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
{
	u64 gpa = tdx->map_gpa_next;
	u64 size = tdx->map_gpa_end - tdx->map_gpa_next;

	if (size > TDX_MAP_GPA_MAX_LEN)
		size = TDX_MAP_GPA_MAX_LEN;

	tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
	tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
	/*
	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
	 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
	 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
	 */
	tdx->vcpu.run->hypercall.ret = 0;
	/* args[0] is the GPA with the shared bit stripped off. */
	tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
	tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
	tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
					   KVM_MAP_GPA_RANGE_ENCRYPTED :
					   KVM_MAP_GPA_RANGE_DECRYPTED;
	tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;

	tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
}
1242 
/*
 * Handle TDVMCALL<MapGPA>: validate the requested range, then convert it to
 * one or more KVM_HC_MAP_GPA_RANGE userspace exits via __tdx_map_gpa().
 */
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 gpa = tdx->vp_enter_args.r12;
	u64 size = tdx->vp_enter_args.r13;
	u64 ret;

	/*
	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
	 * bit set.  This is a base call so it should always be supported, but
	 * KVM has no way to ensure that userspace implements the GHCI correctly.
	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
	 * to the guest.
	 */
	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
		goto error;
	}

	/*
	 * Reject zero-size or wrapping ranges (gpa + size <= gpa), illegal
	 * GPAs, and ranges whose endpoints straddle the private/shared split.
	 */
	if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
	    (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
	     vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
		ret = TDVMCALL_STATUS_INVALID_OPERAND;
		goto error;
	}

	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
		ret = TDVMCALL_STATUS_ALIGN_ERROR;
		goto error;
	}

	/* Record the range; it is processed in chunks across userspace exits. */
	tdx->map_gpa_end = gpa + size;
	tdx->map_gpa_next = gpa;

	__tdx_map_gpa(tdx);
	return 0;

error:
	tdvmcall_set_return_code(vcpu, ret);
	tdx->vp_enter_args.r11 = gpa;
	return 1;
}
1287 
/*
 * Handle TDVMCALL<ReportFatalError> by forwarding the guest's register dump
 * to userspace as a KVM_SYSTEM_EVENT_TDX_FATAL system event.
 */
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 *regs = vcpu->run->system_event.data;
	u64 *module_regs = &tdx->vp_enter_args.r8;
	int index = VCPU_REGS_RAX;

	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
	vcpu->run->system_event.ndata = 16;

	/* Dump 16 general-purpose registers to userspace in ascending order. */
	regs[index++] = tdx->vp_enter_ret;
	regs[index++] = tdx->vp_enter_args.rcx;
	regs[index++] = tdx->vp_enter_args.rdx;
	regs[index++] = tdx->vp_enter_args.rbx;
	/* The two zeroed slots are RSP/RBP, which aren't passed through. */
	regs[index++] = 0;
	regs[index++] = 0;
	regs[index++] = tdx->vp_enter_args.rsi;
	regs[index] = tdx->vp_enter_args.rdi;
	/* R8-R15 arrive contiguously in vp_enter_args starting at r8. */
	for (index = 0; index < 8; index++)
		regs[VCPU_REGS_R8 + index] = module_regs[index];

	return 0;
}
1313 
/*
 * Handle TDVMCALL<CPUID>: run KVM's CPUID emulation with the leaf/subleaf
 * from R12/R13 and return EAX/EBX/ECX/EDX in R12-R15.
 */
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	u32 eax, ebx, ecx, edx;
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	/* EAX and ECX for cpuid is stored in R12 and R13. */
	eax = tdx->vp_enter_args.r12;
	ecx = tdx->vp_enter_args.r13;

	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);

	tdx->vp_enter_args.r12 = eax;
	tdx->vp_enter_args.r13 = ebx;
	tdx->vp_enter_args.r14 = ecx;
	tdx->vp_enter_args.r15 = edx;

	return 1;
}
1332 
/* Userspace finished a port OUT; clear the in-flight PIO state and resume. */
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
{
	vcpu->arch.pio.count = 0;
	return 1;
}
1338 
/*
 * Userspace finished a port IN; re-run the emulated PIO to pull the buffered
 * value and hand it back to the guest via the TDVMCALL return value.
 */
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
{
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
	unsigned long val = 0;
	int ret;

	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
					 vcpu->arch.pio.port, &val, 1);

	/* Must complete: the data was already buffered by userspace. */
	WARN_ON_ONCE(!ret);

	tdvmcall_set_return_val(vcpu, val);

	return 1;
}
1354 
1355 static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1356 {
1357 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1358 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1359 	unsigned long val = 0;
1360 	unsigned int port;
1361 	u64 size, write;
1362 	int ret;
1363 
1364 	++vcpu->stat.io_exits;
1365 
1366 	size = tdx->vp_enter_args.r12;
1367 	write = tdx->vp_enter_args.r13;
1368 	port = tdx->vp_enter_args.r14;
1369 
1370 	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1371 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1372 		return 1;
1373 	}
1374 
1375 	if (write) {
1376 		val = tdx->vp_enter_args.r15;
1377 		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1378 	} else {
1379 		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1380 	}
1381 
1382 	if (!ret)
1383 		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1384 							   tdx_complete_pio_in;
1385 	else if (!write)
1386 		tdvmcall_set_return_val(vcpu, val);
1387 
1388 	return ret;
1389 }
1390 
1391 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1392 {
1393 	unsigned long val = 0;
1394 	gpa_t gpa;
1395 	int size;
1396 
1397 	gpa = vcpu->mmio_fragments[0].gpa;
1398 	size = vcpu->mmio_fragments[0].len;
1399 
1400 	memcpy(&val, vcpu->run->mmio.data, size);
1401 	tdvmcall_set_return_val(vcpu, val);
1402 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1403 	return 1;
1404 }
1405 
/*
 * Attempt an in-kernel MMIO write.  Returns 0 on success, -EOPNOTSUPP if no
 * in-kernel device claims the GPA (caller then exits to userspace).
 */
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
				 unsigned long val)
{
	/* Fast MMIO path: a zero-length write satisfies eventfd-style devices. */
	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return 0;
	}

	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
	if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
		return -EOPNOTSUPP;

	return 0;
}
1420 
/*
 * Attempt an in-kernel MMIO read.  Returns 0 on success (value handed back
 * via the TDVMCALL return value), -EOPNOTSUPP if no device claims the GPA.
 */
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
{
	unsigned long val;

	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
		return -EOPNOTSUPP;

	tdvmcall_set_return_val(vcpu, val);
	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
	return 0;
}
1432 
/*
 * Handle TDVMCALL<MMIO>: validate the request, try in-kernel device
 * emulation, and fall back to a KVM_EXIT_MMIO userspace exit.
 */
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int size, write, r;
	unsigned long val;
	gpa_t gpa;

	size = tdx->vp_enter_args.r12;
	write = tdx->vp_enter_args.r13;
	gpa = tdx->vp_enter_args.r14;
	val = write ? tdx->vp_enter_args.r15 : 0;

	if (size != 1 && size != 2 && size != 4 && size != 8)
		goto error;
	if (write != 0 && write != 1)
		goto error;

	/*
	 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
	 * do MMIO emulation for private GPA.
	 */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
		goto error;

	/* Strip the shared bit before handing the GPA to device emulation. */
	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));

	if (write)
		r = tdx_mmio_write(vcpu, gpa, size, val);
	else
		r = tdx_mmio_read(vcpu, gpa, size);
	if (!r)
		/* Kernel completed device emulation. */
		return 1;

	/* Request the device emulation to userspace device model. */
	vcpu->mmio_is_write = write;
	if (!write)
		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = size;
	vcpu->run->mmio.is_write = write;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	if (write) {
		memcpy(vcpu->run->mmio.data, &val, size);
	} else {
		/* Stash the request so tdx_complete_mmio_read() can finish it. */
		vcpu->mmio_fragments[0].gpa = gpa;
		vcpu->mmio_fragments[0].len = size;
		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
	}
	return 0;

error:
	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
	return 1;
}
1491 
/*
 * Userspace completion callback for GetTdVmCallInfo leaf 1: copy the
 * userspace-provided feature bitmaps back into the TDVMCALL output regs.
 */
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);

	/*
	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
	 * directly without the support from userspace, just set the value
	 * returned from userspace.
	 */
	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;

	return 1;
}
1510 
/*
 * Handle TDVMCALL<GetTdVmCallInfo>: leaf 0 (base GHCI set) is answered
 * in-kernel with all-zero bitmaps, leaf 1 is forwarded to userspace, and any
 * other leaf is an invalid operand.
 */
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	switch (tdx->vp_enter_args.r12) {
	case 0:
		/* KVM supports no optional TDVMCALLs itself; report none. */
		tdx->vp_enter_args.r11 = 0;
		tdx->vp_enter_args.r12 = 0;
		tdx->vp_enter_args.r13 = 0;
		tdx->vp_enter_args.r14 = 0;
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
		return 1;
	case 1:
		/* Let userspace report the leaf-1 feature bitmaps. */
		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
		vcpu->run->exit_reason = KVM_EXIT_TDX;
		vcpu->run->tdx.flags = 0;
		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
		return 0;
	default:
		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
		return 1;
	}
}
1540 
/* Generic completion: propagate userspace's KVM_EXIT_TDX result code. */
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
{
	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
	return 1;
}
1546 
1547 static int tdx_get_quote(struct kvm_vcpu *vcpu)
1548 {
1549 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1550 	u64 gpa = tdx->vp_enter_args.r12;
1551 	u64 size = tdx->vp_enter_args.r13;
1552 
1553 	/* The gpa of buffer must have shared bit set. */
1554 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1555 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1556 		return 1;
1557 	}
1558 
1559 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1560 	vcpu->run->tdx.flags = 0;
1561 	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1562 	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1563 	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1564 	vcpu->run->tdx.get_quote.size = size;
1565 
1566 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1567 
1568 	return 0;
1569 }
1570 
1571 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1572 {
1573 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1574 	u64 vector = tdx->vp_enter_args.r12;
1575 
1576 	if (vector < 32 || vector > 255) {
1577 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1578 		return 1;
1579 	}
1580 
1581 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1582 	vcpu->run->tdx.flags = 0;
1583 	vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1584 	vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1585 	vcpu->run->tdx.setup_event_notify.vector = vector;
1586 
1587 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1588 
1589 	return 0;
1590 }
1591 
/*
 * Dispatch a TDG.VP.VMCALL leaf to its handler; unknown leaves fail with
 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED.
 */
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case TDVMCALL_MAP_GPA:
		return tdx_map_gpa(vcpu);
	case TDVMCALL_REPORT_FATAL_ERROR:
		return tdx_report_fatal_error(vcpu);
	case TDVMCALL_GET_TD_VM_CALL_INFO:
		return tdx_get_td_vm_call_info(vcpu);
	case TDVMCALL_GET_QUOTE:
		return tdx_get_quote(vcpu);
	case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
		return tdx_setup_event_notify_interrupt(vcpu);
	default:
		break;
	}

	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
	return 1;
}
1612 
/*
 * Load the shared-EPT root for the vCPU; the private half of the GPA space
 * is managed by the TDX module, so only SHARED_EPT_POINTER is written.
 */
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
			  TDX_SHARED_BIT_PWL_4;

	/* The shared bit implied by the paging level must match the VM's. */
	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
		return;

	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
1623 
/*
 * Add a page to the TD's initial memory image via TDH.MEM.PAGE.ADD, copying
 * its contents from the staged source page (page_add_src).  Only legal
 * before the TD is finalized.
 */
static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
			    kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err, entry, level_state;
	gpa_t gpa = gfn_to_gpa(gfn);

	lockdep_assert_held(&kvm->slots_lock);

	/* PAGE.ADD is only valid during image build, with a staged source. */
	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
	    KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
		return -EIO;

	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
			       kvm_tdx->page_add_src, &entry, &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
		return -EIO;

	return 0;
}
1647 
/*
 * Add a (zeroed) page to a running TD via TDH.MEM.PAGE.AUG; the guest must
 * accept it before use.  -EBUSY signals contention to be retried by caller.
 */
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, kvm_pfn_t pfn)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct page *page = pfn_to_page(pfn);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 entry, level_state;
	u64 err;

	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
		return -EIO;

	return 0;
}
1667 
/*
 * Install a private-EPT mapping for a mirror SPTE, using PAGE.ADD while the
 * TD image is being built and PAGE.AUG once the TD is runnable.
 */
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, u64 mirror_spte)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	kvm_pfn_t pfn = spte_to_pfn(mirror_spte);

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EIO;

	/* Mirror SPTEs are expected to be present and fully RWX. */
	WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
		     (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);

	/*
	 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
	 * before kvm_tdx->state.  Userspace must not be allowed to pre-fault
	 * arbitrary memory until the initial memory image is finalized.  Pairs
	 * with the smp_wmb() in tdx_td_finalize().
	 */
	smp_rmb();

	/*
	 * If the TD isn't finalized/runnable, then userspace is initializing
	 * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
	 */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return tdx_mem_page_add(kvm, gfn, level, pfn);

	return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
1698 
/*
 * Link a new S-EPT page-table page into the TD via TDH.MEM.SEPT.ADD.
 * -EBUSY signals contention to be retried by the caller.
 */
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, void *private_spt)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	gpa_t gpa = gfn_to_gpa(gfn);
	struct page *page = virt_to_page(private_spt);
	u64 err, entry, level_state;

	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
			       &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
		return -EIO;

	return 0;
}
1717 
1718 /*
1719  * Ensure shared and private EPTs to be flushed on all vCPUs.
1720  * tdh_mem_track() is the only caller that increases TD epoch. An increase in
1721  * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1722  * running in guest mode with the value "N - 1".
1723  *
1724  * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1725  * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1726  * being increased to "N + 1".
1727  *
1728  * Kicking off all vCPUs after that further results in no vCPUs can run in guest
1729  * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
1730  * to increase TD epoch to "N + 2").
1731  *
1732  * TDX module will flush EPT on the next TD enter and make vCPUs to run in
1733  * guest mode with TD epoch value "N + 1".
1734  *
1735  * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1736  * waiting empty IPI handler ack_kick().
1737  *
1738  * No action is required to the vCPUs being kicked off since the kicking off
1739  * occurs certainly after TD epoch increment and before the next
1740  * tdh_mem_track().
1741  */
static void tdx_track(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;

	/* If TD isn't finalized, it's before any vcpu running. */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return;

	/*
	 * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
	 * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
	 * tracking epoch hasn't completed.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);

	/* Bump the TD epoch; retried with vCPUs kicked out on contention. */
	err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
	TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);

	/* Force all vCPUs out of guest mode so stale TLB entries are gone. */
	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
1763 
/*
 * Free an S-EPT page-table page; only reached during TD teardown after the
 * HKID has been released.
 */
static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, void *private_spt)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	/*
	 * free_external_spt() is only called after hkid is freed when TD is
	 * tearing down.
	 * KVM doesn't (yet) zap page table pages in mirror page table while
	 * TD is active, though guest pages mapped in mirror page table could be
	 * zapped during TD is active, e.g. for shared <-> private conversion
	 * and slot move/deletion.
	 */
	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
		return -EIO;

	/*
	 * The HKID assigned to this TD was already freed and cache was
	 * already flushed. We don't have to flush again.
	 */
	return tdx_reclaim_page(virt_to_page(private_spt));
}
1786 
/*
 * Remove a private guest page from the S-EPT: BLOCK the range, perform TLB
 * tracking, REMOVE the page, then WBINVD + reset it so it can be reused.
 * The sequence is mandated by the TDX module and must not be reordered.
 */
static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
					 enum pg_level level, u64 mirror_spte)
{
	struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 err, entry, level_state;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * HKID is released after all private pages have been removed, and set
	 * before any might be populated. Warn if zapping is attempted when
	 * there can't be anything populated in the private EPT.
	 */
	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
		return;

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return;

	err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
			      tdx_level, &entry, &level_state);
	if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
		return;

	/*
	 * TDX requires TLB tracking before dropping private page.  Do
	 * it here, although it is also done later.
	 */
	tdx_track(kvm);

	/*
	 * When zapping private page, write lock is held. So no race condition
	 * with other vcpu sept operation.
	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
	 */
	err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
			      tdx_level, &entry, &level_state);
	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
		return;

	/* Flush cachelines tagged with the TD's HKID before reuse. */
	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
	if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
		return;

	tdx_quirk_reset_page(page);
}
1837 
1838 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1839 			   int trig_mode, int vector)
1840 {
1841 	struct kvm_vcpu *vcpu = apic->vcpu;
1842 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1843 
1844 	/* TDX supports only posted interrupt.  No lapic emulation. */
1845 	__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1846 
1847 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1848 }
1849 
1850 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1851 {
1852 	u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1853 	u64 eq = vmx_get_exit_qual(vcpu);
1854 
1855 	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1856 		return false;
1857 
1858 	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1859 }
1860 
/*
 * Handle an EPT violation TD exit.  Private-GPA faults are retried locally
 * (without re-entering the guest) to minimize TDH.VP.ENTER invocations and
 * thus avoid the TDX module's zero-step mitigation.
 */
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qual;
	gpa_t gpa = to_tdx(vcpu)->exit_gpa;
	bool local_retry = false;
	int ret;

	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
		/* Access to unaccepted private memory is fatal for the VM. */
		if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
				gpa, vcpu->vcpu_id);
			kvm_vm_dead(vcpu->kvm);
			return -EIO;
		}
		/*
		 * Always treat SEPT violations as write faults.  Ignore the
		 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
		 * TD private pages are always RWX in the SEPT tables,
		 * i.e. they're always mapped writable.  Just as importantly,
		 * treating SEPT violations as write faults is necessary to
		 * avoid COW allocations, which will cause TDAUGPAGE failures
		 * due to aliasing a single HPA to multiple GPAs.
		 */
		exit_qual = EPT_VIOLATION_ACC_WRITE;

		/* Only private GPA triggers zero-step mitigation */
		local_retry = true;
	} else {
		exit_qual = vmx_get_exit_qual(vcpu);
		/*
		 * EPT violation due to instruction fetch should never be
		 * triggered from shared memory in TDX guest.  If such EPT
		 * violation occurs, treat it as broken hardware.
		 */
		if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
			return -EIO;
	}

	trace_kvm_page_fault(vcpu, gpa, exit_qual);

	/*
	 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
	 * mapping in TDX.
	 *
	 * KVM may return RET_PF_RETRY for private GPA due to
	 * - contentions when atomically updating SPTEs of the mirror page table
	 * - in-progress GFN invalidation or memslot removal.
	 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
	 *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
	 *   or certain TDCALLs.
	 *
	 * If TDH.VP.ENTER is invoked more times than the threshold set by the
	 * TDX module before KVM resolves the private GPA mapping, the TDX
	 * module will activate zero-step mitigation during TDH.VP.ENTER. This
	 * process acquires an SEPT tree lock in the TDX module, leading to
	 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
	 * operations on other vCPUs.
	 *
	 * Breaking out of local retries for kvm_vcpu_has_events() is for
	 * interrupt injection. kvm_vcpu_has_events() should not see pending
	 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
	 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
	 * the guest even if the IRQ/NMI can't be delivered.
	 *
	 * Note: even without breaking out of local retries, zero-step
	 * mitigation may still occur due to
	 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
	 * - a single RIP causing EPT violations for more GFNs than the
	 *   threshold count.
	 * This is safe, as triggering zero-step mitigation only introduces
	 * contentions to page installation SEAMCALLs on other vCPUs, which will
	 * handle retries locally in their EPT violation handlers.
	 */
	while (1) {
		struct kvm_memory_slot *slot;

		ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);

		if (ret != RET_PF_RETRY || !local_retry)
			break;

		if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
			break;

		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
			ret = -EIO;
			break;
		}

		/*
		 * Bail if the memslot is invalid, i.e. is being deleted, as
		 * faulting in will never succeed and this task needs to drop
		 * SRCU in order to let memslot deletion complete.
		 */
		slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
		if (slot && slot->flags & KVM_MEMSLOT_INVALID)
			break;

		cond_resched();
	}
	return ret;
}
1963 
1964 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1965 {
1966 	if (err) {
1967 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1968 		return 1;
1969 	}
1970 
1971 	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
1972 		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
1973 
1974 	return 1;
1975 }
1976 
1977 
/*
 * Handle a TD exit after returning from tdh_vp_enter().
 *
 * Returns 1 to resume the guest, 0 to exit to userspace (with vcpu->run
 * filled in), or a negative errno on unrecoverable error.
 */
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 vp_enter_ret = tdx->vp_enter_ret;
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);

	/* Exits already handled in the fastpath just resume the guest. */
	if (fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
		KVM_BUG_ON(1, vcpu->kvm);
		return -EIO;
	}

	/*
	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
	 * TDX_SEAMCALL_VMFAILINVALID.
	 */
	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
		KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
		goto unhandled_exit;
	}

	if (unlikely(tdx_failed_vmentry(vcpu))) {
		/*
		 * If the guest state is protected, that means off-TD debug is
		 * not enabled, TDX_NON_RECOVERABLE must be set.
		 */
		WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
				!(vp_enter_ret & TDX_NON_RECOVERABLE));
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
		exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
		kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
		goto unhandled_exit;
	}

	/* From here on, the SEAMCALL status is expected to be TDX_SUCCESS. */
	WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
		     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);

	switch (exit_reason.basic) {
	case EXIT_REASON_TRIPLE_FAULT:
		vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
		vcpu->mmio_needed = 0;
		return 0;
	case EXIT_REASON_EXCEPTION_NMI:
		return tdx_handle_exception_nmi(vcpu);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		++vcpu->stat.irq_exits;
		return 1;
	case EXIT_REASON_CPUID:
		return tdx_emulate_cpuid(vcpu);
	case EXIT_REASON_HLT:
		return kvm_emulate_halt_noskip(vcpu);
	case EXIT_REASON_TDCALL:
		return handle_tdvmcall(vcpu);
	case EXIT_REASON_VMCALL:
		return tdx_emulate_vmcall(vcpu);
	case EXIT_REASON_IO_INSTRUCTION:
		return tdx_emulate_io(vcpu);
	case EXIT_REASON_MSR_READ:
		/* The MSR index arrives in the TDVMCALL argument r12. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		return kvm_emulate_rdmsr(vcpu);
	case EXIT_REASON_MSR_WRITE:
		/* Split the 64-bit value in r13 into EDX:EAX for WRMSR emulation. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
		return kvm_emulate_wrmsr(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		return tdx_emulate_mmio(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return tdx_handle_ept_violation(vcpu);
	case EXIT_REASON_OTHER_SMI:
		/*
		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
		 * TD guest vCPU is running) will cause VM exit to TDX module,
		 * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
		 * and handled by kernel handler right away.
		 *
		 * The Other SMI exit can also be caused by the SEAM non-root
		 * machine check delivered via Machine Check System Management
		 * Interrupt (MSMI), but it has already been handled by the
		 * kernel machine check handler, i.e., the memory page has been
		 * marked as poisoned and it won't be freed to the free list
		 * when the TDX guest is terminated (the TDX module marks the
		 * guest as dead and prevent it from further running when
		 * machine check happens in SEAM non-root).
		 *
		 * - A MSMI will not reach here, it's handled as non_recoverable
		 *   case above.
		 * - If it's not an MSMI, no need to do anything here.
		 */
		return 1;
	default:
		break;
	}

unhandled_exit:
	kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
	return 0;
}
2084 
2085 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2086 		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2087 {
2088 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2089 
2090 	*reason = tdx->vt.exit_reason.full;
2091 	if (*reason != -1u) {
2092 		*info1 = vmx_get_exit_qual(vcpu);
2093 		*info2 = tdx->ext_exit_qualification;
2094 		*intr_info = vmx_get_intr_info(vcpu);
2095 	} else {
2096 		*info1 = 0;
2097 		*info2 = 0;
2098 		*intr_info = 0;
2099 	}
2100 
2101 	*error_code = 0;
2102 }
2103 
2104 bool tdx_has_emulated_msr(u32 index)
2105 {
2106 	switch (index) {
2107 	case MSR_IA32_UCODE_REV:
2108 	case MSR_IA32_ARCH_CAPABILITIES:
2109 	case MSR_IA32_POWER_CTL:
2110 	case MSR_IA32_CR_PAT:
2111 	case MSR_MTRRcap:
2112 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2113 	case MSR_MTRRdefType:
2114 	case MSR_IA32_TSC_DEADLINE:
2115 	case MSR_IA32_MISC_ENABLE:
2116 	case MSR_PLATFORM_INFO:
2117 	case MSR_MISC_FEATURES_ENABLES:
2118 	case MSR_IA32_APICBASE:
2119 	case MSR_EFER:
2120 	case MSR_IA32_FEAT_CTL:
2121 	case MSR_IA32_MCG_CAP:
2122 	case MSR_IA32_MCG_STATUS:
2123 	case MSR_IA32_MCG_CTL:
2124 	case MSR_IA32_MCG_EXT_CTL:
2125 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2126 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2127 		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2128 	case MSR_KVM_POLL_CONTROL:
2129 		return true;
2130 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2131 		/*
2132 		 * x2APIC registers that are virtualized by the CPU can't be
2133 		 * emulated, KVM doesn't have access to the virtual APIC page.
2134 		 */
2135 		switch (index) {
2136 		case X2APIC_MSR(APIC_TASKPRI):
2137 		case X2APIC_MSR(APIC_PROCPRI):
2138 		case X2APIC_MSR(APIC_EOI):
2139 		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2140 		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2141 		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2142 			return false;
2143 		default:
2144 			return true;
2145 		}
2146 	default:
2147 		return false;
2148 	}
2149 }
2150 
2151 static bool tdx_is_read_only_msr(u32 index)
2152 {
2153 	return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
2154 		index == MSR_IA32_FEAT_CTL;
2155 }
2156 
2157 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2158 {
2159 	switch (msr->index) {
2160 	case MSR_IA32_FEAT_CTL:
2161 		/*
2162 		 * MCE and MCA are advertised via cpuid. Guest kernel could
2163 		 * check if LMCE is enabled or not.
2164 		 */
2165 		msr->data = FEAT_CTL_LOCKED;
2166 		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2167 			msr->data |= FEAT_CTL_LMCE_ENABLED;
2168 		return 0;
2169 	case MSR_IA32_MCG_EXT_CTL:
2170 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2171 			return 1;
2172 		msr->data = vcpu->arch.mcg_ext_ctl;
2173 		return 0;
2174 	default:
2175 		if (!tdx_has_emulated_msr(msr->index))
2176 			return 1;
2177 
2178 		return kvm_get_msr_common(vcpu, msr);
2179 	}
2180 }
2181 
2182 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2183 {
2184 	switch (msr->index) {
2185 	case MSR_IA32_MCG_EXT_CTL:
2186 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2187 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2188 			return 1;
2189 		vcpu->arch.mcg_ext_ctl = msr->data;
2190 		return 0;
2191 	default:
2192 		if (tdx_is_read_only_msr(msr->index))
2193 			return 1;
2194 
2195 		if (!tdx_has_emulated_msr(msr->index))
2196 			return 1;
2197 
2198 		return kvm_set_msr_common(vcpu, msr);
2199 	}
2200 }
2201 
/*
 * KVM_TDX_CAPABILITIES: report the TDX module's supported attributes, XFAM
 * and directly-configurable CPUID leaves to userspace.
 */
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
	struct kvm_tdx_capabilities __user *user_caps;
	struct kvm_tdx_capabilities *caps = NULL;
	u32 nr_user_entries;
	int ret = 0;

	/* flags is reserved for future use */
	if (cmd->flags)
		return -EINVAL;

	user_caps = u64_to_user_ptr(cmd->data);
	if (get_user(nr_user_entries, &user_caps->cpuid.nent))
		return -EFAULT;

	/* Userspace's buffer must hold all of the module's CPUID configs. */
	if (nr_user_entries < td_conf->num_cpuid_config)
		return -E2BIG;

	caps = kzalloc_flex(*caps, cpuid.entries, td_conf->num_cpuid_config);
	if (!caps)
		return -ENOMEM;

	ret = init_kvm_tdx_caps(td_conf, caps);
	if (ret)
		goto out;

	if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
						      caps->cpuid.nent))) {
		ret = -EFAULT;
		goto out;
	}

out:
	/* kfree() accepts NULL. */
	kfree(caps);
	return ret;
}
2240 
2241 /*
2242  * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2243  * similar to TDX's GPAW. Use this field as the interface for userspace to
2244  * configure the GPAW and EPT level for TDs.
2245  *
2246  * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2247  * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2248  * supported. Value 52 is only supported when the platform supports 5 level
2249  * EPT.
2250  */
2251 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2252 					struct td_params *td_params)
2253 {
2254 	const struct kvm_cpuid_entry2 *entry;
2255 	int guest_pa;
2256 
2257 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2258 	if (!entry)
2259 		return -EINVAL;
2260 
2261 	guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2262 
2263 	if (guest_pa != 48 && guest_pa != 52)
2264 		return -EINVAL;
2265 
2266 	if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2267 		return -EINVAL;
2268 
2269 	td_params->eptp_controls = VMX_EPTP_MT_WB;
2270 	if (guest_pa == 52) {
2271 		td_params->eptp_controls |= VMX_EPTP_PWL_5;
2272 		td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2273 	} else {
2274 		td_params->eptp_controls |= VMX_EPTP_PWL_4;
2275 	}
2276 
2277 	return 0;
2278 }
2279 
/*
 * Copy userspace's CPUID entries into td_params->cpuid_values, in the slot
 * order dictated by the TDX module's CPUID configs.  Fails if any userspace
 * entry is unsupported or has no corresponding module slot.
 */
static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
				 struct td_params *td_params)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
	const struct kvm_cpuid_entry2 *entry;
	struct tdx_cpuid_value *value;
	int i, copy_cnt = 0;

	/*
	 * td_params.cpuid_values: the number and the order of cpuid_value
	 * entries must match struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
	 * It's assumed that td_params was zeroed.
	 */
	for (i = 0; i < td_conf->num_cpuid_config; i++) {
		struct kvm_cpuid_entry2 tmp;

		td_init_cpuid_entry2(&tmp, i);

		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
					      tmp.function, tmp.index);
		if (!entry)
			continue;

		if (tdx_unsupported_cpuid(entry))
			return -EINVAL;

		copy_cnt++;

		value = &td_params->cpuid_values[i];
		value->eax = entry->eax;
		value->ebx = entry->ebx;
		value->ecx = entry->ecx;
		value->edx = entry->edx;

		/*
		 * TDX module does not accept nonzero bits 16..23 for the
		 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
		 */
		if (tmp.function == 0x80000008)
			value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
	}

	/*
	 * Rely on the TDX module to reject invalid configurations, but it
	 * can't check leaves that don't have a proper slot in
	 * td_params->cpuid_values to stick them in.  So fail if there were
	 * entries that didn't get copied to td_params.
	 */
	if (copy_cnt != cpuid->nent)
		return -EINVAL;

	return 0;
}
2333 
/*
 * Translate userspace's KVM_TDX_INIT_VM arguments into the TD_PARAMS
 * structure consumed by TDH.MNG.INIT.  Must run before any vCPU is created;
 * validates attributes and XFAM against the TDX module's capabilities.
 */
static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
			struct kvm_tdx_init_vm *init_vm)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
	int ret;

	if (kvm->created_vcpus)
		return -EBUSY;

	if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
		return -EINVAL;

	if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
		return -EINVAL;

	/* Force the module's fixed-1 bits on in addition to userspace's request. */
	td_params->max_vcpus = kvm->max_vcpus;
	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;

	td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);

	ret = setup_tdparams_eptp_controls(cpuid, td_params);
	if (ret)
		return ret;

	ret = setup_tdparams_cpuids(cpuid, td_params);
	if (ret)
		return ret;

/* Copy between two fields, enforcing at build time that their sizes match. */
#define MEMCPY_SAME_SIZE(dst, src)				\
	do {							\
		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
		memcpy((dst), (src), sizeof(dst));		\
	} while (0)

	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);

	return 0;
}
2377 
/*
 * Create the TD control structures (TDR + TDCS pages), program the TD's
 * private host key id (HKID) on all packages, and initialize the TD with
 * @td_params via TDH.MNG.INIT.
 *
 * On TDX_OPERAND_INVALID from TDH.MNG.INIT, returns -EINVAL and stores the
 * raw SEAMCALL error in @seamcall_err as a hint for userspace; otherwise
 * @seamcall_err is left at 0.
 */
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
			 u64 *seamcall_err)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages;
	struct page **tdcs_pages = NULL;
	struct page *tdr_page;
	int ret, i;
	u64 err, rcx;

	*seamcall_err = 0;
	ret = tdx_guest_keyid_alloc();
	if (ret < 0)
		return ret;
	kvm_tdx->hkid = ret;
	kvm_tdx->misc_cg = get_current_misc_cg();
	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	if (ret)
		goto free_hkid;

	/* Default error for the allocation failures below. */
	ret = -ENOMEM;

	atomic_inc(&nr_configured_hkid);

	tdr_page = alloc_page(GFP_KERNEL);
	if (!tdr_page)
		goto free_hkid;

	kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
	/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
	kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
	tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages,
				  kvm_tdx->td.tdcs_nr_pages);
	if (!tdcs_pages)
		goto free_tdr;

	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		tdcs_pages[i] = alloc_page(GFP_KERNEL);
		if (!tdcs_pages[i])
			goto free_tdcs;
	}

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		goto free_tdcs;

	cpus_read_lock();

	/*
	 * Need at least one CPU of the package to be online in order to
	 * program all packages for host key id.  Check it.
	 */
	for_each_present_cpu(i)
		cpumask_set_cpu(topology_physical_package_id(i), packages);
	for_each_online_cpu(i)
		cpumask_clear_cpu(topology_physical_package_id(i), packages);
	if (!cpumask_empty(packages)) {
		ret = -EIO;
		/*
		 * Because it's hard for human operator to figure out the
		 * reason, warn it.
		 */
#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
		pr_warn_ratelimited(MSG_ALLPKG);
		goto free_packages;
	}

	/*
	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
	 * lock to prevent it from failure.
	 */
	mutex_lock(&tdx_lock);
	kvm_tdx->td.tdr_page = tdr_page;
	err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
	mutex_unlock(&tdx_lock);

	if (err == TDX_RND_NO_ENTROPY) {
		ret = -EAGAIN;
		goto free_packages;
	}

	if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
		ret = -EIO;
		goto free_packages;
	}

	for_each_online_cpu(i) {
		int pkg = topology_physical_package_id(i);

		if (cpumask_test_and_set_cpu(pkg, packages))
			continue;

		/*
		 * Program the memory controller in the package with an
		 * encryption key associated to a TDX private host key id
		 * assigned to this TDR.  Concurrent operations on same memory
		 * controller results in TDX_OPERAND_BUSY. No locking needed
		 * beyond the cpus_read_lock() above as it serializes against
		 * hotplug and the first online CPU of the package is always
		 * used. We never have two CPUs in the same socket trying to
		 * program the key.
		 */
		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
				      kvm_tdx, true);
		if (ret)
			break;
	}
	cpus_read_unlock();
	free_cpumask_var(packages);
	if (ret) {
		/* No TDCS pages have been added to the TD yet; free them all. */
		i = 0;
		goto teardown;
	}

	kvm_tdx->td.tdcs_pages = tdcs_pages;
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
		if (err == TDX_RND_NO_ENTROPY) {
			/* Here it's hard to allow userspace to retry. */
			ret = -EAGAIN;
			goto teardown;
		}
		if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
			ret = -EIO;
			goto teardown;
		}
	}

	err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
		/*
		 * Because a user gives operands, don't warn.
		 * Return a hint to the user because it's sometimes hard for the
		 * user to figure out which operand is invalid.  SEAMCALL status
		 * code includes which operand caused invalid operand error.
		 */
		*seamcall_err = err;
		ret = -EINVAL;
		goto teardown;
	} else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
		ret = -EIO;
		goto teardown;
	}

	return 0;

	/*
	 * The sequence for freeing resources from a partially initialized TD
	 * varies based on where in the initialization flow failure occurred.
	 * Simply use the full teardown and destroy, which naturally play nice
	 * with partial initialization.
	 */
teardown:
	/* Only free pages not yet added, so start at 'i' */
	for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i]) {
			__free_page(tdcs_pages[i]);
			tdcs_pages[i] = NULL;
		}
	}
	/* If ownership was transferred above, the reclaim path frees the array. */
	if (!kvm_tdx->td.tdcs_pages)
		kfree(tdcs_pages);

	tdx_mmu_release_hkid(kvm);
	tdx_reclaim_td_control_pages(kvm);

	return ret;

free_packages:
	cpus_read_unlock();
	free_cpumask_var(packages);

free_tdcs:
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i])
			__free_page(tdcs_pages[i]);
	}
	kfree(tdcs_pages);
	kvm_tdx->td.tdcs_pages = NULL;

free_tdr:
	if (tdr_page)
		__free_page(tdr_page);
	kvm_tdx->td.tdr_page = NULL;

free_hkid:
	tdx_hkid_free(kvm_tdx);

	return ret;
}
2568 
2569 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2570 				      u64 *data)
2571 {
2572 	u64 err;
2573 
2574 	err = tdh_mng_rd(&tdx->td, field_id, data);
2575 
2576 	return err;
2577 }
2578 
/*
 * Leaf/sub-leaf number bits that cannot be encoded into a TDX metadata
 * field id; tdx_read_cpuid() rejects inputs with any of these bits set.
 */
#define TDX_MD_UNREADABLE_LEAF_MASK	GENMASK(30, 7)
#define TDX_MD_UNREADABLE_SUBLEAF_MASK	GENMASK(31, 7)
2581 
/*
 * Read the TDX module's value of a guest CPUID leaf/sub-leaf via TD-scope
 * metadata and store it in @out, incrementing *entry_index on success.
 *
 * Returns -EINVAL for leaf/sub-leaf numbers that can't be encoded into a
 * metadata field id (or when the entry table is full), -EIO on SEAMCALL
 * failure (with @out's register values zeroed), 0 on success.
 */
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
			  bool sub_leaf_set, int *entry_index,
			  struct kvm_cpuid_entry2 *out)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
	u64 ebx_eax, edx_ecx;
	u64 err = 0;

	/* Only 7 bits of sub-leaf index are encodable; see the layout below. */
	if (sub_leaf > 0b1111111)
		return -EINVAL;

	if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
		return -EINVAL;

	if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
	    sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
		return -EINVAL;

	/*
	 * bit 23:17, RESERVED: reserved, must be 0;
	 * bit 16,    LEAF_31: leaf number bit 31;
	 * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
	 *                      implicitly 0;
	 * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
	 * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
	 *                         the SUBLEAF_6_0 is all-1.
	 *                         sub-leaf bits 31:7 are implicitly 0;
	 * bit 0,     ELEMENT_I: Element index within field;
	 */
	field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
	field_id |= (leaf & 0x7f) << 9;
	if (sub_leaf_set)
		field_id |= (sub_leaf & 0x7f) << 1;
	else
		field_id |= 0x1fe;

	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
	if (err) //TODO check for specific errors
		goto err_out;

	out->eax = (u32) ebx_eax;
	out->ebx = (u32) (ebx_eax >> 32);

	/* The companion field (element index 1) holds EDX:ECX. */
	field_id++;
	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
	/*
	 * It's weird that reading edx_ecx fails while reading ebx_eax
	 * succeeded.
	 */
	if (WARN_ON_ONCE(err))
		goto err_out;

	out->ecx = (u32) edx_ecx;
	out->edx = (u32) (edx_ecx >> 32);

	out->function = leaf;
	out->index = sub_leaf;
	out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;

	/*
	 * Work around missing support on old TDX modules, fetch
	 * guest maxpa from gfn_direct_bits.
	 */
	if (leaf == 0x80000008) {
		gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
		unsigned int g_maxpa = __ffs(gpa_bits) + 1;

		out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
	}

	(*entry_index)++;

	return 0;

err_out:
	out->eax = 0;
	out->ebx = 0;
	out->ecx = 0;
	out->edx = 0;

	return -EIO;
}
2665 
/* Opaque token returned by tdx_acquire_vm_state_locks() on success. */
typedef void *tdx_vm_state_guard_t;

/*
 * Acquire the full lock set protecting TD-wide state changes: kvm->lock,
 * every vcpu->mutex, and kvm->slots_lock.  Fails with -EBUSY if a vCPU is
 * still being created (created_vcpus != online_vcpus).
 *
 * Returns @kvm as the guard token on success, or an ERR_PTR() on failure.
 */
static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
{
	int r;

	mutex_lock(&kvm->lock);

	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
		r = -EBUSY;
		goto out_err;
	}

	r = kvm_lock_all_vcpus(kvm);
	if (r)
		goto out_err;

	/*
	 * Note the unintuitive ordering!  vcpu->mutex must be taken outside
	 * kvm->slots_lock!
	 */
	mutex_lock(&kvm->slots_lock);
	return kvm;

out_err:
	mutex_unlock(&kvm->lock);
	return ERR_PTR(r);
}
2694 
/* Drop the locks taken by tdx_acquire_vm_state_locks(), in reverse order. */
static void tdx_release_vm_state_locks(struct kvm *kvm)
{
	mutex_unlock(&kvm->slots_lock);
	kvm_unlock_all_vcpus(kvm);
	mutex_unlock(&kvm->lock);
}
2701 
/*
 * Scope-based guard: acquires the VM-state lock set on construction and,
 * if acquisition succeeded (non-ERR_PTR token), releases it when the guard
 * goes out of scope.
 */
DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
	     if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
	     tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
2705 
/*
 * KVM_TDX_INIT_VM: validate userspace's parameters, initialize the TD via
 * TDH.MNG.INIT, and cache the resulting TD-wide configuration.  On
 * TDX_OPERAND_INVALID the raw SEAMCALL error is reported via cmd->hw_error.
 */
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_vm *init_vm;
	struct td_params *td_params = NULL;
	u32 nr_user_entries;
	int ret;

	/* Sanity-check the ABI-visible structure sizes. */
	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
	BUILD_BUG_ON(sizeof(struct td_params) != 1024);

	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
		return -EINVAL;

	if (cmd->flags)
		return -EINVAL;

	if (get_user(nr_user_entries, &user_data->cpuid.nent))
		return -EFAULT;

	if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	init_vm = memdup_user(user_data,
			      struct_size(user_data, cpuid.entries, nr_user_entries));
	if (IS_ERR(init_vm))
		return PTR_ERR(init_vm);

	/* Reserved fields and padding must be zero. */
	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
		ret = -EINVAL;
		goto out;
	}

	if (init_vm->cpuid.padding) {
		ret = -EINVAL;
		goto out;
	}

	td_params = kzalloc_obj(struct td_params);
	if (!td_params) {
		ret = -ENOMEM;
		goto out;
	}

	ret = setup_tdparams(kvm, td_params, init_vm);
	if (ret)
		goto out;

	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
	if (ret)
		goto out;

	/* Cache TD-wide values that are fixed after TDH.MNG.INIT. */
	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
	kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
	kvm_tdx->attributes = td_params->attributes;
	kvm_tdx->xfam = td_params->xfam;

	if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
	else
		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;

	kvm_tdx->state = TD_STATE_INITIALIZED;
out:
	/* kfree() accepts NULL. */
	kfree(init_vm);
	kfree(td_params);

	return ret;
}
2777 
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	/*
	 * flush_tlb_current() is invoked the first time the vcpu runs or
	 * when the root of the shared EPT is invalidated.
	 * KVM only needs to flush shared EPT because the TDX module handles TLB
	 * invalidation for private EPT in tdh_vp_enter().
	 *
	 * A single context invalidation for shared EPT can be performed here.
	 * However, this single context invalidation requires the private EPTP
	 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
	 * private EPTP as its ASID for TLB invalidation.
	 *
	 * To avoid reading back private EPTP, perform a global invalidation for
	 * shared EPT instead to keep this function simple.
	 */
	ept_sync_global();
}
2796 
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
	 * ensure that private EPT will be flushed on the next TD enter. No need
	 * to call tdx_track() here again even when this callback is a result of
	 * zapping private EPT.
	 *
	 * Due to the lack of the context to determine which EPT has been
	 * affected by zapping, invoke invept() directly here for both shared
	 * EPT and private EPT for simplicity, though it's not necessary for
	 * private EPT.
	 */
	ept_sync_global();
}
2812 
/*
 * KVM_TDX_FINALIZE_VM: finalize the TD's measurement via TDH.MR.FINALIZE
 * and transition the TD to the runnable state.
 */
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
	/* Contention inside the TDX module; let userspace retry. */
	if (tdx_operand_busy(cmd->hw_error))
		return -EBUSY;
	if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
		return -EIO;

	kvm_tdx->state = TD_STATE_RUNNABLE;
	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
	smp_wmb();
	kvm->arch.pre_fault_allowed = true;
	return 0;
}
2832 
2833 static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
2834 {
2835 	if (copy_from_user(cmd, argp, sizeof(*cmd)))
2836 		return -EFAULT;
2837 
2838 	/*
2839 	 * Userspace should never set hw_error.  KVM writes hw_error to report
2840 	 * hardware-defined error back to userspace.
2841 	 */
2842 	if (cmd->hw_error)
2843 		return -EINVAL;
2844 
2845 	return 0;
2846 }
2847 
/*
 * Top-level handler for TDX VM-scope sub-commands.
 *
 * KVM_TDX_CAPABILITIES is handled without taking the VM-state locks; all
 * other sub-commands run under the tdx_vm_state_guard lock set, released
 * automatically when the guard goes out of scope.  The (possibly updated)
 * command struct, e.g. hw_error, is copied back to userspace.
 */
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
	struct kvm_tdx_cmd tdx_cmd;
	int r;

	r = tdx_get_cmd(argp, &tdx_cmd);
	if (r)
		return r;

	if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
		return tdx_get_capabilities(&tdx_cmd);

	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);

	switch (tdx_cmd.id) {
	case KVM_TDX_INIT_VM:
		r = tdx_td_init(kvm, &tdx_cmd);
		break;
	case KVM_TDX_FINALIZE_VM:
		r = tdx_td_finalize(kvm, &tdx_cmd);
		break;
	default:
		/* Unknown sub-commands return without copying tdx_cmd back. */
		return -EINVAL;
	}

	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
		return -EFAULT;

	return r;
}
2880 
2881 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
2882 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2883 {
2884 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2885 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2886 	struct page *page;
2887 	int ret, i;
2888 	u64 err;
2889 
2890 	page = alloc_page(GFP_KERNEL);
2891 	if (!page)
2892 		return -ENOMEM;
2893 	tdx->vp.tdvpr_page = page;
2894 
2895 	/*
2896 	 * page_to_phys() does not work in 'noinstr' code, like guest
2897 	 * entry via tdh_vp_enter(). Precalculate and store it instead
2898 	 * of doing it at runtime later.
2899 	 */
2900 	tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
2901 
2902 	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2903 			       	     GFP_KERNEL);
2904 	if (!tdx->vp.tdcx_pages) {
2905 		ret = -ENOMEM;
2906 		goto free_tdvpr;
2907 	}
2908 
2909 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2910 		page = alloc_page(GFP_KERNEL);
2911 		if (!page) {
2912 			ret = -ENOMEM;
2913 			goto free_tdcx;
2914 		}
2915 		tdx->vp.tdcx_pages[i] = page;
2916 	}
2917 
2918 	err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2919 	if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
2920 		ret = -EIO;
2921 		goto free_tdcx;
2922 	}
2923 
2924 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2925 		err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2926 		if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
2927 			/*
2928 			 * Pages already added are reclaimed by the vcpu_free
2929 			 * method, but the rest are freed here.
2930 			 */
2931 			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2932 				__free_page(tdx->vp.tdcx_pages[i]);
2933 				tdx->vp.tdcx_pages[i] = NULL;
2934 			}
2935 			return -EIO;
2936 		}
2937 	}
2938 
2939 	/*
2940 	 * tdh_vp_init() can take an exclusive lock of the TDR resource inside
2941 	 * the TDX-Module.  The TDR resource is also taken as shared in several
2942 	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
2943 	 * (TDX-Module locks are try-lock implementations with no slow path).
2944 	 * Take mmu_lock for write to reflect the nature of the lock taken by
2945 	 * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
2946 	 * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
2947 	 */
2948 	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
2949 		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2950 		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
2951 			return -EIO;
2952 	}
2953 
2954 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2955 
2956 	return 0;
2957 
2958 free_tdcx:
2959 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2960 		if (tdx->vp.tdcx_pages[i])
2961 			__free_page(tdx->vp.tdcx_pages[i]);
2962 		tdx->vp.tdcx_pages[i] = NULL;
2963 	}
2964 	kfree(tdx->vp.tdcx_pages);
2965 	tdx->vp.tdcx_pages = NULL;
2966 
2967 free_tdvpr:
2968 	if (tdx->vp.tdvpr_page)
2969 		__free_page(tdx->vp.tdvpr_page);
2970 	tdx->vp.tdvpr_page = NULL;
2971 	tdx->vp.tdvpr_pa = 0;
2972 
2973 	return ret;
2974 }
2975 
2976 /* Sometimes reads multipple subleafs. Return how many enties were written. */
2977 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2978 				   struct kvm_cpuid_entry2 *output_e)
2979 {
2980 	int sub_leaf = 0;
2981 	int ret;
2982 
2983 	/* First try without a subleaf */
2984 	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2985 
2986 	/* If success, or invalid leaf, just give up */
2987 	if (ret != -EIO)
2988 		return ret;
2989 
2990 	/*
2991 	 * If the try without a subleaf failed, try reading subleafs until
2992 	 * failure. The TDX module only supports 6 bits of subleaf index.
2993 	 */
2994 	while (1) {
2995 		/* Keep reading subleafs until there is a failure. */
2996 		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
2997 			return !sub_leaf;
2998 
2999 		sub_leaf++;
3000 		output_e++;
3001 	}
3002 
3003 	return 0;
3004 }
3005 
/*
 * Handle KVM_TDX_GET_CPUID: dump the CPUID values the TDX-Module configured
 * for this vCPU back to userspace.
 *
 * Reads leaf 0 / leaf 0x80000000 to learn the max basic/extended leaf, then
 * walks each range via tdx_vcpu_get_cpuid_leaf().  Returns -E2BIG (with the
 * required entry count stored in the output header's nent) if userspace's
 * buffer is too small, -EFAULT on copy failures, -EIO if a max-leaf read
 * fails, 0 on success.
 */
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	struct kvm_cpuid2 __user *output;
	struct kvm_cpuid2 *td_cpuid;
	int r = 0, i = 0, leaf;
	u32 level;

	output = u64_to_user_ptr(cmd->data);
	/* Worst-case allocation; 'i' tracks how many entries get filled. */
	td_cpuid = kzalloc(sizeof(*td_cpuid) +
			sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
			GFP_KERNEL);
	if (!td_cpuid)
		return -ENOMEM;

	/* Fetch the user's header to learn the buffer capacity (nent). */
	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
		r = -EFAULT;
		goto out;
	}

	/* Read max CPUID for normal range */
	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
		r = -EIO;
		goto out;
	}
	level = td_cpuid->entries[0].eax;

	/* Failures on individual leaves are ignored; such leaves are skipped. */
	for (leaf = 1; leaf <= level; leaf++)
		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);

	/* Read max CPUID for extended range */
	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
		r = -EIO;
		goto out;
	}
	/* entries[i - 1] is the last entry written, i.e. from leaf 0x80000000. */
	level = td_cpuid->entries[i - 1].eax;

	for (leaf = 0x80000001; leaf <= level; leaf++)
		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);

	/* Report the required entry count even if the buffer was too small. */
	if (td_cpuid->nent < i)
		r = -E2BIG;
	td_cpuid->nent = i;

	/* Always write the header back so userspace sees the real nent. */
	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
		r = -EFAULT;
		goto out;
	}

	/* On -E2BIG, skip copying entries; userspace must retry with more room. */
	if (r == -E2BIG)
		goto out;

	if (copy_to_user(output->entries, td_cpuid->entries,
			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		r = -EFAULT;

out:
	kfree(td_cpuid);

	return r;
}
3066 
/*
 * Handle KVM_TDX_INIT_VCPU: create and initialize TDX-specific vCPU state
 * via tdx_td_vcpu_init() and enable posted interrupts in the TD VMCS.
 *
 * cmd->data carries the 64-bit value handed to the guest in RCX (see
 * tdx_td_vcpu_init()).  Returns -EINVAL on bad flags, wrong vCPU state, or
 * APIC base rejection; otherwise the result of tdx_td_vcpu_init().
 */
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	u64 apic_base;
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int ret;

	/* No flags are currently defined for KVM_TDX_INIT_VCPU. */
	if (cmd->flags)
		return -EINVAL;

	/* Reject double-initialization. */
	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
		return -EINVAL;

	/*
	 * TDX requires X2APIC, userspace is responsible for configuring guest
	 * CPUID accordingly.
	 */
	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
	if (kvm_apic_set_base(vcpu, apic_base, true))
		return -EINVAL;

	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
	if (ret)
		return ret;

	/* Point the TD VMCS at this vCPU's posted-interrupt descriptor. */
	td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
	td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
	td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);

	tdx->state = VCPU_TD_STATE_INITIALIZED;

	return 0;
}
3100 
/* vCPU RESET/INIT hook: deliberately a nop for TDX, see below. */
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	/*
	 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
	 * INIT events.
	 *
	 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
	 * userspace needs to define the vCPU model before KVM can initialize
	 * vCPU state, e.g. to enable x2APIC.
	 */
	WARN_ON_ONCE(init_event);
}
3113 
/* Context passed from tdx_vcpu_init_mem_region() to tdx_gmem_post_populate(). */
struct tdx_gmem_post_populate_arg {
	struct kvm_vcpu *vcpu;	/* vCPU issuing KVM_TDX_INIT_MEM_REGION */
	__u32 flags;		/* cmd flags, e.g. KVM_TDX_MEASURE_MEMORY_REGION */
};
3118 
/*
 * guest_memfd post-populate callback: map the private pfn for @gfn, staging
 * @src_page in kvm_tdx->page_add_src so the S-EPT mapping path can consume
 * its contents, then optionally extend the TD measurement over the page
 * (TDH.MR.EXTEND) if KVM_TDX_MEASURE_MEMORY_REGION was requested.
 */
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				  struct page *src_page, void *_arg)
{
	struct tdx_gmem_post_populate_arg *arg = _arg;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err, entry, level_state;
	gpa_t gpa = gfn_to_gpa(gfn);
	int ret, i;

	/* A stale staged source page would indicate a KVM bug. */
	if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
		return -EIO;

	/* Populating initial memory requires a source page. */
	if (!src_page)
		return -EOPNOTSUPP;

	/* Stage the source page, map the pfn, then unstage unconditionally. */
	kvm_tdx->page_add_src = src_page;
	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
	kvm_tdx->page_add_src = NULL;

	if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
		return ret;

	/*
	 * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
	 * between mapping the pfn and now, but slots_lock prevents memslot
	 * updates, filemap_invalidate_lock() prevents guest_memfd updates,
	 * mmu_notifier events can't reach S-EPT entries, and KVM's internal
	 * zapping flows are mutually exclusive with S-EPT mappings.
	 */
	for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
		err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
		if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
			return -EIO;
	}

	return 0;
}
3156 
/*
 * Handle KVM_TDX_INIT_MEM_REGION: populate a range of private guest memory
 * from userspace source pages, one page per kvm_gmem_populate() call,
 * optionally extending the TD measurement (KVM_TDX_MEASURE_MEMORY_REGION).
 *
 * The (possibly partially consumed) region descriptor is written back to
 * userspace on exit so progress is visible even on -EINTR.
 */
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_mem_region region;
	struct tdx_gmem_post_populate_arg arg;
	long gmem_ret;
	int ret;

	/* The vCPU must have gone through KVM_TDX_INIT_VCPU first. */
	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
		return -EINVAL;

	/* Once TD is finalized, the initial guest memory is fixed. */
	if (kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
		return -EINVAL;

	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
		return -EFAULT;

	/*
	 * Sanity check the region: page-aligned, non-empty, no gpa wraparound,
	 * and entirely within the TD's private GPA space.
	 */
	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
	    !region.nr_pages ||
	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = 0;
	while (region.nr_pages) {
		/* The loop can be long-running; allow userspace to interrupt. */
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		arg = (struct tdx_gmem_post_populate_arg) {
			.vcpu = vcpu,
			.flags = cmd->flags,
		};
		/* Populate exactly one page per call, see tdx_gmem_post_populate(). */
		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
					     u64_to_user_ptr(region.source_addr),
					     1, tdx_gmem_post_populate, &arg);
		if (gmem_ret < 0) {
			ret = gmem_ret;
			break;
		}

		if (gmem_ret != 1) {
			ret = -EIO;
			break;
		}

		/* Advance the descriptor to the next page. */
		region.source_addr += PAGE_SIZE;
		region.gpa += PAGE_SIZE;
		region.nr_pages--;

		cond_resched();
	}

	/* Report progress back to userspace, even on failure. */
	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
		ret = -EFAULT;
	return ret;
}
3222 
/*
 * Dispatch vCPU-scoped TDX commands on the "unlocked" ioctl path —
 * presumably invoked without vcpu->mutex held by the caller, hence the
 * explicit vcpu_load()/vcpu_put() (NOTE(review): confirm against the
 * generic ioctl dispatcher).
 */
int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_cmd cmd;
	int r;

	r = tdx_get_cmd(argp, &cmd);
	if (r)
		return r;

	/* Serialize against other TDX state-changing ioctls. */
	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);

	/* Reject before the TD's HKID is assigned or once it is finalized. */
	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	vcpu_load(vcpu);

	switch (cmd.id) {
	case KVM_TDX_INIT_MEM_REGION:
		r = tdx_vcpu_init_mem_region(vcpu, &cmd);
		break;
	case KVM_TDX_INIT_VCPU:
		r = tdx_vcpu_init(vcpu, &cmd);
		break;
	default:
		r = -ENOIOCTLCMD;
		break;
	}

	vcpu_put(vcpu);

	return r;
}
3259 
/*
 * Dispatch vCPU-scoped TDX commands on the regular vCPU ioctl path;
 * currently only KVM_TDX_GET_CPUID is handled here.
 */
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct kvm_tdx_cmd cmd;
	int ret;

	/* Reject before the TD's HKID is assigned or once it is finalized. */
	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	ret = tdx_get_cmd(argp, &cmd);
	if (ret)
		return ret;

	switch (cmd.id) {
	case KVM_TDX_GET_CPUID:
		ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
3284 
3285 int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3286 {
3287 	if (!is_private)
3288 		return 0;
3289 
3290 	return PG_LEVEL_4K;
3291 }
3292 
/*
 * cpuhp startup callback: run tdx_cpu_enable() (with IRQs disabled) on a
 * CPU coming online so it is ready for TDX guests.
 */
static int tdx_online_cpu(unsigned int cpu)
{
	unsigned long flags;
	int r;

	/* Sanity check CPU is already in post-VMXON */
	WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));

	local_irq_save(flags);
	r = tdx_cpu_enable();
	local_irq_restore(flags);

	return r;
}
3307 
/*
 * cpuhp teardown callback: refuse (-EBUSY) to offline the last online CPU
 * of a package while any TDX private HKID is configured, see below.
 */
static int tdx_offline_cpu(unsigned int cpu)
{
	int i;

	/* No TD is running.  Allow any cpu to be offline. */
	if (!atomic_read(&nr_configured_hkid))
		return 0;

	/*
	 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
	 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
	 * controller with pconfig.  If we have active TDX HKID, refuse to
	 * offline the last online cpu.
	 */
	for_each_online_cpu(i) {
		/*
		 * Found another online cpu on the same package.
		 * Allow to offline.
		 */
		if (i != cpu && topology_physical_package_id(i) ==
				topology_physical_package_id(cpu))
			return 0;
	}

	/*
	 * This is the last cpu of this package.  Don't offline it.
	 *
	 * Because it's hard for human operator to understand the
	 * reason, warn it.
	 */
#define MSG_ALLPKG_ONLINE \
	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
	return -EBUSY;
}
3343 
/*
 * Remove the TDX cpuhp state.  Caller must hold cpus_read_lock(), as the
 * removal uses the _cpuslocked variant.
 */
static void __do_tdx_cleanup(void)
{
	/*
	 * Once TDX module is initialized, it cannot be disabled and
	 * re-initialized again w/o runtime update (which isn't
	 * supported by kernel).  Only need to remove the cpuhp here.
	 * The TDX host core code tracks TDX status and can handle
	 * 'multiple enabling' scenario.
	 */
	WARN_ON_ONCE(!tdx_cpuhp_state);
	cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
	tdx_cpuhp_state = 0;
}
3357 
3358 static void __tdx_cleanup(void)
3359 {
3360 	cpus_read_lock();
3361 	__do_tdx_cleanup();
3362 	cpus_read_unlock();
3363 }
3364 
/*
 * Install the TDX cpuhp callbacks and initialize the TDX module.  Caller
 * must hold cpus_read_lock() (the cpuhp setup is the _cpuslocked variant).
 */
static int __init __do_tdx_bringup(void)
{
	int r;

	/*
	 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
	 * online CPUs before calling tdx_enable(), and on any new
	 * going-online CPU to make sure it is ready for TDX guest.
	 */
	r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
					 "kvm/cpu/tdx:online",
					 tdx_online_cpu, tdx_offline_cpu);
	if (r < 0)
		return r;

	/* CPUHP_AP_ONLINE_DYN: the allocated dynamic state is returned in r. */
	tdx_cpuhp_state = r;

	r = tdx_enable();
	if (r)
		__do_tdx_cleanup();

	return r;
}
3388 
/*
 * Bring up the TDX module and cache its system information for later use.
 * On success, hardware virtualization is intentionally left enabled (TDX
 * CPU hotplug depends on it); on any failure everything is unwound.
 */
static int __init __tdx_bringup(void)
{
	const struct tdx_sys_info_td_conf *td_conf;
	int r, i;

	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
		/*
		 * Check if MSRs (tdx_uret_msrs) can be saved/restored
		 * before returning to user space.
		 */
		tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
		if (tdx_uret_msrs[i].slot == -1) {
			/* If any MSR isn't supported, it is a KVM bug */
			pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
				tdx_uret_msrs[i].msr);
			return -EIO;
		}
	}

	/*
	 * Enabling TDX requires enabling hardware virtualization first,
	 * as making SEAMCALLs requires CPU being in post-VMXON state.
	 */
	r = kvm_enable_virtualization();
	if (r)
		return r;

	cpus_read_lock();
	r = __do_tdx_bringup();
	cpus_read_unlock();

	if (r)
		goto tdx_bringup_err;

	r = -EINVAL;
	/* Get TDX global information for later use */
	tdx_sysinfo = tdx_get_sysinfo();
	if (WARN_ON_ONCE(!tdx_sysinfo))
		goto get_sysinfo_err;

	/* Check TDX module and KVM capabilities */
	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
		goto get_sysinfo_err;

	/* KVM relies on the module's CPU topology enumeration support. */
	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
		goto get_sysinfo_err;

	/*
	 * TDX has its own limit of maximum vCPUs it can support for all
	 * TDX guests in addition to KVM_MAX_VCPUS.  Userspace needs to
	 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
	 * extension on per-VM basis.
	 *
	 * TDX module reports such limit via the MAX_VCPU_PER_TD global
	 * metadata.  Different modules may report different values.
	 * Some old module may also not support this metadata (in which
	 * case this limit is U16_MAX).
	 *
	 * In practice, the reported value reflects the maximum logical
	 * CPUs that ALL the platforms that the module supports can
	 * possibly have.
	 *
	 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
	 * result in an unpredictable ABI.  KVM instead always advertise
	 * the number of logical CPUs the platform has as the maximum
	 * vCPUs for TDX guests.
	 *
	 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
	 * smaller than the number of logical CPUs, otherwise KVM will
	 * report an unsupported value to userspace.
	 *
	 * Note, a platform with TDX enabled in the BIOS cannot support
	 * physical CPU hotplug, and TDX requires the BIOS has marked
	 * all logical CPUs in MADT table as enabled.  Just use
	 * num_present_cpus() for the number of logical CPUs.
	 */
	td_conf = &tdx_sysinfo->td_conf;
	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
				td_conf->max_vcpus_per_td, num_present_cpus());
		goto get_sysinfo_err;
	}

	/* Expose the number of TDX private KeyIDs via the misc cgroup. */
	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
		goto get_sysinfo_err;

	/*
	 * Leave hardware virtualization enabled after TDX is enabled
	 * successfully.  TDX CPU hotplug depends on this.
	 */
	return 0;

get_sysinfo_err:
	__tdx_cleanup();
tdx_bringup_err:
	kvm_disable_virtualization();
	return r;
}
3488 
3489 void tdx_cleanup(void)
3490 {
3491 	if (enable_tdx) {
3492 		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3493 		__tdx_cleanup();
3494 		kvm_disable_virtualization();
3495 	}
3496 }
3497 
/*
 * Late module-init entry for TDX: validate software and hardware
 * prerequisites, then bring up the TDX module.  Most failures merely
 * disable TDX (enable_tdx = 0) while still letting KVM load; only an
 * unexpected bring-up error fails module load.
 */
int __init tdx_bringup(void)
{
	int r, i;

	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
	for_each_possible_cpu(i)
		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));

	if (!enable_tdx)
		return 0;

	if (!enable_ept) {
		pr_err("EPT is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
		pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!enable_apicv) {
		pr_err("APICv is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
		pr_err("tdx: OSXSAVE is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
		pr_err("tdx: MOVDIR64B is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
		pr_err("Self-snoop is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
		pr_err("tdx: no TDX private KeyIDs available\n");
		goto success_disable_tdx;
	}

	if (!enable_virt_at_load) {
		pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
		goto success_disable_tdx;
	}

	/*
	 * Ideally KVM should probe whether TDX module has been loaded
	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
	 * to probe whether the module is loaded (there is no CPUID or MSR
	 * for that), and making SEAMCALL requires enabling virtualization
	 * first, just like the rest steps of bringing up TDX module.
	 *
	 * So, for simplicity do everything in __tdx_bringup(); the first
	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
	 * only complication is having to make sure that initialization
	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
	 * cases.
	 */
	r = __tdx_bringup();
	if (r) {
		/*
		 * Disable TDX only but don't fail to load module if the TDX
		 * module could not be loaded.  No need to print message saying
		 * "module is not loaded" because it was printed when the first
		 * SEAMCALL failed.  Don't bother unwinding the S-EPT hooks or
		 * vm_size, as kvm_x86_ops have already been finalized (and are
		 * intentionally not exported).  The S-EPT code is unreachable,
		 * and allocating a few more bytes per VM in a should-be-rare
		 * failure scenario is a non-issue.
		 */
		if (r == -ENODEV)
			goto success_disable_tdx;

		enable_tdx = 0;
	}

	return r;

success_disable_tdx:
	enable_tdx = 0;
	return 0;
}
3586 
/*
 * Early setup: size the VM structure for TDX and install the external
 * (S-EPT) page-table and posted-interrupt hooks into vt_x86_ops.
 */
void __init tdx_hardware_setup(void)
{
	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);

	/*
	 * Note, if the TDX module can't be loaded, KVM TDX support will be
	 * disabled but KVM will continue loading (see tdx_bringup()).
	 */
	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));

	/* Route external (S-EPT) page-table management to the TDX callbacks. */
	vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}
3603