xref: /linux/arch/x86/kvm/vmx/tdx.c (revision 5feaa7a07b85ebbef418ba4b80e4e0d23dc379f5)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21 
22 #pragma GCC poison to_vmx
23 
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26 
27 #define pr_tdx_error(__fn, __err)	\
28 	pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29 
30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...)		\
31 	pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt,  __err,  __VA_ARGS__)
32 
33 #define pr_tdx_error_1(__fn, __err, __rcx)		\
34 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35 
36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx)	\
37 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38 
39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8)	\
40 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41 
42 bool enable_tdx __ro_after_init;
43 module_param_named(tdx, enable_tdx, bool, 0444);
44 
45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
47 
48 static enum cpuhp_state tdx_cpuhp_state;
49 
50 static const struct tdx_sys_info *tdx_sysinfo;
51 
52 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53 {
54 	KVM_BUG_ON(1, tdx->vcpu.kvm);
55 	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56 }
57 
58 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
59 		      u64 val, u64 err)
60 {
61 	KVM_BUG_ON(1, tdx->vcpu.kvm);
62 	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63 }
64 
65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66 
67 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68 {
69 	return container_of(kvm, struct kvm_tdx, kvm);
70 }
71 
72 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73 {
74 	return container_of(vcpu, struct vcpu_tdx, vcpu);
75 }
76 
77 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78 {
79 	u64 val = KVM_SUPPORTED_TD_ATTRS;
80 
81 	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82 		return 0;
83 
84 	val &= td_conf->attributes_fixed0;
85 
86 	return val;
87 }
88 
89 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90 {
91 	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92 
93 	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94 		return 0;
95 
96 	val &= td_conf->xfam_fixed0;
97 
98 	return val;
99 }
100 
101 static int tdx_get_guest_phys_addr_bits(const u32 eax)
102 {
103 	return (eax & GENMASK(23, 16)) >> 16;
104 }
105 
106 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107 {
108 	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109 }
110 
111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112 
113 static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114 {
115 	return entry->function == 7 && entry->index == 0 &&
116 	       (entry->ebx & TDX_FEATURE_TSX);
117 }
118 
119 static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120 {
121 	entry->ebx &= ~TDX_FEATURE_TSX;
122 }
123 
124 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125 {
126 	return entry->function == 7 && entry->index == 0 &&
127 	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128 }
129 
130 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131 {
132 	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133 }
134 
135 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136 {
137 	if (has_tsx(entry))
138 		clear_tsx(entry);
139 
140 	if (has_waitpkg(entry))
141 		clear_waitpkg(entry);
142 }
143 
144 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145 {
146 	return has_tsx(entry) || has_waitpkg(entry);
147 }
148 
149 #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
150 
151 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152 {
153 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154 
155 	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156 	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157 	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158 	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159 	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160 	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161 
162 	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163 		entry->index = 0;
164 
165 	/*
166 	 * The TDX module doesn't allow configuring the guest phys addr bits
167 	 * (EAX[23:16]).  However, KVM uses it as an interface to the userspace
168 	 * to configure the GPAW.  Report these bits as configurable.
169 	 */
170 	if (entry->function == 0x80000008)
171 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172 
173 	tdx_clear_unsupported_cpuid(entry);
174 }
175 
176 #define TDVMCALLINFO_GET_QUOTE				BIT(0)
177 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT	BIT(1)
178 
179 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
180 			     struct kvm_tdx_capabilities *caps)
181 {
182 	int i;
183 
184 	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
185 	if (!caps->supported_attrs)
186 		return -EIO;
187 
188 	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
189 	if (!caps->supported_xfam)
190 		return -EIO;
191 
192 	caps->cpuid.nent = td_conf->num_cpuid_config;
193 
194 	caps->user_tdvmcallinfo_1_r11 =
195 		TDVMCALLINFO_GET_QUOTE |
196 		TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
197 
198 	for (i = 0; i < td_conf->num_cpuid_config; i++)
199 		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
200 
201 	return 0;
202 }
203 
204 /*
205  * Some SEAMCALLs acquire the TDX module globally, and can fail with
206  * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
207  */
208 static DEFINE_MUTEX(tdx_lock);
209 
210 static atomic_t nr_configured_hkid;
211 
212 static bool tdx_operand_busy(u64 err)
213 {
214 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
215 }
216 
217 
218 /*
219  * A per-CPU list of TD vCPUs associated with a given CPU.
220  * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
221  * list.
222  * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
223  *   the old CPU during the IPI callback running on the old CPU, and then added
224  *   to the per-CPU list of the new CPU.
225  * - When a TD is tearing down, all vCPUs are disassociated from their current
226  *   running CPUs and removed from the per-CPU list during the IPI callback
227  *   running on those CPUs.
228  * - When a CPU is brought down, traverse the per-CPU list to disassociate all
229  *   associated TD vCPUs and remove them from the per-CPU list.
230  */
231 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
232 
233 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
234 {
235 	return to_tdx(vcpu)->vp_enter_args.r10;
236 }
237 
238 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
239 {
240 	return to_tdx(vcpu)->vp_enter_args.r11;
241 }
242 
243 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
244 						     long val)
245 {
246 	to_tdx(vcpu)->vp_enter_args.r10 = val;
247 }
248 
249 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
250 						    unsigned long val)
251 {
252 	to_tdx(vcpu)->vp_enter_args.r11 = val;
253 }
254 
255 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
256 {
257 	tdx_guest_keyid_free(kvm_tdx->hkid);
258 	kvm_tdx->hkid = -1;
259 	atomic_dec(&nr_configured_hkid);
260 	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
261 	put_misc_cg(kvm_tdx->misc_cg);
262 	kvm_tdx->misc_cg = NULL;
263 }
264 
265 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
266 {
267 	return kvm_tdx->hkid > 0;
268 }
269 
270 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
271 {
272 	lockdep_assert_irqs_disabled();
273 
274 	list_del(&to_tdx(vcpu)->cpu_list);
275 
276 	/*
277 	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
278 	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
279 	 * to its list before it's deleted from this CPU's list.
280 	 */
281 	smp_wmb();
282 
283 	vcpu->cpu = -1;
284 }
285 
286 static void tdx_clear_page(struct page *page)
287 {
288 	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
289 	void *dest = page_to_virt(page);
290 	unsigned long i;
291 
292 	/*
293 	 * The page could have been poisoned.  MOVDIR64B also clears
294 	 * the poison bit so the kernel can safely use the page again.
295 	 */
296 	for (i = 0; i < PAGE_SIZE; i += 64)
297 		movdir64b(dest + i, zero_page);
298 	/*
299 	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
300 	 * from seeing potentially poisoned cache.
301 	 */
302 	__mb();
303 }
304 
305 static void tdx_no_vcpus_enter_start(struct kvm *kvm)
306 {
307 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
308 
309 	lockdep_assert_held_write(&kvm->mmu_lock);
310 
311 	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
312 
313 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
314 }
315 
316 static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
317 {
318 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
319 
320 	lockdep_assert_held_write(&kvm->mmu_lock);
321 
322 	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
323 }
324 
325 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
326 static int __tdx_reclaim_page(struct page *page)
327 {
328 	u64 err, rcx, rdx, r8;
329 
330 	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
331 
332 	/*
333 	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
334 	 * before the HKID is released and control pages have also been
335 	 * released at this point, so there is no possibility of contention.
336 	 */
337 	if (WARN_ON_ONCE(err)) {
338 		pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
339 		return -EIO;
340 	}
341 	return 0;
342 }
343 
344 static int tdx_reclaim_page(struct page *page)
345 {
346 	int r;
347 
348 	r = __tdx_reclaim_page(page);
349 	if (!r)
350 		tdx_clear_page(page);
351 	return r;
352 }
353 
354 
355 /*
356  * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
357  * private KeyID.  Assume the cache associated with the TDX private KeyID has
358  * been flushed.
359  */
360 static void tdx_reclaim_control_page(struct page *ctrl_page)
361 {
362 	/*
363 	 * Leak the page if the kernel failed to reclaim the page.
364 	 * The kernel cannot use it safely anymore.
365 	 */
366 	if (tdx_reclaim_page(ctrl_page))
367 		return;
368 
369 	__free_page(ctrl_page);
370 }
371 
372 struct tdx_flush_vp_arg {
373 	struct kvm_vcpu *vcpu;
374 	u64 err;
375 };
376 
377 static void tdx_flush_vp(void *_arg)
378 {
379 	struct tdx_flush_vp_arg *arg = _arg;
380 	struct kvm_vcpu *vcpu = arg->vcpu;
381 	u64 err;
382 
383 	arg->err = 0;
384 	lockdep_assert_irqs_disabled();
385 
386 	/* Task migration can race with CPU offlining. */
387 	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
388 		return;
389 
390 	/*
391 	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
392 	 * list tracking still needs to be updated so that it's correct if/when
393 	 * the vCPU does get initialized.
394 	 */
395 	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
396 		/*
397 		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are:
398 		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
399 		 * vp flush function is called when destructing vCPU/TD or vCPU
400 		 * migration.  No other thread uses TDVPR in those cases.
401 		 */
402 		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
403 		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
404 			/*
405 			 * This function is called in IPI context. Do not use
406 			 * printk to avoid console semaphore.
407 			 * The caller prints out the error message, instead.
408 			 */
409 			if (err)
410 				arg->err = err;
411 		}
412 	}
413 
414 	tdx_disassociate_vp(vcpu);
415 }
416 
417 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
418 {
419 	struct tdx_flush_vp_arg arg = {
420 		.vcpu = vcpu,
421 	};
422 	int cpu = vcpu->cpu;
423 
424 	if (unlikely(cpu == -1))
425 		return;
426 
427 	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
428 	if (KVM_BUG_ON(arg.err, vcpu->kvm))
429 		pr_tdx_error(TDH_VP_FLUSH, arg.err);
430 }
431 
432 void tdx_disable_virtualization_cpu(void)
433 {
434 	int cpu = raw_smp_processor_id();
435 	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
436 	struct tdx_flush_vp_arg arg;
437 	struct vcpu_tdx *tdx, *tmp;
438 	unsigned long flags;
439 
440 	local_irq_save(flags);
441 	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
442 	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
443 		arg.vcpu = &tdx->vcpu;
444 		tdx_flush_vp(&arg);
445 	}
446 	local_irq_restore(flags);
447 }
448 
449 #define TDX_SEAMCALL_RETRIES 10000
450 
451 static void smp_func_do_phymem_cache_wb(void *unused)
452 {
453 	u64 err = 0;
454 	bool resume;
455 	int i;
456 
457 	/*
458 	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
459 	 * KeyID on the package or core.  The TDX module may not finish the
460 	 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
461 	 * kernel should retry it until it returns success w/o rescheduling.
462 	 */
463 	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
464 		resume = !!err;
465 		err = tdh_phymem_cache_wb(resume);
466 		switch (err) {
467 		case TDX_INTERRUPTED_RESUMABLE:
468 			continue;
469 		case TDX_NO_HKID_READY_TO_WBCACHE:
470 			err = TDX_SUCCESS; /* Already done by other thread */
471 			fallthrough;
472 		default:
473 			goto out;
474 		}
475 	}
476 
477 out:
478 	if (WARN_ON_ONCE(err))
479 		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
480 }
481 
482 void tdx_mmu_release_hkid(struct kvm *kvm)
483 {
484 	bool packages_allocated, targets_allocated;
485 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
486 	cpumask_var_t packages, targets;
487 	struct kvm_vcpu *vcpu;
488 	unsigned long j;
489 	int i;
490 	u64 err;
491 
492 	if (!is_hkid_assigned(kvm_tdx))
493 		return;
494 
495 	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
496 	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
497 	cpus_read_lock();
498 
499 	kvm_for_each_vcpu(j, vcpu, kvm)
500 		tdx_flush_vp_on_cpu(vcpu);
501 
502 	/*
503 	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
504 	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
505 	 * Multiple TDX guests can be destroyed simultaneously. Take the
506 	 * mutex to prevent it from getting error.
507 	 */
508 	mutex_lock(&tdx_lock);
509 
510 	/*
511 	 * Releasing HKID is in vm_destroy().
512 	 * After the above flushing vps, there should be no more vCPU
513 	 * associations, as all vCPU fds have been released at this stage.
514 	 */
515 	err = tdh_mng_vpflushdone(&kvm_tdx->td);
516 	if (err == TDX_FLUSHVP_NOT_DONE)
517 		goto out;
518 	if (KVM_BUG_ON(err, kvm)) {
519 		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
520 		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
521 		       kvm_tdx->hkid);
522 		goto out;
523 	}
524 
525 	for_each_online_cpu(i) {
526 		if (packages_allocated &&
527 		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
528 					     packages))
529 			continue;
530 		if (targets_allocated)
531 			cpumask_set_cpu(i, targets);
532 	}
533 	if (targets_allocated)
534 		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
535 	else
536 		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
537 	/*
538 	 * In the case of error in smp_func_do_phymem_cache_wb(), the following
539 	 * tdh_mng_key_freeid() will fail.
540 	 */
541 	err = tdh_mng_key_freeid(&kvm_tdx->td);
542 	if (KVM_BUG_ON(err, kvm)) {
543 		pr_tdx_error(TDH_MNG_KEY_FREEID, err);
544 		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
545 		       kvm_tdx->hkid);
546 	} else {
547 		tdx_hkid_free(kvm_tdx);
548 	}
549 
550 out:
551 	mutex_unlock(&tdx_lock);
552 	cpus_read_unlock();
553 	free_cpumask_var(targets);
554 	free_cpumask_var(packages);
555 }
556 
557 static void tdx_reclaim_td_control_pages(struct kvm *kvm)
558 {
559 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
560 	u64 err;
561 	int i;
562 
563 	/*
564 	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
565 	 * heavily with TDX module.  Give up freeing TD pages.  As the function
566 	 * already warned, don't warn it again.
567 	 */
568 	if (is_hkid_assigned(kvm_tdx))
569 		return;
570 
571 	if (kvm_tdx->td.tdcs_pages) {
572 		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
573 			if (!kvm_tdx->td.tdcs_pages[i])
574 				continue;
575 
576 			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
577 		}
578 		kfree(kvm_tdx->td.tdcs_pages);
579 		kvm_tdx->td.tdcs_pages = NULL;
580 	}
581 
582 	if (!kvm_tdx->td.tdr_page)
583 		return;
584 
585 	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
586 		return;
587 
588 	/*
589 	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
590 	 * KeyID. TDX module may access TDR while operating on TD (Especially
591 	 * when it is reclaiming TDCS).
592 	 */
593 	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
594 	if (KVM_BUG_ON(err, kvm)) {
595 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
596 		return;
597 	}
598 	tdx_clear_page(kvm_tdx->td.tdr_page);
599 
600 	__free_page(kvm_tdx->td.tdr_page);
601 	kvm_tdx->td.tdr_page = NULL;
602 }
603 
604 void tdx_vm_destroy(struct kvm *kvm)
605 {
606 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
607 
608 	tdx_reclaim_td_control_pages(kvm);
609 
610 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
611 }
612 
613 static int tdx_do_tdh_mng_key_config(void *param)
614 {
615 	struct kvm_tdx *kvm_tdx = param;
616 	u64 err;
617 
618 	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
619 	err = tdh_mng_key_config(&kvm_tdx->td);
620 
621 	if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
622 		pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
623 		return -EIO;
624 	}
625 
626 	return 0;
627 }
628 
629 int tdx_vm_init(struct kvm *kvm)
630 {
631 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
632 
633 	kvm->arch.has_protected_state = true;
634 	kvm->arch.has_private_mem = true;
635 	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
636 
637 	/*
638 	 * Because guest TD is protected, VMM can't parse the instruction in TD.
639 	 * Instead, guest uses MMIO hypercall.  For unmodified device driver,
640 	 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
641 	 * instruction into MMIO hypercall.
642 	 *
643 	 * SPTE value for MMIO needs to be setup so that #VE is injected into
644 	 * TD instead of triggering EPT MISCONFIG.
645 	 * - RWX=0 so that EPT violation is triggered.
646 	 * - suppress #VE bit is cleared to inject #VE.
647 	 */
648 	kvm_mmu_set_mmio_spte_value(kvm, 0);
649 
650 	/*
651 	 * TDX has its own limit of maximum vCPUs it can support for all
652 	 * TDX guests in addition to KVM_MAX_VCPUS.  TDX module reports
653 	 * such limit via the MAX_VCPU_PER_TD global metadata.  In
654 	 * practice, it reflects the number of logical CPUs that ALL
655 	 * platforms that the TDX module supports can possibly have.
656 	 *
657 	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
658 	 * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
659 	 * userspace would result in an unpredictable ABI.
660 	 */
661 	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
662 
663 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
664 
665 	return 0;
666 }
667 
668 int tdx_vcpu_create(struct kvm_vcpu *vcpu)
669 {
670 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
671 	struct vcpu_tdx *tdx = to_tdx(vcpu);
672 
673 	if (kvm_tdx->state != TD_STATE_INITIALIZED)
674 		return -EIO;
675 
676 	/*
677 	 * TDX module mandates APICv, which requires an in-kernel local APIC.
678 	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
679 	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
680 	 */
681 	if (!irqchip_split(vcpu->kvm))
682 		return -EINVAL;
683 
684 	fpstate_set_confidential(&vcpu->arch.guest_fpu);
685 	vcpu->arch.apic->guest_apic_protected = true;
686 	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
687 
688 	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
689 
690 	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
691 	vcpu->arch.cr0_guest_owned_bits = -1ul;
692 	vcpu->arch.cr4_guest_owned_bits = -1ul;
693 
694 	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
695 	vcpu->arch.guest_tsc_protected = true;
696 	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
697 	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
698 	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
699 	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
700 
701 	vcpu->arch.guest_state_protected =
702 		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
703 
704 	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
705 		vcpu->arch.xfd_no_write_intercept = true;
706 
707 	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
708 	__pi_set_sn(&tdx->vt.pi_desc);
709 
710 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
711 
712 	return 0;
713 }
714 
715 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
716 {
717 	struct vcpu_tdx *tdx = to_tdx(vcpu);
718 
719 	vmx_vcpu_pi_load(vcpu, cpu);
720 	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
721 		return;
722 
723 	tdx_flush_vp_on_cpu(vcpu);
724 
725 	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
726 	local_irq_disable();
727 	/*
728 	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
729 	 * vcpu->cpu is read before tdx->cpu_list.
730 	 */
731 	smp_rmb();
732 
733 	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
734 	local_irq_enable();
735 }
736 
737 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
738 {
739 	/*
740 	 * KVM can't get the interrupt status of TDX guest and it assumes
741 	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
742 	 * which passes the interrupt blocked flag.
743 	 */
744 	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
745 	       !to_tdx(vcpu)->vp_enter_args.r12;
746 }
747 
748 bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
749 {
750 	u64 vcpu_state_details;
751 
752 	if (pi_has_pending_interrupt(vcpu))
753 		return true;
754 
755 	/*
756 	 * Only check RVI pending for HALTED case with IRQ enabled.
757 	 * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
758 	 * interrupt was pending before TD exit, then it _must_ be blocked,
759 	 * otherwise the interrupt would have been serviced at the instruction
760 	 * boundary.
761 	 */
762 	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
763 	    to_tdx(vcpu)->vp_enter_args.r12)
764 		return false;
765 
766 	vcpu_state_details =
767 		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
768 
769 	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
770 }
771 
772 /*
773  * Compared to vmx_prepare_switch_to_guest(), there is not much to do
774  * as SEAMCALL/SEAMRET calls take care of most of save and restore.
775  */
776 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
777 {
778 	struct vcpu_vt *vt = to_vt(vcpu);
779 
780 	if (vt->guest_state_loaded)
781 		return;
782 
783 	if (likely(is_64bit_mm(current->mm)))
784 		vt->msr_host_kernel_gs_base = current->thread.gsbase;
785 	else
786 		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
787 
788 	vt->host_debugctlmsr = get_debugctlmsr();
789 
790 	vt->guest_state_loaded = true;
791 }
792 
793 struct tdx_uret_msr {
794 	u32 msr;
795 	unsigned int slot;
796 	u64 defval;
797 };
798 
799 static struct tdx_uret_msr tdx_uret_msrs[] = {
800 	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
801 	{.msr = MSR_STAR,},
802 	{.msr = MSR_LSTAR,},
803 	{.msr = MSR_TSC_AUX,},
804 };
805 
806 static void tdx_user_return_msr_update_cache(void)
807 {
808 	int i;
809 
810 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
811 		kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
812 						 tdx_uret_msrs[i].defval);
813 }
814 
815 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
816 {
817 	struct vcpu_vt *vt = to_vt(vcpu);
818 	struct vcpu_tdx *tdx = to_tdx(vcpu);
819 
820 	if (!vt->guest_state_loaded)
821 		return;
822 
823 	++vcpu->stat.host_state_reload;
824 	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
825 
826 	if (tdx->guest_entered) {
827 		tdx_user_return_msr_update_cache();
828 		tdx->guest_entered = false;
829 	}
830 
831 	vt->guest_state_loaded = false;
832 }
833 
834 void tdx_vcpu_put(struct kvm_vcpu *vcpu)
835 {
836 	vmx_vcpu_pi_put(vcpu);
837 	tdx_prepare_switch_to_host(vcpu);
838 }
839 
840 void tdx_vcpu_free(struct kvm_vcpu *vcpu)
841 {
842 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
843 	struct vcpu_tdx *tdx = to_tdx(vcpu);
844 	int i;
845 
846 	/*
847 	 * It is not possible to reclaim pages while hkid is assigned. It might
848 	 * be assigned if:
849 	 * 1. the TD VM is being destroyed but freeing hkid failed, in which
850 	 * case the pages are leaked
851 	 * 2. TD VCPU creation failed and this on the error path, in which case
852 	 * there is nothing to do anyway
853 	 */
854 	if (is_hkid_assigned(kvm_tdx))
855 		return;
856 
857 	if (tdx->vp.tdcx_pages) {
858 		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
859 			if (tdx->vp.tdcx_pages[i])
860 				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
861 		}
862 		kfree(tdx->vp.tdcx_pages);
863 		tdx->vp.tdcx_pages = NULL;
864 	}
865 	if (tdx->vp.tdvpr_page) {
866 		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
867 		tdx->vp.tdvpr_page = 0;
868 	}
869 
870 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
871 }
872 
873 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
874 {
875 	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
876 		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
877 		return -EINVAL;
878 
879 	return 1;
880 }
881 
882 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
883 {
884 	switch (tdvmcall_leaf(vcpu)) {
885 	case EXIT_REASON_CPUID:
886 	case EXIT_REASON_HLT:
887 	case EXIT_REASON_IO_INSTRUCTION:
888 	case EXIT_REASON_MSR_READ:
889 	case EXIT_REASON_MSR_WRITE:
890 		return tdvmcall_leaf(vcpu);
891 	case EXIT_REASON_EPT_VIOLATION:
892 		return EXIT_REASON_EPT_MISCONFIG;
893 	default:
894 		break;
895 	}
896 
897 	return EXIT_REASON_TDCALL;
898 }
899 
900 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
901 {
902 	struct vcpu_tdx *tdx = to_tdx(vcpu);
903 	u32 exit_reason;
904 
905 	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
906 	case TDX_SUCCESS:
907 	case TDX_NON_RECOVERABLE_VCPU:
908 	case TDX_NON_RECOVERABLE_TD:
909 	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
910 	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
911 		break;
912 	default:
913 		return -1u;
914 	}
915 
916 	exit_reason = tdx->vp_enter_ret;
917 
918 	switch (exit_reason) {
919 	case EXIT_REASON_TDCALL:
920 		if (tdvmcall_exit_type(vcpu))
921 			return EXIT_REASON_VMCALL;
922 
923 		return tdcall_to_vmx_exit_reason(vcpu);
924 	case EXIT_REASON_EPT_MISCONFIG:
925 		/*
926 		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
927 		 * non-instrumentable code with interrupts disabled.
928 		 */
929 		return -1u;
930 	default:
931 		break;
932 	}
933 
934 	return exit_reason;
935 }
936 
937 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
938 {
939 	struct vcpu_tdx *tdx = to_tdx(vcpu);
940 	struct vcpu_vt *vt = to_vt(vcpu);
941 
942 	guest_state_enter_irqoff();
943 
944 	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
945 
946 	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
947 
948 	vt->exit_qualification = tdx->vp_enter_args.rcx;
949 	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
950 	tdx->exit_gpa = tdx->vp_enter_args.r8;
951 	vt->exit_intr_info = tdx->vp_enter_args.r9;
952 
953 	vmx_handle_nmi(vcpu);
954 
955 	guest_state_exit_irqoff();
956 }
957 
958 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
959 {
960 	return vmx_get_exit_reason(vcpu).failed_vmentry &&
961 	       vmx_get_exit_reason(vcpu).full != -1u;
962 }
963 
964 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
965 {
966 	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
967 
968 	/*
969 	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
970 	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
971 	 *
972 	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
973 	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
974 	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
975 	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
976 	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
977 	 * requester may be blocked endlessly.
978 	 */
979 	if (unlikely(tdx_operand_busy(vp_enter_ret)))
980 		return EXIT_FASTPATH_EXIT_HANDLED;
981 
982 	return EXIT_FASTPATH_NONE;
983 }
984 
985 #define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
986 				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
987 				 BIT_ULL(VCPU_REGS_RAX) | \
988 				 BIT_ULL(VCPU_REGS_RBX) | \
989 				 BIT_ULL(VCPU_REGS_RCX) | \
990 				 BIT_ULL(VCPU_REGS_RDX) | \
991 				 BIT_ULL(VCPU_REGS_RBP) | \
992 				 BIT_ULL(VCPU_REGS_RSI) | \
993 				 BIT_ULL(VCPU_REGS_RDI) | \
994 				 BIT_ULL(VCPU_REGS_R8) | \
995 				 BIT_ULL(VCPU_REGS_R9) | \
996 				 BIT_ULL(VCPU_REGS_R10) | \
997 				 BIT_ULL(VCPU_REGS_R11) | \
998 				 BIT_ULL(VCPU_REGS_R12) | \
999 				 BIT_ULL(VCPU_REGS_R13) | \
1000 				 BIT_ULL(VCPU_REGS_R14) | \
1001 				 BIT_ULL(VCPU_REGS_R15))
1002 
1003 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
1004 {
1005 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
1006 
1007 	/*
1008 	 * All TDX hosts support PKRU; but even if they didn't,
1009 	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1010 	 * skipped.
1011 	 */
1012 	if (vcpu->arch.host_pkru != 0)
1013 		wrpkru(vcpu->arch.host_pkru);
1014 
1015 	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1016 		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1017 
1018 	/*
1019 	 * Likewise, even if a TDX hosts didn't support XSS both arms of
1020 	 * the comparison would be 0 and the wrmsrl would be skipped.
1021 	 */
1022 	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1023 		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1024 }
1025 
1026 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1027 				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1028 				DEBUGCTLMSR_FREEZE_IN_SMM)
1029 
1030 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
1031 {
1032 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1033 	struct vcpu_vt *vt = to_vt(vcpu);
1034 
1035 	/*
1036 	 * force_immediate_exit requires vCPU entering for events injection with
1037 	 * an immediately exit followed. But The TDX module doesn't guarantee
1038 	 * entry, it's already possible for KVM to _think_ it completely entry
1039 	 * to the guest without actually having done so.
1040 	 * Since KVM never needs to force an immediate exit for TDX, and can't
1041 	 * do direct injection, just warn on force_immediate_exit.
1042 	 */
1043 	WARN_ON_ONCE(force_immediate_exit);
1044 
1045 	/*
1046 	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
1047 	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
1048 	 * TDCALLs.
1049 	 */
1050 	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1051 		return EXIT_FASTPATH_EXIT_HANDLED;
1052 
1053 	trace_kvm_entry(vcpu, force_immediate_exit);
1054 
1055 	if (pi_test_on(&vt->pi_desc)) {
1056 		apic->send_IPI_self(POSTED_INTR_VECTOR);
1057 
1058 		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1059 			       APIC_VECTOR_MASK, &vt->pi_desc))
1060 			kvm_wait_lapic_expire(vcpu);
1061 	}
1062 
1063 	tdx_vcpu_enter_exit(vcpu);
1064 
1065 	if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
1066 		update_debugctlmsr(vt->host_debugctlmsr);
1067 
1068 	tdx_load_host_xsave_state(vcpu);
1069 	tdx->guest_entered = true;
1070 
1071 	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1072 
1073 	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1074 		return EXIT_FASTPATH_NONE;
1075 
1076 	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1077 		return EXIT_FASTPATH_NONE;
1078 
1079 	if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1080 		kvm_machine_check();
1081 
1082 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
1083 
1084 	if (unlikely(tdx_failed_vmentry(vcpu)))
1085 		return EXIT_FASTPATH_NONE;
1086 
1087 	return tdx_exit_handlers_fastpath(vcpu);
1088 }
1089 
1090 void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1091 {
1092 	++vcpu->stat.nmi_injections;
1093 	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1094 	/*
1095 	 * From KVM's perspective, NMI injection is completed right after
1096 	 * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
1097 	 * the TDX module or not.
1098 	 */
1099 	vcpu->arch.nmi_injected = false;
1100 	/*
1101 	 * TDX doesn't support KVM to request NMI window exit.  If there is
1102 	 * still a pending vNMI, KVM is not able to inject it along with the
1103 	 * one pending in TDX module in a back-to-back way.  Since the previous
1104 	 * vNMI is still pending in TDX module, i.e. it has not been delivered
1105 	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
1106 	 * previous one.  The guest is expected to handle all the NMI sources
1107 	 * when handling the first vNMI.
1108 	 */
1109 	vcpu->arch.nmi_pending = 0;
1110 }
1111 
1112 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1113 {
1114 	u32 intr_info = vmx_get_intr_info(vcpu);
1115 
1116 	/*
1117 	 * Machine checks are handled by handle_exception_irqoff(), or by
1118 	 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1119 	 * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
1120 	 */
1121 	if (is_nmi(intr_info) || is_machine_check(intr_info))
1122 		return 1;
1123 
1124 	vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1125 	vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1126 	vcpu->run->ex.error_code = 0;
1127 
1128 	return 0;
1129 }
1130 
1131 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1132 {
1133 	tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1134 	return 1;
1135 }
1136 
1137 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1138 {
1139 	kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1140 	kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1141 	kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1142 	kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1143 	kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1144 
1145 	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1146 }
1147 
1148 /*
1149  * Split into chunks and check interrupt pending between chunks.  This allows
1150  * for timely injection of interrupts to prevent issues with guest lockup
1151  * detection.
1152  */
1153 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1154 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1155 
1156 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1157 {
1158 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1159 
1160 	if (vcpu->run->hypercall.ret) {
1161 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1162 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1163 		return 1;
1164 	}
1165 
1166 	tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1167 	if (tdx->map_gpa_next >= tdx->map_gpa_end)
1168 		return 1;
1169 
1170 	/*
1171 	 * Stop processing the remaining part if there is a pending interrupt,
1172 	 * which could be qualified to deliver.  Skip checking pending RVI for
1173 	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1174 	 */
1175 	if (kvm_vcpu_has_events(vcpu)) {
1176 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1177 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1178 		return 1;
1179 	}
1180 
1181 	__tdx_map_gpa(tdx);
1182 	return 0;
1183 }
1184 
1185 static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1186 {
1187 	u64 gpa = tdx->map_gpa_next;
1188 	u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1189 
1190 	if (size > TDX_MAP_GPA_MAX_LEN)
1191 		size = TDX_MAP_GPA_MAX_LEN;
1192 
1193 	tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
1194 	tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
1195 	/*
1196 	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1197 	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1198 	 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
1199 	 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
1200 	 */
1201 	tdx->vcpu.run->hypercall.ret = 0;
1202 	tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1203 	tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1204 	tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1205 					   KVM_MAP_GPA_RANGE_ENCRYPTED :
1206 					   KVM_MAP_GPA_RANGE_DECRYPTED;
1207 	tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;
1208 
1209 	tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1210 }
1211 
1212 static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1213 {
1214 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1215 	u64 gpa = tdx->vp_enter_args.r12;
1216 	u64 size = tdx->vp_enter_args.r13;
1217 	u64 ret;
1218 
1219 	/*
1220 	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1221 	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1222 	 * bit set.  This is a base call so it should always be supported, but
1223 	 * KVM has no way to ensure that userspace implements the GHCI correctly.
1224 	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1225 	 * to the guest.
1226 	 */
1227 	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1228 		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1229 		goto error;
1230 	}
1231 
1232 	if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1233 	    !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1234 	    (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1235 	     vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1236 		ret = TDVMCALL_STATUS_INVALID_OPERAND;
1237 		goto error;
1238 	}
1239 
1240 	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1241 		ret = TDVMCALL_STATUS_ALIGN_ERROR;
1242 		goto error;
1243 	}
1244 
1245 	tdx->map_gpa_end = gpa + size;
1246 	tdx->map_gpa_next = gpa;
1247 
1248 	__tdx_map_gpa(tdx);
1249 	return 0;
1250 
1251 error:
1252 	tdvmcall_set_return_code(vcpu, ret);
1253 	tdx->vp_enter_args.r11 = gpa;
1254 	return 1;
1255 }
1256 
1257 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1258 {
1259 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1260 	u64 *regs = vcpu->run->system_event.data;
1261 	u64 *module_regs = &tdx->vp_enter_args.r8;
1262 	int index = VCPU_REGS_RAX;
1263 
1264 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1265 	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1266 	vcpu->run->system_event.ndata = 16;
1267 
1268 	/* Dump 16 general-purpose registers to userspace in ascending order. */
1269 	regs[index++] = tdx->vp_enter_ret;
1270 	regs[index++] = tdx->vp_enter_args.rcx;
1271 	regs[index++] = tdx->vp_enter_args.rdx;
1272 	regs[index++] = tdx->vp_enter_args.rbx;
1273 	regs[index++] = 0;
1274 	regs[index++] = 0;
1275 	regs[index++] = tdx->vp_enter_args.rsi;
1276 	regs[index] = tdx->vp_enter_args.rdi;
1277 	for (index = 0; index < 8; index++)
1278 		regs[VCPU_REGS_R8 + index] = module_regs[index];
1279 
1280 	return 0;
1281 }
1282 
1283 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1284 {
1285 	u32 eax, ebx, ecx, edx;
1286 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1287 
1288 	/* EAX and ECX for cpuid is stored in R12 and R13. */
1289 	eax = tdx->vp_enter_args.r12;
1290 	ecx = tdx->vp_enter_args.r13;
1291 
1292 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1293 
1294 	tdx->vp_enter_args.r12 = eax;
1295 	tdx->vp_enter_args.r13 = ebx;
1296 	tdx->vp_enter_args.r14 = ecx;
1297 	tdx->vp_enter_args.r15 = edx;
1298 
1299 	return 1;
1300 }
1301 
1302 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1303 {
1304 	vcpu->arch.pio.count = 0;
1305 	return 1;
1306 }
1307 
1308 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1309 {
1310 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1311 	unsigned long val = 0;
1312 	int ret;
1313 
1314 	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1315 					 vcpu->arch.pio.port, &val, 1);
1316 
1317 	WARN_ON_ONCE(!ret);
1318 
1319 	tdvmcall_set_return_val(vcpu, val);
1320 
1321 	return 1;
1322 }
1323 
1324 static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1325 {
1326 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1327 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1328 	unsigned long val = 0;
1329 	unsigned int port;
1330 	u64 size, write;
1331 	int ret;
1332 
1333 	++vcpu->stat.io_exits;
1334 
1335 	size = tdx->vp_enter_args.r12;
1336 	write = tdx->vp_enter_args.r13;
1337 	port = tdx->vp_enter_args.r14;
1338 
1339 	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1340 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1341 		return 1;
1342 	}
1343 
1344 	if (write) {
1345 		val = tdx->vp_enter_args.r15;
1346 		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1347 	} else {
1348 		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1349 	}
1350 
1351 	if (!ret)
1352 		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1353 							   tdx_complete_pio_in;
1354 	else if (!write)
1355 		tdvmcall_set_return_val(vcpu, val);
1356 
1357 	return ret;
1358 }
1359 
1360 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1361 {
1362 	unsigned long val = 0;
1363 	gpa_t gpa;
1364 	int size;
1365 
1366 	gpa = vcpu->mmio_fragments[0].gpa;
1367 	size = vcpu->mmio_fragments[0].len;
1368 
1369 	memcpy(&val, vcpu->run->mmio.data, size);
1370 	tdvmcall_set_return_val(vcpu, val);
1371 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1372 	return 1;
1373 }
1374 
1375 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1376 				 unsigned long val)
1377 {
1378 	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1379 		trace_kvm_fast_mmio(gpa);
1380 		return 0;
1381 	}
1382 
1383 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1384 	if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1385 		return -EOPNOTSUPP;
1386 
1387 	return 0;
1388 }
1389 
1390 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1391 {
1392 	unsigned long val;
1393 
1394 	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1395 		return -EOPNOTSUPP;
1396 
1397 	tdvmcall_set_return_val(vcpu, val);
1398 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1399 	return 0;
1400 }
1401 
1402 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1403 {
1404 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1405 	int size, write, r;
1406 	unsigned long val;
1407 	gpa_t gpa;
1408 
1409 	size = tdx->vp_enter_args.r12;
1410 	write = tdx->vp_enter_args.r13;
1411 	gpa = tdx->vp_enter_args.r14;
1412 	val = write ? tdx->vp_enter_args.r15 : 0;
1413 
1414 	if (size != 1 && size != 2 && size != 4 && size != 8)
1415 		goto error;
1416 	if (write != 0 && write != 1)
1417 		goto error;
1418 
1419 	/*
1420 	 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
1421 	 * do MMIO emulation for private GPA.
1422 	 */
1423 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1424 	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1425 		goto error;
1426 
1427 	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1428 
1429 	if (write)
1430 		r = tdx_mmio_write(vcpu, gpa, size, val);
1431 	else
1432 		r = tdx_mmio_read(vcpu, gpa, size);
1433 	if (!r)
1434 		/* Kernel completed device emulation. */
1435 		return 1;
1436 
1437 	/* Request the device emulation to userspace device model. */
1438 	vcpu->mmio_is_write = write;
1439 	if (!write)
1440 		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1441 
1442 	vcpu->run->mmio.phys_addr = gpa;
1443 	vcpu->run->mmio.len = size;
1444 	vcpu->run->mmio.is_write = write;
1445 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
1446 
1447 	if (write) {
1448 		memcpy(vcpu->run->mmio.data, &val, size);
1449 	} else {
1450 		vcpu->mmio_fragments[0].gpa = gpa;
1451 		vcpu->mmio_fragments[0].len = size;
1452 		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1453 	}
1454 	return 0;
1455 
1456 error:
1457 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1458 	return 1;
1459 }
1460 
1461 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1462 {
1463 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1464 
1465 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1466 
1467 	/*
1468 	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
1469 	 * directly without the support from userspace, just set the value
1470 	 * returned from userspace.
1471 	 */
1472 	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1473 	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1474 	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1475 	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1476 
1477 	return 1;
1478 }
1479 
1480 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1481 {
1482 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1483 
1484 	switch (tdx->vp_enter_args.r12) {
1485 	case 0:
1486 		tdx->vp_enter_args.r11 = 0;
1487 		tdx->vp_enter_args.r12 = 0;
1488 		tdx->vp_enter_args.r13 = 0;
1489 		tdx->vp_enter_args.r14 = 0;
1490 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1491 		return 1;
1492 	case 1:
1493 		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1494 		vcpu->run->exit_reason = KVM_EXIT_TDX;
1495 		vcpu->run->tdx.flags = 0;
1496 		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1497 		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1498 		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1499 		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1500 		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1501 		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1502 		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1503 		return 0;
1504 	default:
1505 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1506 		return 1;
1507 	}
1508 }
1509 
1510 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1511 {
1512 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1513 	return 1;
1514 }
1515 
1516 static int tdx_get_quote(struct kvm_vcpu *vcpu)
1517 {
1518 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1519 	u64 gpa = tdx->vp_enter_args.r12;
1520 	u64 size = tdx->vp_enter_args.r13;
1521 
1522 	/* The gpa of buffer must have shared bit set. */
1523 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1524 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1525 		return 1;
1526 	}
1527 
1528 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1529 	vcpu->run->tdx.flags = 0;
1530 	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1531 	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1532 	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1533 	vcpu->run->tdx.get_quote.size = size;
1534 
1535 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1536 
1537 	return 0;
1538 }
1539 
1540 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1541 {
1542 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1543 	u64 vector = tdx->vp_enter_args.r12;
1544 
1545 	if (vector < 32 || vector > 255) {
1546 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1547 		return 1;
1548 	}
1549 
1550 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1551 	vcpu->run->tdx.flags = 0;
1552 	vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1553 	vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1554 	vcpu->run->tdx.setup_event_notify.vector = vector;
1555 
1556 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1557 
1558 	return 0;
1559 }
1560 
1561 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1562 {
1563 	switch (tdvmcall_leaf(vcpu)) {
1564 	case TDVMCALL_MAP_GPA:
1565 		return tdx_map_gpa(vcpu);
1566 	case TDVMCALL_REPORT_FATAL_ERROR:
1567 		return tdx_report_fatal_error(vcpu);
1568 	case TDVMCALL_GET_TD_VM_CALL_INFO:
1569 		return tdx_get_td_vm_call_info(vcpu);
1570 	case TDVMCALL_GET_QUOTE:
1571 		return tdx_get_quote(vcpu);
1572 	case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1573 		return tdx_setup_event_notify_interrupt(vcpu);
1574 	default:
1575 		break;
1576 	}
1577 
1578 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1579 	return 1;
1580 }
1581 
1582 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1583 {
1584 	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1585 			  TDX_SHARED_BIT_PWL_4;
1586 
1587 	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1588 		return;
1589 
1590 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1591 }
1592 
1593 static void tdx_unpin(struct kvm *kvm, struct page *page)
1594 {
1595 	put_page(page);
1596 }
1597 
1598 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1599 			    enum pg_level level, struct page *page)
1600 {
1601 	int tdx_level = pg_level_to_tdx_sept_level(level);
1602 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1603 	gpa_t gpa = gfn_to_gpa(gfn);
1604 	u64 entry, level_state;
1605 	u64 err;
1606 
1607 	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1608 	if (unlikely(tdx_operand_busy(err))) {
1609 		tdx_unpin(kvm, page);
1610 		return -EBUSY;
1611 	}
1612 
1613 	if (KVM_BUG_ON(err, kvm)) {
1614 		pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1615 		tdx_unpin(kvm, page);
1616 		return -EIO;
1617 	}
1618 
1619 	return 0;
1620 }
1621 
1622 /*
1623  * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
1624  * callback tdx_gmem_post_populate() then maps pages into private memory.
1625  * through the a seamcall TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
1626  * private EPT structures for the page to have been built before, which is
1627  * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1628  * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1629  * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1630  * are no half-initialized shared EPT pages.
1631  */
1632 static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
1633 					  enum pg_level level, kvm_pfn_t pfn)
1634 {
1635 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1636 
1637 	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1638 		return -EINVAL;
1639 
1640 	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1641 	atomic64_inc(&kvm_tdx->nr_premapped);
1642 	return 0;
1643 }
1644 
1645 int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1646 			      enum pg_level level, kvm_pfn_t pfn)
1647 {
1648 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1649 	struct page *page = pfn_to_page(pfn);
1650 
1651 	/* TODO: handle large pages. */
1652 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1653 		return -EINVAL;
1654 
1655 	/*
1656 	 * Because guest_memfd doesn't support page migration with
1657 	 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1658 	 * migration.  Until guest_memfd supports page migration, prevent page
1659 	 * migration.
1660 	 * TODO: Once guest_memfd introduces callback on page migration,
1661 	 * implement it and remove get_page/put_page().
1662 	 */
1663 	get_page(page);
1664 
1665 	/*
1666 	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1667 	 * barrier in tdx_td_finalize().
1668 	 */
1669 	smp_rmb();
1670 	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1671 		return tdx_mem_page_aug(kvm, gfn, level, page);
1672 
1673 	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1674 }
1675 
1676 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
1677 				      enum pg_level level, struct page *page)
1678 {
1679 	int tdx_level = pg_level_to_tdx_sept_level(level);
1680 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1681 	gpa_t gpa = gfn_to_gpa(gfn);
1682 	u64 err, entry, level_state;
1683 
1684 	/* TODO: handle large pages. */
1685 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1686 		return -EINVAL;
1687 
1688 	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1689 		return -EINVAL;
1690 
1691 	/*
1692 	 * When zapping private page, write lock is held. So no race condition
1693 	 * with other vcpu sept operation.
1694 	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
1695 	 */
1696 	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1697 				  &level_state);
1698 
1699 	if (unlikely(tdx_operand_busy(err))) {
1700 		/*
1701 		 * The second retry is expected to succeed after kicking off all
1702 		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
1703 		 */
1704 		tdx_no_vcpus_enter_start(kvm);
1705 		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1706 					  &level_state);
1707 		tdx_no_vcpus_enter_stop(kvm);
1708 	}
1709 
1710 	if (KVM_BUG_ON(err, kvm)) {
1711 		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1712 		return -EIO;
1713 	}
1714 
1715 	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1716 
1717 	if (KVM_BUG_ON(err, kvm)) {
1718 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1719 		return -EIO;
1720 	}
1721 	tdx_clear_page(page);
1722 	tdx_unpin(kvm, page);
1723 	return 0;
1724 }
1725 
1726 int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1727 			      enum pg_level level, void *private_spt)
1728 {
1729 	int tdx_level = pg_level_to_tdx_sept_level(level);
1730 	gpa_t gpa = gfn_to_gpa(gfn);
1731 	struct page *page = virt_to_page(private_spt);
1732 	u64 err, entry, level_state;
1733 
1734 	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1735 			       &level_state);
1736 	if (unlikely(tdx_operand_busy(err)))
1737 		return -EBUSY;
1738 
1739 	if (KVM_BUG_ON(err, kvm)) {
1740 		pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1741 		return -EIO;
1742 	}
1743 
1744 	return 0;
1745 }
1746 
1747 /*
1748  * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
1749  * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
1750  * successfully.
1751  *
1752  * Since tdh_mem_sept_add() must have been invoked successfully before a
1753  * non-leaf entry present in the mirrored page table, the SEPT ZAP related
1754  * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
1755  * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
1756  * SEPT.
1757  *
1758  * Further check if the returned entry from SEPT walking is with RWX permissions
1759  * to filter out anything unexpected.
1760  *
1761  * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1762  * level_state returned from a SEAMCALL error is the same as that passed into
1763  * the SEAMCALL.
1764  */
1765 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1766 					     u64 entry, int level)
1767 {
1768 	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1769 		return false;
1770 
1771 	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1772 		return false;
1773 
1774 	if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1775 		return false;
1776 
1777 	return true;
1778 }
1779 
1780 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1781 				     enum pg_level level, struct page *page)
1782 {
1783 	int tdx_level = pg_level_to_tdx_sept_level(level);
1784 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1785 	gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1786 	u64 err, entry, level_state;
1787 
1788 	/* For now large page isn't supported yet. */
1789 	WARN_ON_ONCE(level != PG_LEVEL_4K);
1790 
1791 	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1792 
1793 	if (unlikely(tdx_operand_busy(err))) {
1794 		/* After no vCPUs enter, the second retry is expected to succeed */
1795 		tdx_no_vcpus_enter_start(kvm);
1796 		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1797 		tdx_no_vcpus_enter_stop(kvm);
1798 	}
1799 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1800 	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1801 		atomic64_dec(&kvm_tdx->nr_premapped);
1802 		tdx_unpin(kvm, page);
1803 		return 0;
1804 	}
1805 
1806 	if (KVM_BUG_ON(err, kvm)) {
1807 		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1808 		return -EIO;
1809 	}
1810 	return 1;
1811 }
1812 
1813 /*
1814  * Ensure shared and private EPTs to be flushed on all vCPUs.
1815  * tdh_mem_track() is the only caller that increases TD epoch. An increase in
1816  * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1817  * running in guest mode with the value "N - 1".
1818  *
1819  * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1820  * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1821  * being increased to "N + 1".
1822  *
1823  * Kicking off all vCPUs after that further results in no vCPUs can run in guest
1824  * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
1825  * to increase TD epoch to "N + 2").
1826  *
1827  * TDX module will flush EPT on the next TD enter and make vCPUs to run in
1828  * guest mode with TD epoch value "N + 1".
1829  *
1830  * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1831  * waiting empty IPI handler ack_kick().
1832  *
1833  * No action is required to the vCPUs being kicked off since the kicking off
1834  * occurs certainly after TD epoch increment and before the next
1835  * tdh_mem_track().
1836  */
1837 static void tdx_track(struct kvm *kvm)
1838 {
1839 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1840 	u64 err;
1841 
1842 	/* If TD isn't finalized, it's before any vcpu running. */
1843 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1844 		return;
1845 
1846 	lockdep_assert_held_write(&kvm->mmu_lock);
1847 
1848 	err = tdh_mem_track(&kvm_tdx->td);
1849 	if (unlikely(tdx_operand_busy(err))) {
1850 		/* After no vCPUs enter, the second retry is expected to succeed */
1851 		tdx_no_vcpus_enter_start(kvm);
1852 		err = tdh_mem_track(&kvm_tdx->td);
1853 		tdx_no_vcpus_enter_stop(kvm);
1854 	}
1855 
1856 	if (KVM_BUG_ON(err, kvm))
1857 		pr_tdx_error(TDH_MEM_TRACK, err);
1858 
1859 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1860 }
1861 
1862 int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1863 			      enum pg_level level, void *private_spt)
1864 {
1865 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1866 
1867 	/*
1868 	 * free_external_spt() is only called after hkid is freed when TD is
1869 	 * tearing down.
1870 	 * KVM doesn't (yet) zap page table pages in mirror page table while
1871 	 * TD is active, though guest pages mapped in mirror page table could be
1872 	 * zapped during TD is active, e.g. for shared <-> private conversion
1873 	 * and slot move/deletion.
1874 	 */
1875 	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1876 		return -EINVAL;
1877 
1878 	/*
1879 	 * The HKID assigned to this TD was already freed and cache was
1880 	 * already flushed. We don't have to flush again.
1881 	 */
1882 	return tdx_reclaim_page(virt_to_page(private_spt));
1883 }
1884 
1885 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1886 				 enum pg_level level, kvm_pfn_t pfn)
1887 {
1888 	struct page *page = pfn_to_page(pfn);
1889 	int ret;
1890 
1891 	/*
1892 	 * HKID is released after all private pages have been removed, and set
1893 	 * before any might be populated. Warn if zapping is attempted when
1894 	 * there can't be anything populated in the private EPT.
1895 	 */
1896 	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1897 		return -EINVAL;
1898 
1899 	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1900 	if (ret <= 0)
1901 		return ret;
1902 
1903 	/*
1904 	 * TDX requires TLB tracking before dropping private page.  Do
1905 	 * it here, although it is also done later.
1906 	 */
1907 	tdx_track(kvm);
1908 
1909 	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
1910 }
1911 
1912 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1913 			   int trig_mode, int vector)
1914 {
1915 	struct kvm_vcpu *vcpu = apic->vcpu;
1916 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1917 
1918 	/* TDX supports only posted interrupt.  No lapic emulation. */
1919 	__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1920 
1921 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1922 }
1923 
1924 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1925 {
1926 	u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1927 	u64 eq = vmx_get_exit_qual(vcpu);
1928 
1929 	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1930 		return false;
1931 
1932 	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1933 }
1934 
1935 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1936 {
1937 	unsigned long exit_qual;
1938 	gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1939 	bool local_retry = false;
1940 	int ret;
1941 
1942 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1943 		if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1944 			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1945 				gpa, vcpu->vcpu_id);
1946 			kvm_vm_dead(vcpu->kvm);
1947 			return -EIO;
1948 		}
1949 		/*
1950 		 * Always treat SEPT violations as write faults.  Ignore the
1951 		 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1952 		 * TD private pages are always RWX in the SEPT tables,
1953 		 * i.e. they're always mapped writable.  Just as importantly,
1954 		 * treating SEPT violations as write faults is necessary to
1955 		 * avoid COW allocations, which will cause TDAUGPAGE failures
1956 		 * due to aliasing a single HPA to multiple GPAs.
1957 		 */
1958 		exit_qual = EPT_VIOLATION_ACC_WRITE;
1959 
1960 		/* Only private GPA triggers zero-step mitigation */
1961 		local_retry = true;
1962 	} else {
1963 		exit_qual = vmx_get_exit_qual(vcpu);
1964 		/*
1965 		 * EPT violation due to instruction fetch should never be
1966 		 * triggered from shared memory in TDX guest.  If such EPT
1967 		 * violation occurs, treat it as broken hardware.
1968 		 */
1969 		if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1970 			return -EIO;
1971 	}
1972 
1973 	trace_kvm_page_fault(vcpu, gpa, exit_qual);
1974 
1975 	/*
1976 	 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1977 	 * mapping in TDX.
1978 	 *
1979 	 * KVM may return RET_PF_RETRY for private GPA due to
1980 	 * - contentions when atomically updating SPTEs of the mirror page table
1981 	 * - in-progress GFN invalidation or memslot removal.
1982 	 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1983 	 *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1984 	 *   or certain TDCALLs.
1985 	 *
1986 	 * If TDH.VP.ENTER is invoked more times than the threshold set by the
1987 	 * TDX module before KVM resolves the private GPA mapping, the TDX
1988 	 * module will activate zero-step mitigation during TDH.VP.ENTER. This
1989 	 * process acquires an SEPT tree lock in the TDX module, leading to
1990 	 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1991 	 * operations on other vCPUs.
1992 	 *
1993 	 * Breaking out of local retries for kvm_vcpu_has_events() is for
1994 	 * interrupt injection. kvm_vcpu_has_events() should not see pending
1995 	 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1996 	 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
1997 	 * the guest even if the IRQ/NMI can't be delivered.
1998 	 *
1999 	 * Note: even without breaking out of local retries, zero-step
2000 	 * mitigation may still occur due to
2001 	 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
2002 	 * - a single RIP causing EPT violations for more GFNs than the
2003 	 *   threshold count.
2004 	 * This is safe, as triggering zero-step mitigation only introduces
2005 	 * contentions to page installation SEAMCALLs on other vCPUs, which will
2006 	 * handle retries locally in their EPT violation handlers.
2007 	 */
2008 	while (1) {
2009 		ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
2010 
2011 		if (ret != RET_PF_RETRY || !local_retry)
2012 			break;
2013 
2014 		if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
2015 			break;
2016 
2017 		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
2018 			ret = -EIO;
2019 			break;
2020 		}
2021 
2022 		cond_resched();
2023 	}
2024 	return ret;
2025 }
2026 
2027 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2028 {
2029 	if (err) {
2030 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2031 		return 1;
2032 	}
2033 
2034 	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2035 		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2036 
2037 	return 1;
2038 }
2039 
2040 
2041 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2042 {
2043 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2044 	u64 vp_enter_ret = tdx->vp_enter_ret;
2045 	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2046 
2047 	if (fastpath != EXIT_FASTPATH_NONE)
2048 		return 1;
2049 
2050 	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2051 		KVM_BUG_ON(1, vcpu->kvm);
2052 		return -EIO;
2053 	}
2054 
2055 	/*
2056 	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2057 	 * TDX_SEAMCALL_VMFAILINVALID.
2058 	 */
2059 	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2060 		KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2061 		goto unhandled_exit;
2062 	}
2063 
2064 	if (unlikely(tdx_failed_vmentry(vcpu))) {
2065 		/*
2066 		 * If the guest state is protected, that means off-TD debug is
2067 		 * not enabled, TDX_NON_RECOVERABLE must be set.
2068 		 */
2069 		WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2070 				!(vp_enter_ret & TDX_NON_RECOVERABLE));
2071 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2072 		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2073 		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2074 		return 0;
2075 	}
2076 
2077 	if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2078 		exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2079 		kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2080 		goto unhandled_exit;
2081 	}
2082 
2083 	WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2084 		     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2085 
2086 	switch (exit_reason.basic) {
2087 	case EXIT_REASON_TRIPLE_FAULT:
2088 		vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2089 		vcpu->mmio_needed = 0;
2090 		return 0;
2091 	case EXIT_REASON_EXCEPTION_NMI:
2092 		return tdx_handle_exception_nmi(vcpu);
2093 	case EXIT_REASON_EXTERNAL_INTERRUPT:
2094 		++vcpu->stat.irq_exits;
2095 		return 1;
2096 	case EXIT_REASON_CPUID:
2097 		return tdx_emulate_cpuid(vcpu);
2098 	case EXIT_REASON_HLT:
2099 		return kvm_emulate_halt_noskip(vcpu);
2100 	case EXIT_REASON_TDCALL:
2101 		return handle_tdvmcall(vcpu);
2102 	case EXIT_REASON_VMCALL:
2103 		return tdx_emulate_vmcall(vcpu);
2104 	case EXIT_REASON_IO_INSTRUCTION:
2105 		return tdx_emulate_io(vcpu);
2106 	case EXIT_REASON_MSR_READ:
2107 		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2108 		return kvm_emulate_rdmsr(vcpu);
2109 	case EXIT_REASON_MSR_WRITE:
2110 		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2111 		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2112 		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2113 		return kvm_emulate_wrmsr(vcpu);
2114 	case EXIT_REASON_EPT_MISCONFIG:
2115 		return tdx_emulate_mmio(vcpu);
2116 	case EXIT_REASON_EPT_VIOLATION:
2117 		return tdx_handle_ept_violation(vcpu);
2118 	case EXIT_REASON_OTHER_SMI:
2119 		/*
2120 		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
2121 		 * TD guest vCPU is running) will cause VM exit to TDX module,
2122 		 * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
2123 		 * and handled by kernel handler right away.
2124 		 *
2125 		 * The Other SMI exit can also be caused by the SEAM non-root
2126 		 * machine check delivered via Machine Check System Management
2127 		 * Interrupt (MSMI), but it has already been handled by the
2128 		 * kernel machine check handler, i.e., the memory page has been
2129 		 * marked as poisoned and it won't be freed to the free list
2130 		 * when the TDX guest is terminated (the TDX module marks the
2131 		 * guest as dead and prevent it from further running when
2132 		 * machine check happens in SEAM non-root).
2133 		 *
2134 		 * - A MSMI will not reach here, it's handled as non_recoverable
2135 		 *   case above.
2136 		 * - If it's not an MSMI, no need to do anything here.
2137 		 */
2138 		return 1;
2139 	default:
2140 		break;
2141 	}
2142 
2143 unhandled_exit:
2144 	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2145 	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2146 	vcpu->run->internal.ndata = 2;
2147 	vcpu->run->internal.data[0] = vp_enter_ret;
2148 	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2149 	return 0;
2150 }
2151 
2152 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2153 		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2154 {
2155 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2156 
2157 	*reason = tdx->vt.exit_reason.full;
2158 	if (*reason != -1u) {
2159 		*info1 = vmx_get_exit_qual(vcpu);
2160 		*info2 = tdx->ext_exit_qualification;
2161 		*intr_info = vmx_get_intr_info(vcpu);
2162 	} else {
2163 		*info1 = 0;
2164 		*info2 = 0;
2165 		*intr_info = 0;
2166 	}
2167 
2168 	*error_code = 0;
2169 }
2170 
2171 bool tdx_has_emulated_msr(u32 index)
2172 {
2173 	switch (index) {
2174 	case MSR_IA32_UCODE_REV:
2175 	case MSR_IA32_ARCH_CAPABILITIES:
2176 	case MSR_IA32_POWER_CTL:
2177 	case MSR_IA32_CR_PAT:
2178 	case MSR_MTRRcap:
2179 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2180 	case MSR_MTRRdefType:
2181 	case MSR_IA32_TSC_DEADLINE:
2182 	case MSR_IA32_MISC_ENABLE:
2183 	case MSR_PLATFORM_INFO:
2184 	case MSR_MISC_FEATURES_ENABLES:
2185 	case MSR_IA32_APICBASE:
2186 	case MSR_EFER:
2187 	case MSR_IA32_FEAT_CTL:
2188 	case MSR_IA32_MCG_CAP:
2189 	case MSR_IA32_MCG_STATUS:
2190 	case MSR_IA32_MCG_CTL:
2191 	case MSR_IA32_MCG_EXT_CTL:
2192 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2193 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2194 		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2195 	case MSR_KVM_POLL_CONTROL:
2196 		return true;
2197 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2198 		/*
2199 		 * x2APIC registers that are virtualized by the CPU can't be
2200 		 * emulated, KVM doesn't have access to the virtual APIC page.
2201 		 */
2202 		switch (index) {
2203 		case X2APIC_MSR(APIC_TASKPRI):
2204 		case X2APIC_MSR(APIC_PROCPRI):
2205 		case X2APIC_MSR(APIC_EOI):
2206 		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2207 		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2208 		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2209 			return false;
2210 		default:
2211 			return true;
2212 		}
2213 	default:
2214 		return false;
2215 	}
2216 }
2217 
2218 static bool tdx_is_read_only_msr(u32 index)
2219 {
2220 	return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
2221 		index == MSR_IA32_FEAT_CTL;
2222 }
2223 
2224 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2225 {
2226 	switch (msr->index) {
2227 	case MSR_IA32_FEAT_CTL:
2228 		/*
2229 		 * MCE and MCA are advertised via cpuid. Guest kernel could
2230 		 * check if LMCE is enabled or not.
2231 		 */
2232 		msr->data = FEAT_CTL_LOCKED;
2233 		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2234 			msr->data |= FEAT_CTL_LMCE_ENABLED;
2235 		return 0;
2236 	case MSR_IA32_MCG_EXT_CTL:
2237 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2238 			return 1;
2239 		msr->data = vcpu->arch.mcg_ext_ctl;
2240 		return 0;
2241 	default:
2242 		if (!tdx_has_emulated_msr(msr->index))
2243 			return 1;
2244 
2245 		return kvm_get_msr_common(vcpu, msr);
2246 	}
2247 }
2248 
2249 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2250 {
2251 	switch (msr->index) {
2252 	case MSR_IA32_MCG_EXT_CTL:
2253 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2254 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2255 			return 1;
2256 		vcpu->arch.mcg_ext_ctl = msr->data;
2257 		return 0;
2258 	default:
2259 		if (tdx_is_read_only_msr(msr->index))
2260 			return 1;
2261 
2262 		if (!tdx_has_emulated_msr(msr->index))
2263 			return 1;
2264 
2265 		return kvm_set_msr_common(vcpu, msr);
2266 	}
2267 }
2268 
2269 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2270 {
2271 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2272 	struct kvm_tdx_capabilities __user *user_caps;
2273 	struct kvm_tdx_capabilities *caps = NULL;
2274 	int ret = 0;
2275 
2276 	/* flags is reserved for future use */
2277 	if (cmd->flags)
2278 		return -EINVAL;
2279 
2280 	caps = kmalloc(sizeof(*caps) +
2281 		       sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2282 		       GFP_KERNEL);
2283 	if (!caps)
2284 		return -ENOMEM;
2285 
2286 	user_caps = u64_to_user_ptr(cmd->data);
2287 	if (copy_from_user(caps, user_caps, sizeof(*caps))) {
2288 		ret = -EFAULT;
2289 		goto out;
2290 	}
2291 
2292 	if (caps->cpuid.nent < td_conf->num_cpuid_config) {
2293 		ret = -E2BIG;
2294 		goto out;
2295 	}
2296 
2297 	ret = init_kvm_tdx_caps(td_conf, caps);
2298 	if (ret)
2299 		goto out;
2300 
2301 	if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2302 		ret = -EFAULT;
2303 		goto out;
2304 	}
2305 
2306 	if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2307 			 caps->cpuid.nent *
2308 			 sizeof(caps->cpuid.entries[0])))
2309 		ret = -EFAULT;
2310 
2311 out:
2312 	/* kfree() accepts NULL. */
2313 	kfree(caps);
2314 	return ret;
2315 }
2316 
2317 /*
2318  * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2319  * similar to TDX's GPAW. Use this field as the interface for userspace to
2320  * configure the GPAW and EPT level for TDs.
2321  *
2322  * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2323  * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2324  * supported. Value 52 is only supported when the platform supports 5 level
2325  * EPT.
2326  */
2327 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2328 					struct td_params *td_params)
2329 {
2330 	const struct kvm_cpuid_entry2 *entry;
2331 	int guest_pa;
2332 
2333 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2334 	if (!entry)
2335 		return -EINVAL;
2336 
2337 	guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2338 
2339 	if (guest_pa != 48 && guest_pa != 52)
2340 		return -EINVAL;
2341 
2342 	if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2343 		return -EINVAL;
2344 
2345 	td_params->eptp_controls = VMX_EPTP_MT_WB;
2346 	if (guest_pa == 52) {
2347 		td_params->eptp_controls |= VMX_EPTP_PWL_5;
2348 		td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2349 	} else {
2350 		td_params->eptp_controls |= VMX_EPTP_PWL_4;
2351 	}
2352 
2353 	return 0;
2354 }
2355 
2356 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2357 				 struct td_params *td_params)
2358 {
2359 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2360 	const struct kvm_cpuid_entry2 *entry;
2361 	struct tdx_cpuid_value *value;
2362 	int i, copy_cnt = 0;
2363 
2364 	/*
2365 	 * td_params.cpuid_values: The number and the order of cpuid_value must
2366 	 * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
2367 	 * It's assumed that td_params was zeroed.
2368 	 */
2369 	for (i = 0; i < td_conf->num_cpuid_config; i++) {
2370 		struct kvm_cpuid_entry2 tmp;
2371 
2372 		td_init_cpuid_entry2(&tmp, i);
2373 
2374 		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2375 					      tmp.function, tmp.index);
2376 		if (!entry)
2377 			continue;
2378 
2379 		if (tdx_unsupported_cpuid(entry))
2380 			return -EINVAL;
2381 
2382 		copy_cnt++;
2383 
2384 		value = &td_params->cpuid_values[i];
2385 		value->eax = entry->eax;
2386 		value->ebx = entry->ebx;
2387 		value->ecx = entry->ecx;
2388 		value->edx = entry->edx;
2389 
2390 		/*
2391 		 * TDX module does not accept nonzero bits 16..23 for the
2392 		 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2393 		 */
2394 		if (tmp.function == 0x80000008)
2395 			value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2396 	}
2397 
2398 	/*
2399 	 * Rely on the TDX module to reject invalid configuration, but it can't
2400 	 * check of leafs that don't have a proper slot in td_params->cpuid_values
2401 	 * to stick then. So fail if there were entries that didn't get copied to
2402 	 * td_params.
2403 	 */
2404 	if (copy_cnt != cpuid->nent)
2405 		return -EINVAL;
2406 
2407 	return 0;
2408 }
2409 
2410 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2411 			struct kvm_tdx_init_vm *init_vm)
2412 {
2413 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2414 	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2415 	int ret;
2416 
2417 	if (kvm->created_vcpus)
2418 		return -EBUSY;
2419 
2420 	if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2421 		return -EINVAL;
2422 
2423 	if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2424 		return -EINVAL;
2425 
2426 	td_params->max_vcpus = kvm->max_vcpus;
2427 	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2428 	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2429 
2430 	td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2431 	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2432 
2433 	ret = setup_tdparams_eptp_controls(cpuid, td_params);
2434 	if (ret)
2435 		return ret;
2436 
2437 	ret = setup_tdparams_cpuids(cpuid, td_params);
2438 	if (ret)
2439 		return ret;
2440 
2441 #define MEMCPY_SAME_SIZE(dst, src)				\
2442 	do {							\
2443 		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
2444 		memcpy((dst), (src), sizeof(dst));		\
2445 	} while (0)
2446 
2447 	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2448 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2449 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2450 
2451 	return 0;
2452 }
2453 
2454 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2455 			 u64 *seamcall_err)
2456 {
2457 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2458 	cpumask_var_t packages;
2459 	struct page **tdcs_pages = NULL;
2460 	struct page *tdr_page;
2461 	int ret, i;
2462 	u64 err, rcx;
2463 
2464 	*seamcall_err = 0;
2465 	ret = tdx_guest_keyid_alloc();
2466 	if (ret < 0)
2467 		return ret;
2468 	kvm_tdx->hkid = ret;
2469 	kvm_tdx->misc_cg = get_current_misc_cg();
2470 	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2471 	if (ret)
2472 		goto free_hkid;
2473 
2474 	ret = -ENOMEM;
2475 
2476 	atomic_inc(&nr_configured_hkid);
2477 
2478 	tdr_page = alloc_page(GFP_KERNEL);
2479 	if (!tdr_page)
2480 		goto free_hkid;
2481 
2482 	kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2483 	/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2484 	kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2485 	tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2486 			     GFP_KERNEL | __GFP_ZERO);
2487 	if (!tdcs_pages)
2488 		goto free_tdr;
2489 
2490 	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2491 		tdcs_pages[i] = alloc_page(GFP_KERNEL);
2492 		if (!tdcs_pages[i])
2493 			goto free_tdcs;
2494 	}
2495 
2496 	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2497 		goto free_tdcs;
2498 
2499 	cpus_read_lock();
2500 
2501 	/*
2502 	 * Need at least one CPU of the package to be online in order to
2503 	 * program all packages for host key id.  Check it.
2504 	 */
2505 	for_each_present_cpu(i)
2506 		cpumask_set_cpu(topology_physical_package_id(i), packages);
2507 	for_each_online_cpu(i)
2508 		cpumask_clear_cpu(topology_physical_package_id(i), packages);
2509 	if (!cpumask_empty(packages)) {
2510 		ret = -EIO;
2511 		/*
2512 		 * Because it's hard for human operator to figure out the
2513 		 * reason, warn it.
2514 		 */
2515 #define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
2516 		pr_warn_ratelimited(MSG_ALLPKG);
2517 		goto free_packages;
2518 	}
2519 
2520 	/*
2521 	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
2522 	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
2523 	 * lock to prevent it from failure.
2524 	 */
2525 	mutex_lock(&tdx_lock);
2526 	kvm_tdx->td.tdr_page = tdr_page;
2527 	err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2528 	mutex_unlock(&tdx_lock);
2529 
2530 	if (err == TDX_RND_NO_ENTROPY) {
2531 		ret = -EAGAIN;
2532 		goto free_packages;
2533 	}
2534 
2535 	if (WARN_ON_ONCE(err)) {
2536 		pr_tdx_error(TDH_MNG_CREATE, err);
2537 		ret = -EIO;
2538 		goto free_packages;
2539 	}
2540 
2541 	for_each_online_cpu(i) {
2542 		int pkg = topology_physical_package_id(i);
2543 
2544 		if (cpumask_test_and_set_cpu(pkg, packages))
2545 			continue;
2546 
2547 		/*
2548 		 * Program the memory controller in the package with an
2549 		 * encryption key associated to a TDX private host key id
2550 		 * assigned to this TDR.  Concurrent operations on same memory
2551 		 * controller results in TDX_OPERAND_BUSY. No locking needed
2552 		 * beyond the cpus_read_lock() above as it serializes against
2553 		 * hotplug and the first online CPU of the package is always
2554 		 * used. We never have two CPUs in the same socket trying to
2555 		 * program the key.
2556 		 */
2557 		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2558 				      kvm_tdx, true);
2559 		if (ret)
2560 			break;
2561 	}
2562 	cpus_read_unlock();
2563 	free_cpumask_var(packages);
2564 	if (ret) {
2565 		i = 0;
2566 		goto teardown;
2567 	}
2568 
2569 	kvm_tdx->td.tdcs_pages = tdcs_pages;
2570 	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2571 		err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2572 		if (err == TDX_RND_NO_ENTROPY) {
2573 			/* Here it's hard to allow userspace to retry. */
2574 			ret = -EAGAIN;
2575 			goto teardown;
2576 		}
2577 		if (WARN_ON_ONCE(err)) {
2578 			pr_tdx_error(TDH_MNG_ADDCX, err);
2579 			ret = -EIO;
2580 			goto teardown;
2581 		}
2582 	}
2583 
2584 	err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2585 	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2586 		/*
2587 		 * Because a user gives operands, don't warn.
2588 		 * Return a hint to the user because it's sometimes hard for the
2589 		 * user to figure out which operand is invalid.  SEAMCALL status
2590 		 * code includes which operand caused invalid operand error.
2591 		 */
2592 		*seamcall_err = err;
2593 		ret = -EINVAL;
2594 		goto teardown;
2595 	} else if (WARN_ON_ONCE(err)) {
2596 		pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2597 		ret = -EIO;
2598 		goto teardown;
2599 	}
2600 
2601 	return 0;
2602 
2603 	/*
2604 	 * The sequence for freeing resources from a partially initialized TD
2605 	 * varies based on where in the initialization flow failure occurred.
2606 	 * Simply use the full teardown and destroy, which naturally play nice
2607 	 * with partial initialization.
2608 	 */
2609 teardown:
2610 	/* Only free pages not yet added, so start at 'i' */
2611 	for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2612 		if (tdcs_pages[i]) {
2613 			__free_page(tdcs_pages[i]);
2614 			tdcs_pages[i] = NULL;
2615 		}
2616 	}
2617 	if (!kvm_tdx->td.tdcs_pages)
2618 		kfree(tdcs_pages);
2619 
2620 	tdx_mmu_release_hkid(kvm);
2621 	tdx_reclaim_td_control_pages(kvm);
2622 
2623 	return ret;
2624 
2625 free_packages:
2626 	cpus_read_unlock();
2627 	free_cpumask_var(packages);
2628 
2629 free_tdcs:
2630 	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2631 		if (tdcs_pages[i])
2632 			__free_page(tdcs_pages[i]);
2633 	}
2634 	kfree(tdcs_pages);
2635 	kvm_tdx->td.tdcs_pages = NULL;
2636 
2637 free_tdr:
2638 	if (tdr_page)
2639 		__free_page(tdr_page);
2640 	kvm_tdx->td.tdr_page = 0;
2641 
2642 free_hkid:
2643 	tdx_hkid_free(kvm_tdx);
2644 
2645 	return ret;
2646 }
2647 
2648 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2649 				      u64 *data)
2650 {
2651 	u64 err;
2652 
2653 	err = tdh_mng_rd(&tdx->td, field_id, data);
2654 
2655 	return err;
2656 }
2657 
2658 #define TDX_MD_UNREADABLE_LEAF_MASK	GENMASK(30, 7)
2659 #define TDX_MD_UNREADABLE_SUBLEAF_MASK	GENMASK(31, 7)
2660 
2661 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2662 			  bool sub_leaf_set, int *entry_index,
2663 			  struct kvm_cpuid_entry2 *out)
2664 {
2665 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2666 	u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2667 	u64 ebx_eax, edx_ecx;
2668 	u64 err = 0;
2669 
2670 	if (sub_leaf > 0b1111111)
2671 		return -EINVAL;
2672 
2673 	if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2674 		return -EINVAL;
2675 
2676 	if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2677 	    sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2678 		return -EINVAL;
2679 
2680 	/*
2681 	 * bit 23:17, REVSERVED: reserved, must be 0;
2682 	 * bit 16,    LEAF_31: leaf number bit 31;
2683 	 * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2684 	 *                      implicitly 0;
2685 	 * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
2686 	 * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2687 	 *                         the SUBLEAF_6_0 is all-1.
2688 	 *                         sub-leaf bits 31:7 are implicitly 0;
2689 	 * bit 0,     ELEMENT_I: Element index within field;
2690 	 */
2691 	field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2692 	field_id |= (leaf & 0x7f) << 9;
2693 	if (sub_leaf_set)
2694 		field_id |= (sub_leaf & 0x7f) << 1;
2695 	else
2696 		field_id |= 0x1fe;
2697 
2698 	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2699 	if (err) //TODO check for specific errors
2700 		goto err_out;
2701 
2702 	out->eax = (u32) ebx_eax;
2703 	out->ebx = (u32) (ebx_eax >> 32);
2704 
2705 	field_id++;
2706 	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2707 	/*
2708 	 * It's weird that reading edx_ecx fails while reading ebx_eax
2709 	 * succeeded.
2710 	 */
2711 	if (WARN_ON_ONCE(err))
2712 		goto err_out;
2713 
2714 	out->ecx = (u32) edx_ecx;
2715 	out->edx = (u32) (edx_ecx >> 32);
2716 
2717 	out->function = leaf;
2718 	out->index = sub_leaf;
2719 	out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2720 
2721 	/*
2722 	 * Work around missing support on old TDX modules, fetch
2723 	 * guest maxpa from gfn_direct_bits.
2724 	 */
2725 	if (leaf == 0x80000008) {
2726 		gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2727 		unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2728 
2729 		out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2730 	}
2731 
2732 	(*entry_index)++;
2733 
2734 	return 0;
2735 
2736 err_out:
2737 	out->eax = 0;
2738 	out->ebx = 0;
2739 	out->ecx = 0;
2740 	out->edx = 0;
2741 
2742 	return -EIO;
2743 }
2744 
2745 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2746 {
2747 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2748 	struct kvm_tdx_init_vm *init_vm;
2749 	struct td_params *td_params = NULL;
2750 	int ret;
2751 
2752 	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2753 	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2754 
2755 	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2756 		return -EINVAL;
2757 
2758 	if (cmd->flags)
2759 		return -EINVAL;
2760 
2761 	init_vm = kmalloc(sizeof(*init_vm) +
2762 			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2763 			  GFP_KERNEL);
2764 	if (!init_vm)
2765 		return -ENOMEM;
2766 
2767 	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2768 		ret = -EFAULT;
2769 		goto out;
2770 	}
2771 
2772 	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2773 		ret = -E2BIG;
2774 		goto out;
2775 	}
2776 
2777 	if (copy_from_user(init_vm->cpuid.entries,
2778 			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2779 			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2780 		ret = -EFAULT;
2781 		goto out;
2782 	}
2783 
2784 	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2785 		ret = -EINVAL;
2786 		goto out;
2787 	}
2788 
2789 	if (init_vm->cpuid.padding) {
2790 		ret = -EINVAL;
2791 		goto out;
2792 	}
2793 
2794 	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2795 	if (!td_params) {
2796 		ret = -ENOMEM;
2797 		goto out;
2798 	}
2799 
2800 	ret = setup_tdparams(kvm, td_params, init_vm);
2801 	if (ret)
2802 		goto out;
2803 
2804 	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2805 	if (ret)
2806 		goto out;
2807 
2808 	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2809 	kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2810 	kvm_tdx->attributes = td_params->attributes;
2811 	kvm_tdx->xfam = td_params->xfam;
2812 
2813 	if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2814 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2815 	else
2816 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2817 
2818 	kvm_tdx->state = TD_STATE_INITIALIZED;
2819 out:
2820 	/* kfree() accepts NULL. */
2821 	kfree(init_vm);
2822 	kfree(td_params);
2823 
2824 	return ret;
2825 }
2826 
2827 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2828 {
2829 	/*
2830 	 * flush_tlb_current() is invoked when the first time for the vcpu to
2831 	 * run or when root of shared EPT is invalidated.
2832 	 * KVM only needs to flush shared EPT because the TDX module handles TLB
2833 	 * invalidation for private EPT in tdh_vp_enter();
2834 	 *
2835 	 * A single context invalidation for shared EPT can be performed here.
2836 	 * However, this single context invalidation requires the private EPTP
2837 	 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
2838 	 * private EPTP as its ASID for TLB invalidation.
2839 	 *
2840 	 * To avoid reading back private EPTP, perform a global invalidation for
2841 	 * shared EPT instead to keep this function simple.
2842 	 */
2843 	ept_sync_global();
2844 }
2845 
2846 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2847 {
2848 	/*
2849 	 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2850 	 * ensure that private EPT will be flushed on the next TD enter. No need
2851 	 * to call tdx_track() here again even when this callback is a result of
2852 	 * zapping private EPT.
2853 	 *
2854 	 * Due to the lack of the context to determine which EPT has been
2855 	 * affected by zapping, invoke invept() directly here for both shared
2856 	 * EPT and private EPT for simplicity, though it's not necessary for
2857 	 * private EPT.
2858 	 */
2859 	ept_sync_global();
2860 }
2861 
2862 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2863 {
2864 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2865 
2866 	guard(mutex)(&kvm->slots_lock);
2867 
2868 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2869 		return -EINVAL;
2870 	/*
2871 	 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2872 	 * TDH.MEM.PAGE.ADD().
2873 	 */
2874 	if (atomic64_read(&kvm_tdx->nr_premapped))
2875 		return -EINVAL;
2876 
2877 	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2878 	if (tdx_operand_busy(cmd->hw_error))
2879 		return -EBUSY;
2880 	if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2881 		pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2882 		return -EIO;
2883 	}
2884 
2885 	kvm_tdx->state = TD_STATE_RUNNABLE;
2886 	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2887 	smp_wmb();
2888 	kvm->arch.pre_fault_allowed = true;
2889 	return 0;
2890 }
2891 
2892 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2893 {
2894 	struct kvm_tdx_cmd tdx_cmd;
2895 	int r;
2896 
2897 	if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2898 		return -EFAULT;
2899 
2900 	/*
2901 	 * Userspace should never set hw_error. It is used to fill
2902 	 * hardware-defined error by the kernel.
2903 	 */
2904 	if (tdx_cmd.hw_error)
2905 		return -EINVAL;
2906 
2907 	mutex_lock(&kvm->lock);
2908 
2909 	switch (tdx_cmd.id) {
2910 	case KVM_TDX_CAPABILITIES:
2911 		r = tdx_get_capabilities(&tdx_cmd);
2912 		break;
2913 	case KVM_TDX_INIT_VM:
2914 		r = tdx_td_init(kvm, &tdx_cmd);
2915 		break;
2916 	case KVM_TDX_FINALIZE_VM:
2917 		r = tdx_td_finalize(kvm, &tdx_cmd);
2918 		break;
2919 	default:
2920 		r = -EINVAL;
2921 		goto out;
2922 	}
2923 
2924 	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2925 		r = -EFAULT;
2926 
2927 out:
2928 	mutex_unlock(&kvm->lock);
2929 	return r;
2930 }
2931 
2932 /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
2933 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2934 {
2935 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2936 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2937 	struct page *page;
2938 	int ret, i;
2939 	u64 err;
2940 
2941 	page = alloc_page(GFP_KERNEL);
2942 	if (!page)
2943 		return -ENOMEM;
2944 	tdx->vp.tdvpr_page = page;
2945 
2946 	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2947 			       	     GFP_KERNEL);
2948 	if (!tdx->vp.tdcx_pages) {
2949 		ret = -ENOMEM;
2950 		goto free_tdvpr;
2951 	}
2952 
2953 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2954 		page = alloc_page(GFP_KERNEL);
2955 		if (!page) {
2956 			ret = -ENOMEM;
2957 			goto free_tdcx;
2958 		}
2959 		tdx->vp.tdcx_pages[i] = page;
2960 	}
2961 
2962 	err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2963 	if (KVM_BUG_ON(err, vcpu->kvm)) {
2964 		ret = -EIO;
2965 		pr_tdx_error(TDH_VP_CREATE, err);
2966 		goto free_tdcx;
2967 	}
2968 
2969 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2970 		err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2971 		if (KVM_BUG_ON(err, vcpu->kvm)) {
2972 			pr_tdx_error(TDH_VP_ADDCX, err);
2973 			/*
2974 			 * Pages already added are reclaimed by the vcpu_free
2975 			 * method, but the rest are freed here.
2976 			 */
2977 			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2978 				__free_page(tdx->vp.tdcx_pages[i]);
2979 				tdx->vp.tdcx_pages[i] = NULL;
2980 			}
2981 			return -EIO;
2982 		}
2983 	}
2984 
2985 	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2986 	if (KVM_BUG_ON(err, vcpu->kvm)) {
2987 		pr_tdx_error(TDH_VP_INIT, err);
2988 		return -EIO;
2989 	}
2990 
2991 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2992 
2993 	return 0;
2994 
2995 free_tdcx:
2996 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2997 		if (tdx->vp.tdcx_pages[i])
2998 			__free_page(tdx->vp.tdcx_pages[i]);
2999 		tdx->vp.tdcx_pages[i] = NULL;
3000 	}
3001 	kfree(tdx->vp.tdcx_pages);
3002 	tdx->vp.tdcx_pages = NULL;
3003 
3004 free_tdvpr:
3005 	if (tdx->vp.tdvpr_page)
3006 		__free_page(tdx->vp.tdvpr_page);
3007 	tdx->vp.tdvpr_page = 0;
3008 
3009 	return ret;
3010 }
3011 
3012 /* Sometimes reads multipple subleafs. Return how many enties were written. */
3013 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3014 				   struct kvm_cpuid_entry2 *output_e)
3015 {
3016 	int sub_leaf = 0;
3017 	int ret;
3018 
3019 	/* First try without a subleaf */
3020 	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3021 
3022 	/* If success, or invalid leaf, just give up */
3023 	if (ret != -EIO)
3024 		return ret;
3025 
3026 	/*
3027 	 * If the try without a subleaf failed, try reading subleafs until
3028 	 * failure. The TDX module only supports 6 bits of subleaf index.
3029 	 */
3030 	while (1) {
3031 		/* Keep reading subleafs until there is a failure. */
3032 		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3033 			return !sub_leaf;
3034 
3035 		sub_leaf++;
3036 		output_e++;
3037 	}
3038 
3039 	return 0;
3040 }
3041 
3042 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3043 {
3044 	struct kvm_cpuid2 __user *output, *td_cpuid;
3045 	int r = 0, i = 0, leaf;
3046 	u32 level;
3047 
3048 	output = u64_to_user_ptr(cmd->data);
3049 	td_cpuid = kzalloc(sizeof(*td_cpuid) +
3050 			sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3051 			GFP_KERNEL);
3052 	if (!td_cpuid)
3053 		return -ENOMEM;
3054 
3055 	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3056 		r = -EFAULT;
3057 		goto out;
3058 	}
3059 
3060 	/* Read max CPUID for normal range */
3061 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3062 		r = -EIO;
3063 		goto out;
3064 	}
3065 	level = td_cpuid->entries[0].eax;
3066 
3067 	for (leaf = 1; leaf <= level; leaf++)
3068 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3069 
3070 	/* Read max CPUID for extended range */
3071 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3072 		r = -EIO;
3073 		goto out;
3074 	}
3075 	level = td_cpuid->entries[i - 1].eax;
3076 
3077 	for (leaf = 0x80000001; leaf <= level; leaf++)
3078 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3079 
3080 	if (td_cpuid->nent < i)
3081 		r = -E2BIG;
3082 	td_cpuid->nent = i;
3083 
3084 	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3085 		r = -EFAULT;
3086 		goto out;
3087 	}
3088 
3089 	if (r == -E2BIG)
3090 		goto out;
3091 
3092 	if (copy_to_user(output->entries, td_cpuid->entries,
3093 			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3094 		r = -EFAULT;
3095 
3096 out:
3097 	kfree(td_cpuid);
3098 
3099 	return r;
3100 }
3101 
3102 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3103 {
3104 	u64 apic_base;
3105 	struct vcpu_tdx *tdx = to_tdx(vcpu);
3106 	int ret;
3107 
3108 	if (cmd->flags)
3109 		return -EINVAL;
3110 
3111 	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3112 		return -EINVAL;
3113 
3114 	/*
3115 	 * TDX requires X2APIC, userspace is responsible for configuring guest
3116 	 * CPUID accordingly.
3117 	 */
3118 	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3119 		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3120 	if (kvm_apic_set_base(vcpu, apic_base, true))
3121 		return -EINVAL;
3122 
3123 	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3124 	if (ret)
3125 		return ret;
3126 
3127 	td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3128 	td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3129 	td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3130 
3131 	tdx->state = VCPU_TD_STATE_INITIALIZED;
3132 
3133 	return 0;
3134 }
3135 
3136 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3137 {
3138 	/*
3139 	 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3140 	 * INIT events.
3141 	 *
3142 	 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3143 	 * userspace needs to define the vCPU model before KVM can initialize
3144 	 * vCPU state, e.g. to enable x2APIC.
3145 	 */
3146 	WARN_ON_ONCE(init_event);
3147 }
3148 
3149 struct tdx_gmem_post_populate_arg {
3150 	struct kvm_vcpu *vcpu;
3151 	__u32 flags;
3152 };
3153 
3154 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3155 				  void __user *src, int order, void *_arg)
3156 {
3157 	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3158 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3159 	struct tdx_gmem_post_populate_arg *arg = _arg;
3160 	struct kvm_vcpu *vcpu = arg->vcpu;
3161 	gpa_t gpa = gfn_to_gpa(gfn);
3162 	u8 level = PG_LEVEL_4K;
3163 	struct page *src_page;
3164 	int ret, i;
3165 	u64 err, entry, level_state;
3166 
3167 	/*
3168 	 * Get the source page if it has been faulted in. Return failure if the
3169 	 * source page has been swapped out or unmapped in primary memory.
3170 	 */
3171 	ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3172 	if (ret < 0)
3173 		return ret;
3174 	if (ret != 1)
3175 		return -ENOMEM;
3176 
3177 	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3178 	if (ret < 0)
3179 		goto out;
3180 
3181 	/*
3182 	 * The private mem cannot be zapped after kvm_tdp_map_page()
3183 	 * because all paths are covered by slots_lock and the
3184 	 * filemap invalidate lock.  Check that they are indeed enough.
3185 	 */
3186 	if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3187 		scoped_guard(read_lock, &kvm->mmu_lock) {
3188 			if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3189 				ret = -EIO;
3190 				goto out;
3191 			}
3192 		}
3193 	}
3194 
3195 	ret = 0;
3196 	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3197 			       src_page, &entry, &level_state);
3198 	if (err) {
3199 		ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3200 		goto out;
3201 	}
3202 
3203 	if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3204 		atomic64_dec(&kvm_tdx->nr_premapped);
3205 
3206 	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
3207 		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3208 			err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3209 					    &level_state);
3210 			if (err) {
3211 				ret = -EIO;
3212 				break;
3213 			}
3214 		}
3215 	}
3216 
3217 out:
3218 	put_page(src_page);
3219 	return ret;
3220 }
3221 
3222 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3223 {
3224 	struct vcpu_tdx *tdx = to_tdx(vcpu);
3225 	struct kvm *kvm = vcpu->kvm;
3226 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3227 	struct kvm_tdx_init_mem_region region;
3228 	struct tdx_gmem_post_populate_arg arg;
3229 	long gmem_ret;
3230 	int ret;
3231 
3232 	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3233 		return -EINVAL;
3234 
3235 	guard(mutex)(&kvm->slots_lock);
3236 
3237 	/* Once TD is finalized, the initial guest memory is fixed. */
3238 	if (kvm_tdx->state == TD_STATE_RUNNABLE)
3239 		return -EINVAL;
3240 
3241 	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3242 		return -EINVAL;
3243 
3244 	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3245 		return -EFAULT;
3246 
3247 	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3248 	    !region.nr_pages ||
3249 	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3250 	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3251 	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3252 		return -EINVAL;
3253 
3254 	kvm_mmu_reload(vcpu);
3255 	ret = 0;
3256 	while (region.nr_pages) {
3257 		if (signal_pending(current)) {
3258 			ret = -EINTR;
3259 			break;
3260 		}
3261 
3262 		arg = (struct tdx_gmem_post_populate_arg) {
3263 			.vcpu = vcpu,
3264 			.flags = cmd->flags,
3265 		};
3266 		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3267 					     u64_to_user_ptr(region.source_addr),
3268 					     1, tdx_gmem_post_populate, &arg);
3269 		if (gmem_ret < 0) {
3270 			ret = gmem_ret;
3271 			break;
3272 		}
3273 
3274 		if (gmem_ret != 1) {
3275 			ret = -EIO;
3276 			break;
3277 		}
3278 
3279 		region.source_addr += PAGE_SIZE;
3280 		region.gpa += PAGE_SIZE;
3281 		region.nr_pages--;
3282 
3283 		cond_resched();
3284 	}
3285 
3286 	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3287 		ret = -EFAULT;
3288 	return ret;
3289 }
3290 
3291 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3292 {
3293 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3294 	struct kvm_tdx_cmd cmd;
3295 	int ret;
3296 
3297 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3298 		return -EINVAL;
3299 
3300 	if (copy_from_user(&cmd, argp, sizeof(cmd)))
3301 		return -EFAULT;
3302 
3303 	if (cmd.hw_error)
3304 		return -EINVAL;
3305 
3306 	switch (cmd.id) {
3307 	case KVM_TDX_INIT_VCPU:
3308 		ret = tdx_vcpu_init(vcpu, &cmd);
3309 		break;
3310 	case KVM_TDX_INIT_MEM_REGION:
3311 		ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3312 		break;
3313 	case KVM_TDX_GET_CPUID:
3314 		ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3315 		break;
3316 	default:
3317 		ret = -EINVAL;
3318 		break;
3319 	}
3320 
3321 	return ret;
3322 }
3323 
3324 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
3325 {
3326 	return PG_LEVEL_4K;
3327 }
3328 
3329 static int tdx_online_cpu(unsigned int cpu)
3330 {
3331 	unsigned long flags;
3332 	int r;
3333 
3334 	/* Sanity check CPU is already in post-VMXON */
3335 	WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3336 
3337 	local_irq_save(flags);
3338 	r = tdx_cpu_enable();
3339 	local_irq_restore(flags);
3340 
3341 	return r;
3342 }
3343 
3344 static int tdx_offline_cpu(unsigned int cpu)
3345 {
3346 	int i;
3347 
3348 	/* No TD is running.  Allow any cpu to be offline. */
3349 	if (!atomic_read(&nr_configured_hkid))
3350 		return 0;
3351 
3352 	/*
3353 	 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
3354 	 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
3355 	 * controller with pconfig.  If we have active TDX HKID, refuse to
3356 	 * offline the last online cpu.
3357 	 */
3358 	for_each_online_cpu(i) {
3359 		/*
3360 		 * Found another online cpu on the same package.
3361 		 * Allow to offline.
3362 		 */
3363 		if (i != cpu && topology_physical_package_id(i) ==
3364 				topology_physical_package_id(cpu))
3365 			return 0;
3366 	}
3367 
3368 	/*
3369 	 * This is the last cpu of this package.  Don't offline it.
3370 	 *
3371 	 * Because it's hard for human operator to understand the
3372 	 * reason, warn it.
3373 	 */
3374 #define MSG_ALLPKG_ONLINE \
3375 	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3376 	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3377 	return -EBUSY;
3378 }
3379 
3380 static void __do_tdx_cleanup(void)
3381 {
3382 	/*
3383 	 * Once TDX module is initialized, it cannot be disabled and
3384 	 * re-initialized again w/o runtime update (which isn't
3385 	 * supported by kernel).  Only need to remove the cpuhp here.
3386 	 * The TDX host core code tracks TDX status and can handle
3387 	 * 'multiple enabling' scenario.
3388 	 */
3389 	WARN_ON_ONCE(!tdx_cpuhp_state);
3390 	cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3391 	tdx_cpuhp_state = 0;
3392 }
3393 
3394 static void __tdx_cleanup(void)
3395 {
3396 	cpus_read_lock();
3397 	__do_tdx_cleanup();
3398 	cpus_read_unlock();
3399 }
3400 
3401 static int __init __do_tdx_bringup(void)
3402 {
3403 	int r;
3404 
3405 	/*
3406 	 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3407 	 * online CPUs before calling tdx_enable(), and on any new
3408 	 * going-online CPU to make sure it is ready for TDX guest.
3409 	 */
3410 	r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3411 					 "kvm/cpu/tdx:online",
3412 					 tdx_online_cpu, tdx_offline_cpu);
3413 	if (r < 0)
3414 		return r;
3415 
3416 	tdx_cpuhp_state = r;
3417 
3418 	r = tdx_enable();
3419 	if (r)
3420 		__do_tdx_cleanup();
3421 
3422 	return r;
3423 }
3424 
3425 static int __init __tdx_bringup(void)
3426 {
3427 	const struct tdx_sys_info_td_conf *td_conf;
3428 	int r, i;
3429 
3430 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3431 		/*
3432 		 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3433 		 * before returning to user space.
3434 		 *
3435 		 * this_cpu_ptr(user_return_msrs)->registered isn't checked
3436 		 * because the registration is done at vcpu runtime by
3437 		 * tdx_user_return_msr_update_cache().
3438 		 */
3439 		tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3440 		if (tdx_uret_msrs[i].slot == -1) {
3441 			/* If any MSR isn't supported, it is a KVM bug */
3442 			pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3443 				tdx_uret_msrs[i].msr);
3444 			return -EIO;
3445 		}
3446 	}
3447 
3448 	/*
3449 	 * Enabling TDX requires enabling hardware virtualization first,
3450 	 * as making SEAMCALLs requires CPU being in post-VMXON state.
3451 	 */
3452 	r = kvm_enable_virtualization();
3453 	if (r)
3454 		return r;
3455 
3456 	cpus_read_lock();
3457 	r = __do_tdx_bringup();
3458 	cpus_read_unlock();
3459 
3460 	if (r)
3461 		goto tdx_bringup_err;
3462 
3463 	/* Get TDX global information for later use */
3464 	tdx_sysinfo = tdx_get_sysinfo();
3465 	if (WARN_ON_ONCE(!tdx_sysinfo)) {
3466 		r = -EINVAL;
3467 		goto get_sysinfo_err;
3468 	}
3469 
3470 	/* Check TDX module and KVM capabilities */
3471 	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3472 	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3473 		goto get_sysinfo_err;
3474 
3475 	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3476 		goto get_sysinfo_err;
3477 
3478 	/*
3479 	 * TDX has its own limit of maximum vCPUs it can support for all
3480 	 * TDX guests in addition to KVM_MAX_VCPUS.  Userspace needs to
3481 	 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3482 	 * extension on per-VM basis.
3483 	 *
3484 	 * TDX module reports such limit via the MAX_VCPU_PER_TD global
3485 	 * metadata.  Different modules may report different values.
3486 	 * Some old module may also not support this metadata (in which
3487 	 * case this limit is U16_MAX).
3488 	 *
3489 	 * In practice, the reported value reflects the maximum logical
3490 	 * CPUs that ALL the platforms that the module supports can
3491 	 * possibly have.
3492 	 *
3493 	 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3494 	 * result in an unpredictable ABI.  KVM instead always advertise
3495 	 * the number of logical CPUs the platform has as the maximum
3496 	 * vCPUs for TDX guests.
3497 	 *
3498 	 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3499 	 * smaller than the number of logical CPUs, otherwise KVM will
3500 	 * report an unsupported value to userspace.
3501 	 *
3502 	 * Note, a platform with TDX enabled in the BIOS cannot support
3503 	 * physical CPU hotplug, and TDX requires the BIOS has marked
3504 	 * all logical CPUs in MADT table as enabled.  Just use
3505 	 * num_present_cpus() for the number of logical CPUs.
3506 	 */
3507 	td_conf = &tdx_sysinfo->td_conf;
3508 	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3509 		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3510 				td_conf->max_vcpus_per_td, num_present_cpus());
3511 		r = -EINVAL;
3512 		goto get_sysinfo_err;
3513 	}
3514 
3515 	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
3516 		r = -EINVAL;
3517 		goto get_sysinfo_err;
3518 	}
3519 
3520 	/*
3521 	 * Leave hardware virtualization enabled after TDX is enabled
3522 	 * successfully.  TDX CPU hotplug depends on this.
3523 	 */
3524 	return 0;
3525 
3526 get_sysinfo_err:
3527 	__tdx_cleanup();
3528 tdx_bringup_err:
3529 	kvm_disable_virtualization();
3530 	return r;
3531 }
3532 
3533 void tdx_cleanup(void)
3534 {
3535 	if (enable_tdx) {
3536 		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3537 		__tdx_cleanup();
3538 		kvm_disable_virtualization();
3539 	}
3540 }
3541 
3542 int __init tdx_bringup(void)
3543 {
3544 	int r, i;
3545 
3546 	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3547 	for_each_possible_cpu(i)
3548 		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3549 
3550 	if (!enable_tdx)
3551 		return 0;
3552 
3553 	if (!enable_ept) {
3554 		pr_err("EPT is required for TDX\n");
3555 		goto success_disable_tdx;
3556 	}
3557 
3558 	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3559 		pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3560 		goto success_disable_tdx;
3561 	}
3562 
3563 	if (!enable_apicv) {
3564 		pr_err("APICv is required for TDX\n");
3565 		goto success_disable_tdx;
3566 	}
3567 
3568 	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3569 		pr_err("tdx: OSXSAVE is required for TDX\n");
3570 		goto success_disable_tdx;
3571 	}
3572 
3573 	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
3574 		pr_err("tdx: MOVDIR64B is required for TDX\n");
3575 		goto success_disable_tdx;
3576 	}
3577 
3578 	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
3579 		pr_err("Self-snoop is required for TDX\n");
3580 		goto success_disable_tdx;
3581 	}
3582 
3583 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3584 		pr_err("tdx: no TDX private KeyIDs available\n");
3585 		goto success_disable_tdx;
3586 	}
3587 
3588 	if (!enable_virt_at_load) {
3589 		pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
3590 		goto success_disable_tdx;
3591 	}
3592 
3593 	/*
3594 	 * Ideally KVM should probe whether TDX module has been loaded
3595 	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
3596 	 * to probe whether the module is loaded (there is no CPUID or MSR
3597 	 * for that), and making SEAMCALL requires enabling virtualization
3598 	 * first, just like the rest steps of bringing up TDX module.
3599 	 *
3600 	 * So, for simplicity do everything in __tdx_bringup(); the first
3601 	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
3602 	 * only complication is having to make sure that initialization
3603 	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3604 	 * cases.
3605 	 */
3606 	r = __tdx_bringup();
3607 	if (r) {
3608 		/*
3609 		 * Disable TDX only but don't fail to load module if
3610 		 * the TDX module could not be loaded.  No need to print
3611 		 * message saying "module is not loaded" because it was
3612 		 * printed when the first SEAMCALL failed.
3613 		 */
3614 		if (r == -ENODEV)
3615 			goto success_disable_tdx;
3616 
3617 		enable_tdx = 0;
3618 	}
3619 
3620 	return r;
3621 
3622 success_disable_tdx:
3623 	enable_tdx = 0;
3624 	return 0;
3625 }
3626