xref: /linux/arch/x86/kvm/vmx/tdx.c (revision e669e322c52c49c161e46492963e64319fbb53a8)
1fcdbdf63SKai Huang // SPDX-License-Identifier: GPL-2.0
2c846b451SIsaku Yamahata #include <linux/cleanup.h>
3fcdbdf63SKai Huang #include <linux/cpu.h>
4fcdbdf63SKai Huang #include <asm/cpufeature.h>
56bfa6d85SIsaku Yamahata #include <asm/fpu/xcr.h>
67c035beaSZhiming Hu #include <linux/misc_cgroup.h>
781bf40d5SIsaku Yamahata #include <linux/mmu_context.h>
8fcdbdf63SKai Huang #include <asm/tdx.h>
9fcdbdf63SKai Huang #include "capabilities.h"
10488808e6SXiaoyao Li #include "mmu.h"
11b2aaf38cSIsaku Yamahata #include "x86_ops.h"
129002f8cfSIsaku Yamahata #include "lapic.h"
13fcdbdf63SKai Huang #include "tdx.h"
1422836e1dSIsaku Yamahata #include "vmx.h"
157d10ffb1SIsaku Yamahata #include "mmu/spte.h"
16c846b451SIsaku Yamahata #include "common.h"
1781bf40d5SIsaku Yamahata #include "posted_intr.h"
18209afc0cSBinbin Wu #include "irq.h"
1981bf912bSIsaku Yamahata #include <trace/events/kvm.h>
2081bf912bSIsaku Yamahata #include "trace.h"
21fcdbdf63SKai Huang 
2209b3d3c1SIsaku Yamahata #pragma GCC poison to_vmx
2309b3d3c1SIsaku Yamahata 
24fcdbdf63SKai Huang #undef pr_fmt
25fcdbdf63SKai Huang #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26fcdbdf63SKai Huang 
27e4aa6f69SIsaku Yamahata #define pr_tdx_error(__fn, __err)	\
28e4aa6f69SIsaku Yamahata 	pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29e4aa6f69SIsaku Yamahata 
30e4aa6f69SIsaku Yamahata #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...)		\
31e4aa6f69SIsaku Yamahata 	pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt,  __err,  __VA_ARGS__)
32e4aa6f69SIsaku Yamahata 
33e4aa6f69SIsaku Yamahata #define pr_tdx_error_1(__fn, __err, __rcx)		\
34e4aa6f69SIsaku Yamahata 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35e4aa6f69SIsaku Yamahata 
36e4aa6f69SIsaku Yamahata #define pr_tdx_error_2(__fn, __err, __rcx, __rdx)	\
37e4aa6f69SIsaku Yamahata 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38e4aa6f69SIsaku Yamahata 
39e4aa6f69SIsaku Yamahata #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8)	\
40e4aa6f69SIsaku Yamahata 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41e4aa6f69SIsaku Yamahata 
/* Module-wide TDX enable knob; made read-only after init (__ro_after_init). */
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);

/* GFN of the shared bit for 5-level (GPA bit 51) and 4-level (bit 47) paging. */
#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))

/* CPU hotplug state returned when registering the TDX online callback. */
static enum cpuhp_state tdx_cpuhp_state;

/* Cached pointer to global TDX module metadata; set during TDX init. */
static const struct tdx_sys_info *tdx_sysinfo;
5145154fb0SKai Huang 
/*
 * Report an unexpected TDH.VP.RD failure: mark the VM as bugged and log the
 * field class, field ID and SEAMCALL error code.
 */
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
{
	KVM_BUG_ON(1, tdx->vcpu.kvm);
	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
}
57fe1e6d48SIsaku Yamahata 
/*
 * Report an unexpected TDH.VP.WR failure: mark the VM as bugged and log the
 * field class, field ID, operation string, attempted value and error code.
 */
void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
		      u64 val, u64 err)
{
	KVM_BUG_ON(1, tdx->vcpu.kvm);
	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
}
64fe1e6d48SIsaku Yamahata 
6561bb2827SIsaku Yamahata #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
6661bb2827SIsaku Yamahata 
/* Convert a struct kvm pointer to its containing struct kvm_tdx. */
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_tdx, kvm);
}
7109b3d3c1SIsaku Yamahata 
/* Convert a struct kvm_vcpu pointer to its containing struct vcpu_tdx. */
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_tdx, vcpu);
}
7609b3d3c1SIsaku Yamahata 
tdx_get_supported_attrs(const struct tdx_sys_info_td_conf * td_conf)7761bb2827SIsaku Yamahata static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
7861bb2827SIsaku Yamahata {
7961bb2827SIsaku Yamahata 	u64 val = KVM_SUPPORTED_TD_ATTRS;
8061bb2827SIsaku Yamahata 
8161bb2827SIsaku Yamahata 	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
8261bb2827SIsaku Yamahata 		return 0;
8361bb2827SIsaku Yamahata 
8461bb2827SIsaku Yamahata 	val &= td_conf->attributes_fixed0;
8561bb2827SIsaku Yamahata 
8661bb2827SIsaku Yamahata 	return val;
8761bb2827SIsaku Yamahata }
8861bb2827SIsaku Yamahata 
tdx_get_supported_xfam(const struct tdx_sys_info_td_conf * td_conf)8961bb2827SIsaku Yamahata static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
9061bb2827SIsaku Yamahata {
9161bb2827SIsaku Yamahata 	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
9261bb2827SIsaku Yamahata 
9361bb2827SIsaku Yamahata 	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
9461bb2827SIsaku Yamahata 		return 0;
9561bb2827SIsaku Yamahata 
9661bb2827SIsaku Yamahata 	val &= td_conf->xfam_fixed0;
9761bb2827SIsaku Yamahata 
9861bb2827SIsaku Yamahata 	return val;
9961bb2827SIsaku Yamahata }
10061bb2827SIsaku Yamahata 
/* Extract guest physical address bits from CPUID 0x80000008 EAX[23:16]. */
static int tdx_get_guest_phys_addr_bits(const u32 eax)
{
	return (eax >> 16) & 0xff;
}
1050186dd29SIsaku Yamahata 
/* Replace CPUID 0x80000008 EAX[23:16] with the given guest phys addr bits. */
static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
{
	u32 cleared = eax & ~GENMASK(23, 16);

	return cleared | ((addr_bits & 0xff) << 16);
}
11061bb2827SIsaku Yamahata 
1116d415778SAdrian Hunter #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
1126d415778SAdrian Hunter 
has_tsx(const struct kvm_cpuid_entry2 * entry)1136d415778SAdrian Hunter static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
1146d415778SAdrian Hunter {
1156d415778SAdrian Hunter 	return entry->function == 7 && entry->index == 0 &&
1166d415778SAdrian Hunter 	       (entry->ebx & TDX_FEATURE_TSX);
1176d415778SAdrian Hunter }
1186d415778SAdrian Hunter 
/* Strip the TSX (HLE/RTM) feature bits from a CPUID leaf 7 entry. */
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
{
	entry->ebx &= ~TDX_FEATURE_TSX;
}
1236d415778SAdrian Hunter 
has_waitpkg(const struct kvm_cpuid_entry2 * entry)1246d415778SAdrian Hunter static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
1256d415778SAdrian Hunter {
1266d415778SAdrian Hunter 	return entry->function == 7 && entry->index == 0 &&
1276d415778SAdrian Hunter 	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
1286d415778SAdrian Hunter }
1296d415778SAdrian Hunter 
/* Strip the WAITPKG feature bit from a CPUID leaf 7 entry. */
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
{
	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
}
1346d415778SAdrian Hunter 
/* Clear CPUID feature bits (TSX, WAITPKG) that KVM doesn't support for TDs. */
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
{
	if (has_tsx(entry))
		clear_tsx(entry);

	if (has_waitpkg(entry))
		clear_waitpkg(entry);
}
1436d415778SAdrian Hunter 
/* True if the entry advertises a feature (TSX or WAITPKG) unsupported for TDs. */
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
{
	return has_tsx(entry) || has_waitpkg(entry);
}
1486d415778SAdrian Hunter 
14961bb2827SIsaku Yamahata #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
15061bb2827SIsaku Yamahata 
td_init_cpuid_entry2(struct kvm_cpuid_entry2 * entry,unsigned char idx)15161bb2827SIsaku Yamahata static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
15261bb2827SIsaku Yamahata {
15361bb2827SIsaku Yamahata 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
15461bb2827SIsaku Yamahata 
15561bb2827SIsaku Yamahata 	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
15661bb2827SIsaku Yamahata 	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
15761bb2827SIsaku Yamahata 	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
15861bb2827SIsaku Yamahata 	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
15961bb2827SIsaku Yamahata 	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
16061bb2827SIsaku Yamahata 	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
16161bb2827SIsaku Yamahata 
16261bb2827SIsaku Yamahata 	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
16361bb2827SIsaku Yamahata 		entry->index = 0;
16461bb2827SIsaku Yamahata 
16561bb2827SIsaku Yamahata 	/*
16661bb2827SIsaku Yamahata 	 * The TDX module doesn't allow configuring the guest phys addr bits
16761bb2827SIsaku Yamahata 	 * (EAX[23:16]).  However, KVM uses it as an interface to the userspace
16861bb2827SIsaku Yamahata 	 * to configure the GPAW.  Report these bits as configurable.
16961bb2827SIsaku Yamahata 	 */
17061bb2827SIsaku Yamahata 	if (entry->function == 0x80000008)
17161bb2827SIsaku Yamahata 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
1726d415778SAdrian Hunter 
1736d415778SAdrian Hunter 	tdx_clear_unsupported_cpuid(entry);
17461bb2827SIsaku Yamahata }
17561bb2827SIsaku Yamahata 
init_kvm_tdx_caps(const struct tdx_sys_info_td_conf * td_conf,struct kvm_tdx_capabilities * caps)17661bb2827SIsaku Yamahata static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
17761bb2827SIsaku Yamahata 			     struct kvm_tdx_capabilities *caps)
17861bb2827SIsaku Yamahata {
17961bb2827SIsaku Yamahata 	int i;
18061bb2827SIsaku Yamahata 
18161bb2827SIsaku Yamahata 	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
18261bb2827SIsaku Yamahata 	if (!caps->supported_attrs)
18361bb2827SIsaku Yamahata 		return -EIO;
18461bb2827SIsaku Yamahata 
18561bb2827SIsaku Yamahata 	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
18661bb2827SIsaku Yamahata 	if (!caps->supported_xfam)
18761bb2827SIsaku Yamahata 		return -EIO;
18861bb2827SIsaku Yamahata 
18961bb2827SIsaku Yamahata 	caps->cpuid.nent = td_conf->num_cpuid_config;
19061bb2827SIsaku Yamahata 
19161bb2827SIsaku Yamahata 	for (i = 0; i < td_conf->num_cpuid_config; i++)
19261bb2827SIsaku Yamahata 		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
19361bb2827SIsaku Yamahata 
19461bb2827SIsaku Yamahata 	return 0;
19561bb2827SIsaku Yamahata }
19661bb2827SIsaku Yamahata 
1978d032b68SIsaku Yamahata /*
1988d032b68SIsaku Yamahata  * Some SEAMCALLs acquire the TDX module globally, and can fail with
1998d032b68SIsaku Yamahata  * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
2008d032b68SIsaku Yamahata  */
2018d032b68SIsaku Yamahata static DEFINE_MUTEX(tdx_lock);
2028d032b68SIsaku Yamahata 
/* Count of HKIDs currently held by TDs; decremented in tdx_hkid_free(). */
static atomic_t nr_configured_hkid;
/* True if the SEAMCALL status class is TDX_OPERAND_BUSY (lock contention). */
static bool tdx_operand_busy(u64 err)
{
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}
20902ab5770SIsaku Yamahata 
21002ab5770SIsaku Yamahata 
/*
 * A per-CPU list of TD vCPUs associated with a given CPU.
 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
 * list.
 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
 *   the old CPU during the IPI callback running on the old CPU, and then added
 *   to the per-CPU list of the new CPU.
 * - When a TD is tearing down, all vCPUs are disassociated from their current
 *   running CPUs and removed from the per-CPU list during the IPI callback
 *   running on those CPUs.
 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
 *   associated TD vCPUs and remove them from the per-CPU list.
 */
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
225d789fa6eSIsaku Yamahata 
/* TDVMCALL exit type, passed by the guest in R10 at TD entry/exit. */
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r10;
}
230c42856afSIsaku Yamahata 
/* TDVMCALL leaf number, passed by the guest in R11. */
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r11;
}
235c42856afSIsaku Yamahata 
/* Set the TDVMCALL return code reported back to the guest in R10. */
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
						     long val)
{
	to_tdx(vcpu)->vp_enter_args.r10 = val;
}
241c42856afSIsaku Yamahata 
/* Set the TDVMCALL return value reported back to the guest in R11. */
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
						    unsigned long val)
{
	to_tdx(vcpu)->vp_enter_args.r11 = val;
}
247c42856afSIsaku Yamahata 
/*
 * Release the TD's host key ID back to the allocator, drop the global HKID
 * count, and uncharge/release the TDX misc cgroup reference taken when the
 * HKID was allocated.  hkid is set to -1 so is_hkid_assigned() returns false.
 */
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
	tdx_guest_keyid_free(kvm_tdx->hkid);
	kvm_tdx->hkid = -1;
	atomic_dec(&nr_configured_hkid);
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}
2578d032b68SIsaku Yamahata 
/* True while the TD still owns a host key ID (freed hkid is set to -1). */
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
{
	return kvm_tdx->hkid > 0;
}
2628d032b68SIsaku Yamahata 
/*
 * Remove the vCPU from the per-CPU associated-vCPU list and mark it as not
 * loaded on any CPU.  Must run with IRQs disabled on the CPU owning the list.
 */
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
	lockdep_assert_irqs_disabled();

	list_del(&to_tdx(vcpu)->cpu_list);

	/*
	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
	 * to its list before it's deleted from this CPU's list.
	 */
	smp_wmb();

	vcpu->cpu = -1;
}
278d789fa6eSIsaku Yamahata 
/*
 * Zero a page with MOVDIR64B, 64 bytes at a time, sourcing from the shared
 * zero page.  Used for pages coming back from TDX-private use.
 */
static void tdx_clear_page(struct page *page)
{
	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
	void *dest = page_to_virt(page);
	unsigned long i;

	/*
	 * The page could have been poisoned.  MOVDIR64B also clears
	 * the poison bit so the kernel can safely use the page again.
	 */
	for (i = 0; i < PAGE_SIZE; i += 64)
		movdir64b(dest + i, zero_page);
	/*
	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
	 * from seeing potentially poisoned cache.
	 */
	__mb();
}
2978d032b68SIsaku Yamahata 
/*
 * Block further TD entries: set wait_for_sept_zap and kick all vCPUs out of
 * guest mode.  Caller must hold mmu_lock for write; paired with
 * tdx_no_vcpus_enter_stop().
 */
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
3084b2abc49SYan Zhao 
/*
 * Re-allow TD entries by clearing wait_for_sept_zap.  Caller must hold
 * mmu_lock for write; pairs with tdx_no_vcpus_enter_start().
 */
static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
}
3174b2abc49SYan Zhao 
3188d032b68SIsaku Yamahata /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
/*
 * Reclaim a TDX-private page via TDH.PHYMEM.PAGE.RECLAIM.  Returns 0 on
 * success, -EIO (with a rate-limited error log) on SEAMCALL failure.  Does
 * NOT clear the page; see tdx_reclaim_page() for that.
 */
static int __tdx_reclaim_page(struct page *page)
{
	u64 err, rcx, rdx, r8;

	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);

	/*
	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
	 * before the HKID is released and control pages have also been
	 * released at this point, so there is no possibility of contention.
	 */
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
		return -EIO;
	}
	return 0;
}
3368d032b68SIsaku Yamahata 
/*
 * Reclaim a TDX-private page and, on success, scrub it with MOVDIR64B so the
 * kernel can reuse it.  Returns 0 on success or the reclaim error code.
 */
static int tdx_reclaim_page(struct page *page)
{
	int ret = __tdx_reclaim_page(page);

	if (ret)
		return ret;

	tdx_clear_page(page);
	return 0;
}
3468d032b68SIsaku Yamahata 
3478d032b68SIsaku Yamahata 
3488d032b68SIsaku Yamahata /*
3498d032b68SIsaku Yamahata  * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
3508d032b68SIsaku Yamahata  * private KeyID.  Assume the cache associated with the TDX private KeyID has
3518d032b68SIsaku Yamahata  * been flushed.
3528d032b68SIsaku Yamahata  */
/*
 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
 * private KeyID.  Assume the cache associated with the TDX private KeyID has
 * been flushed.
 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
{
	/*
	 * Free the page only if reclaim succeeded; otherwise leak it, as the
	 * kernel cannot use it safely anymore.
	 */
	if (!tdx_reclaim_page(ctrl_page))
		__free_page(ctrl_page);
}
3648d032b68SIsaku Yamahata 
/* Argument bundle for tdx_flush_vp(), which runs in IPI context. */
struct tdx_flush_vp_arg {
	struct kvm_vcpu *vcpu;
	/* SEAMCALL error reported back to the caller; 0 on success/skip. */
	u64 err;
};
369d789fa6eSIsaku Yamahata 
tdx_flush_vp(void * _arg)370d789fa6eSIsaku Yamahata static void tdx_flush_vp(void *_arg)
371d789fa6eSIsaku Yamahata {
372d789fa6eSIsaku Yamahata 	struct tdx_flush_vp_arg *arg = _arg;
373d789fa6eSIsaku Yamahata 	struct kvm_vcpu *vcpu = arg->vcpu;
374d789fa6eSIsaku Yamahata 	u64 err;
375d789fa6eSIsaku Yamahata 
376d789fa6eSIsaku Yamahata 	arg->err = 0;
377d789fa6eSIsaku Yamahata 	lockdep_assert_irqs_disabled();
378d789fa6eSIsaku Yamahata 
379d789fa6eSIsaku Yamahata 	/* Task migration can race with CPU offlining. */
380d789fa6eSIsaku Yamahata 	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
381d789fa6eSIsaku Yamahata 		return;
382d789fa6eSIsaku Yamahata 
383d789fa6eSIsaku Yamahata 	/*
384d789fa6eSIsaku Yamahata 	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
385d789fa6eSIsaku Yamahata 	 * list tracking still needs to be updated so that it's correct if/when
386d789fa6eSIsaku Yamahata 	 * the vCPU does get initialized.
387d789fa6eSIsaku Yamahata 	 */
388d789fa6eSIsaku Yamahata 	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
389d789fa6eSIsaku Yamahata 		/*
390d789fa6eSIsaku Yamahata 		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are:
391d789fa6eSIsaku Yamahata 		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
392d789fa6eSIsaku Yamahata 		 * vp flush function is called when destructing vCPU/TD or vCPU
393d789fa6eSIsaku Yamahata 		 * migration.  No other thread uses TDVPR in those cases.
394d789fa6eSIsaku Yamahata 		 */
395d789fa6eSIsaku Yamahata 		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
396d789fa6eSIsaku Yamahata 		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
397d789fa6eSIsaku Yamahata 			/*
398d789fa6eSIsaku Yamahata 			 * This function is called in IPI context. Do not use
399d789fa6eSIsaku Yamahata 			 * printk to avoid console semaphore.
400d789fa6eSIsaku Yamahata 			 * The caller prints out the error message, instead.
401d789fa6eSIsaku Yamahata 			 */
402d789fa6eSIsaku Yamahata 			if (err)
403d789fa6eSIsaku Yamahata 				arg->err = err;
404d789fa6eSIsaku Yamahata 		}
405d789fa6eSIsaku Yamahata 	}
406d789fa6eSIsaku Yamahata 
407d789fa6eSIsaku Yamahata 	tdx_disassociate_vp(vcpu);
408d789fa6eSIsaku Yamahata }
409d789fa6eSIsaku Yamahata 
/*
 * Synchronously run tdx_flush_vp() on the CPU the vCPU is currently loaded
 * on (no-op if the vCPU isn't loaded anywhere), and bug the VM if the flush
 * reported an unexpected error.
 */
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	struct tdx_flush_vp_arg arg = {
		.vcpu = vcpu,
	};
	int cpu = vcpu->cpu;

	/* vcpu->cpu == -1 means the vCPU is not loaded on any CPU. */
	if (unlikely(cpu == -1))
		return;

	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
	if (KVM_BUG_ON(arg.err, vcpu->kvm))
		pr_tdx_error(TDH_VP_FLUSH, arg.err);
}
424d789fa6eSIsaku Yamahata 
/*
 * CPU-offline hook: with IRQs disabled, flush and disassociate every TD vCPU
 * still associated with this CPU.  The safe iterator is required because
 * tdx_flush_vp() -> tdx_disassociate_vp() deletes entries while we walk.
 */
void tdx_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
	struct tdx_flush_vp_arg arg;
	struct vcpu_tdx *tdx, *tmp;
	unsigned long flags;

	local_irq_save(flags);
	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
		arg.vcpu = &tdx->vcpu;
		tdx_flush_vp(&arg);
	}
	local_irq_restore(flags);
}
441d789fa6eSIsaku Yamahata 
4428d032b68SIsaku Yamahata #define TDX_SEAMCALL_RETRIES 10000
4438d032b68SIsaku Yamahata 
/*
 * Per-CPU worker that runs TDH.PHYMEM.CACHE.WB to write back caches tagged
 * with TDX private KeyIDs, retrying (with resume=true) when the module
 * returns TDX_INTERRUPTED_RESUMABLE.  Bounded by TDX_SEAMCALL_RETRIES to
 * guarantee forward progress; any residual error is WARNed and logged.
 */
static void smp_func_do_phymem_cache_wb(void *unused)
{
	u64 err = 0;
	bool resume;
	int i;

	/*
	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
	 * KeyID on the package or core.  The TDX module may not finish the
	 * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
	 * kernel should retry it until it returns success w/o rescheduling.
	 */
	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
		/* Resume a partially-completed flush on retry iterations. */
		resume = !!err;
		err = tdh_phymem_cache_wb(resume);
		switch (err) {
		case TDX_INTERRUPTED_RESUMABLE:
			continue;
		case TDX_NO_HKID_READY_TO_WBCACHE:
			err = TDX_SUCCESS; /* Already done by other thread */
			fallthrough;
		default:
			goto out;
		}
	}

out:
	if (WARN_ON_ONCE(err))
		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
}
4748d032b68SIsaku Yamahata 
/*
 * Tear down the TD's use of its private host key ID: flush all vCPUs off
 * their CPUs, confirm via TDH.MNG.VPFLUSHDONE, write back private-KeyID
 * caches on (one CPU per) each package, then free the HKID with
 * TDH.MNG.KEY.FREEID.  On unrecoverable SEAMCALL failure the HKID is
 * deliberately leaked (with a WARN) rather than reused unsafely.
 */
void tdx_mmu_release_hkid(struct kvm *kvm)
{
	bool packages_allocated, targets_allocated;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages, targets;
	struct kvm_vcpu *vcpu;
	unsigned long j;
	int i;
	u64 err;

	if (!is_hkid_assigned(kvm_tdx))
		return;

	/* Allocation failure falls back to flushing on every online CPU. */
	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
	cpus_read_lock();

	kvm_for_each_vcpu(j, vcpu, kvm)
		tdx_flush_vp_on_cpu(vcpu);

	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
	 * Multiple TDX guests can be destroyed simultaneously. Take the
	 * mutex to prevent it from getting error.
	 */
	mutex_lock(&tdx_lock);

	/*
	 * Releasing HKID is in vm_destroy().
	 * After the above flushing vps, there should be no more vCPU
	 * associations, as all vCPU fds have been released at this stage.
	 */
	err = tdh_mng_vpflushdone(&kvm_tdx->td);
	if (err == TDX_FLUSHVP_NOT_DONE)
		goto out;
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		goto out;
	}

	/* Pick one representative CPU per physical package for the WB. */
	for_each_online_cpu(i) {
		if (packages_allocated &&
		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
					     packages))
			continue;
		if (targets_allocated)
			cpumask_set_cpu(i, targets);
	}
	if (targets_allocated)
		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
	else
		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
	/*
	 * In the case of error in smp_func_do_phymem_cache_wb(), the following
	 * tdh_mng_key_freeid() will fail.
	 */
	err = tdh_mng_key_freeid(&kvm_tdx->td);
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_MNG_KEY_FREEID, err);
		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
	} else {
		tdx_hkid_free(kvm_tdx);
	}

out:
	mutex_unlock(&tdx_lock);
	cpus_read_unlock();
	free_cpumask_var(targets);
	free_cpumask_var(packages);
}
5498d032b68SIsaku Yamahata 
/*
 * Reclaim and free the TD's control pages (TDCS array, then TDR).  The TDR
 * additionally needs a targeted WBINVD via TDH.PHYMEM.PAGE.WBINVD before it
 * can be cleared and freed.  Pages are intentionally leaked on any failure.
 */
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;
	int i;

	/*
	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
	 * heavily with TDX module.  Give up freeing TD pages.  As the function
	 * already warned, don't warn it again.
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (kvm_tdx->td.tdcs_pages) {
		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
			if (!kvm_tdx->td.tdcs_pages[i])
				continue;

			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
		}
		kfree(kvm_tdx->td.tdcs_pages);
		kvm_tdx->td.tdcs_pages = NULL;
	}

	if (!kvm_tdx->td.tdr_page)
		return;

	/* On reclaim failure, leak the TDR page rather than reuse it. */
	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
		return;

	/*
	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
	 * KeyID. TDX module may access TDR while operating on TD (Especially
	 * when it is reclaiming TDCS).
	 */
	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
		return;
	}
	tdx_clear_page(kvm_tdx->td.tdr_page);

	__free_page(kvm_tdx->td.tdr_page);
	kvm_tdx->td.tdr_page = NULL;
}
5968d032b68SIsaku Yamahata 
tdx_vm_destroy(struct kvm * kvm)5978d032b68SIsaku Yamahata void tdx_vm_destroy(struct kvm *kvm)
5988d032b68SIsaku Yamahata {
5990186dd29SIsaku Yamahata 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
6000186dd29SIsaku Yamahata 
6018d032b68SIsaku Yamahata 	tdx_reclaim_td_control_pages(kvm);
6020186dd29SIsaku Yamahata 
6030186dd29SIsaku Yamahata 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
6048d032b68SIsaku Yamahata }
6058d032b68SIsaku Yamahata 
tdx_do_tdh_mng_key_config(void * param)6068d032b68SIsaku Yamahata static int tdx_do_tdh_mng_key_config(void *param)
6078d032b68SIsaku Yamahata {
6088d032b68SIsaku Yamahata 	struct kvm_tdx *kvm_tdx = param;
6098d032b68SIsaku Yamahata 	u64 err;
6108d032b68SIsaku Yamahata 
6118d032b68SIsaku Yamahata 	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
6128d032b68SIsaku Yamahata 	err = tdh_mng_key_config(&kvm_tdx->td);
6138d032b68SIsaku Yamahata 
6148d032b68SIsaku Yamahata 	if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
6158d032b68SIsaku Yamahata 		pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
6168d032b68SIsaku Yamahata 		return -EIO;
6178d032b68SIsaku Yamahata 	}
6188d032b68SIsaku Yamahata 
6198d032b68SIsaku Yamahata 	return 0;
6208d032b68SIsaku Yamahata }
6218d032b68SIsaku Yamahata 
/*
 * VM-scope TDX initialization: mark the VM as having protected state and
 * private memory, configure the MMIO SPTE value so guest MMIO triggers #VE,
 * clamp max_vcpus to the number of present logical CPUs, and start the TD
 * in TD_STATE_UNINITIALIZED.
 */
int tdx_vm_init(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	kvm->arch.has_protected_state = true;
	kvm->arch.has_private_mem = true;
	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	/*
	 * Because guest TD is protected, VMM can't parse the instruction in TD.
	 * Instead, guest uses MMIO hypercall.  For unmodified device driver,
	 * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
	 * instruction into MMIO hypercall.
	 *
	 * SPTE value for MMIO needs to be setup so that #VE is injected into
	 * TD instead of triggering EPT MISCONFIG.
	 * - RWX=0 so that EPT violation is triggered.
	 * - suppress #VE bit is cleared to inject #VE.
	 */
	kvm_mmu_set_mmio_spte_value(kvm, 0);

	/*
	 * TDX has its own limit of maximum vCPUs it can support for all
	 * TDX guests in addition to KVM_MAX_VCPUS.  TDX module reports
	 * such limit via the MAX_VCPU_PER_TD global metadata.  In
	 * practice, it reflects the number of logical CPUs that ALL
	 * platforms that the TDX module supports can possibly have.
	 *
	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
	 * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
	 * userspace would result in an unpredictable ABI.
	 */
	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());

	kvm_tdx->state = TD_STATE_UNINITIALIZED;

	return 0;
}
6608d032b68SIsaku Yamahata 
/*
 * Create a TD vCPU.  The containing TD must already be in
 * TD_STATE_INITIALIZED, and the VM must use a split irqchip (in-kernel
 * local APIC only).  Returns 0 on success, -EIO/-EINVAL on precondition
 * failure.
 */
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (kvm_tdx->state != TD_STATE_INITIALIZED)
		return -EIO;

	/*
	 * TDX module mandates APICv, which requires an in-kernel local APIC.
	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
	 */
	if (!irqchip_split(vcpu->kvm))
		return -EINVAL;

	/* Guest FPU state is confidential, i.e. not accessible to KVM. */
	fpstate_set_confidential(&vcpu->arch.guest_fpu);
	vcpu->arch.apic->guest_apic_protected = true;
	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);

	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;

	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
	/* All CR0/CR4 bits are guest-owned; KVM doesn't intercept them. */
	vcpu->arch.cr0_guest_owned_bits = -1ul;
	vcpu->arch.cr4_guest_owned_bits = -1ul;

	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
	vcpu->arch.guest_tsc_protected = true;
	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

	/* Guest state is accessible to KVM only for a debuggable TD. */
	vcpu->arch.guest_state_protected =
		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);

	/* Skip XFD write interception only when the TD owns all of XTILE. */
	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
		vcpu->arch.xfd_no_write_intercept = true;

	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&tdx->vt.pi_desc);

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;

	return 0;
}
7079002f8cfSIsaku Yamahata 
/*
 * Load a TD vCPU onto physical CPU @cpu.  If the vCPU migrated from
 * another pCPU while the TD still holds an HKID, flush its state from the
 * old pCPU and add it to the new pCPU's associated-vCPU list.
 */
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	vmx_vcpu_pi_load(vcpu, cpu);
	/* Nothing to migrate if the pCPU is unchanged or the HKID is gone. */
	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
		return;

	tdx_flush_vp_on_cpu(vcpu);

	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
	/*
	 * NOTE(review): IRQs are disabled around the list add, presumably to
	 * serialize against IRQ-context walkers of the per-CPU list — confirm
	 * against tdx_disassociate_vp()/tdx_flush_vp_on_cpu().
	 */
	local_irq_disable();
	/*
	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
	 * vcpu->cpu is read before tdx->cpu_list.
	 */
	smp_rmb();

	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();
}
729d789fa6eSIsaku Yamahata 
tdx_interrupt_allowed(struct kvm_vcpu * vcpu)7305cf7239bSIsaku Yamahata bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
7315cf7239bSIsaku Yamahata {
7325cf7239bSIsaku Yamahata 	/*
7335cf7239bSIsaku Yamahata 	 * KVM can't get the interrupt status of TDX guest and it assumes
7345cf7239bSIsaku Yamahata 	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
7355cf7239bSIsaku Yamahata 	 * which passes the interrupt blocked flag.
7365cf7239bSIsaku Yamahata 	 */
7375cf7239bSIsaku Yamahata 	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
7385cf7239bSIsaku Yamahata 	       !to_tdx(vcpu)->vp_enter_args.r12;
7395cf7239bSIsaku Yamahata }
7405cf7239bSIsaku Yamahata 
/*
 * Check for a pending interrupt on a vCPU whose APIC is protected.  True if
 * a posted interrupt is pending, or — for an IRQ-enabled HLT exit only — if
 * the TDX module reports a pending interrupt via VCPU_STATE_DETAILS.
 */
bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	u64 vcpu_state_details;

	if (pi_has_pending_interrupt(vcpu))
		return true;

	/*
	 * Only check RVI pending for HALTED case with IRQ enabled.
	 * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
	 * interrupt was pending before TD exit, then it _must_ be blocked,
	 * otherwise the interrupt would have been serviced at the instruction
	 * boundary.
	 */
	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	    to_tdx(vcpu)->vp_enter_args.r12)
		return false;

	/* Read the TDX-module-maintained vCPU state details for RVI pending. */
	vcpu_state_details =
		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);

	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}
76490cfe144SSean Christopherson 
76581bf40d5SIsaku Yamahata /*
76681bf40d5SIsaku Yamahata  * Compared to vmx_prepare_switch_to_guest(), there is not much to do
76781bf40d5SIsaku Yamahata  * as SEAMCALL/SEAMRET calls take care of most of save and restore.
76881bf40d5SIsaku Yamahata  */
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);

	/* Already snapshotted host state for this load; nothing to do. */
	if (vt->guest_state_loaded)
		return;

	/*
	 * Snapshot MSR_KERNEL_GS_BASE for restore on switch back to the host.
	 * For a 64-bit mm the cached current->thread.gsbase is used; otherwise
	 * the MSR is read directly.
	 */
	if (likely(is_64bit_mm(current->mm)))
		vt->msr_host_kernel_gs_base = current->thread.gsbase;
	else
		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);

	/* Snapshot host DEBUGCTL; conditionally restored in tdx_vcpu_run(). */
	vt->host_debugctlmsr = get_debugctlmsr();

	vt->guest_state_loaded = true;
}
78581bf40d5SIsaku Yamahata 
/*
 * A user-return MSR whose post-TD-entry value is synced into KVM's
 * user-return MSR cache (see tdx_user_return_msr_update_cache()).
 */
struct tdx_uret_msr {
	u32 msr;		/* MSR index */
	unsigned int slot;	/* slot in KVM's user-return MSR framework;
				 * initialization not visible in this file chunk */
	u64 defval;		/* value recorded in the cache after TD entry —
				 * presumably what the TDX module leaves in the
				 * MSR; confirm against the TDX module spec */
};

static struct tdx_uret_msr tdx_uret_msrs[] = {
	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
	{.msr = MSR_STAR,},
	{.msr = MSR_LSTAR,},
	{.msr = MSR_TSC_AUX,},
};
798e0b4f31aSIsaku Yamahata 
tdx_user_return_msr_update_cache(void)799e0b4f31aSIsaku Yamahata static void tdx_user_return_msr_update_cache(void)
800e0b4f31aSIsaku Yamahata {
801e0b4f31aSIsaku Yamahata 	int i;
802e0b4f31aSIsaku Yamahata 
803e0b4f31aSIsaku Yamahata 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
804e0b4f31aSIsaku Yamahata 		kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
805e0b4f31aSIsaku Yamahata 						 tdx_uret_msrs[i].defval);
806e0b4f31aSIsaku Yamahata }
807e0b4f31aSIsaku Yamahata 
/*
 * Restore host state on sched-out/put.  Counterpart of
 * tdx_prepare_switch_to_guest(); a no-op unless guest state was loaded.
 */
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (!vt->guest_state_loaded)
		return;

	++vcpu->stat.host_state_reload;
	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);

	/*
	 * The user-return MSR cache only needs refreshing if the vCPU
	 * actually entered the guest since the last refresh.
	 */
	if (tdx->guest_entered) {
		tdx_user_return_msr_update_cache();
		tdx->guest_entered = false;
	}

	vt->guest_state_loaded = false;
}
82681bf40d5SIsaku Yamahata 
/* Sched-out hook: update posted-interrupt state, then restore host state. */
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);
	tdx_prepare_switch_to_host(vcpu);
}
83281bf40d5SIsaku Yamahata 
tdx_vcpu_free(struct kvm_vcpu * vcpu)8339002f8cfSIsaku Yamahata void tdx_vcpu_free(struct kvm_vcpu *vcpu)
8349002f8cfSIsaku Yamahata {
835a50f673fSIsaku Yamahata 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
836a50f673fSIsaku Yamahata 	struct vcpu_tdx *tdx = to_tdx(vcpu);
837a50f673fSIsaku Yamahata 	int i;
838a50f673fSIsaku Yamahata 
839a50f673fSIsaku Yamahata 	/*
840a50f673fSIsaku Yamahata 	 * It is not possible to reclaim pages while hkid is assigned. It might
841a50f673fSIsaku Yamahata 	 * be assigned if:
842a50f673fSIsaku Yamahata 	 * 1. the TD VM is being destroyed but freeing hkid failed, in which
843a50f673fSIsaku Yamahata 	 * case the pages are leaked
844a50f673fSIsaku Yamahata 	 * 2. TD VCPU creation failed and this on the error path, in which case
845a50f673fSIsaku Yamahata 	 * there is nothing to do anyway
846a50f673fSIsaku Yamahata 	 */
847a50f673fSIsaku Yamahata 	if (is_hkid_assigned(kvm_tdx))
848a50f673fSIsaku Yamahata 		return;
849a50f673fSIsaku Yamahata 
850a50f673fSIsaku Yamahata 	if (tdx->vp.tdcx_pages) {
851a50f673fSIsaku Yamahata 		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
852a50f673fSIsaku Yamahata 			if (tdx->vp.tdcx_pages[i])
853a50f673fSIsaku Yamahata 				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
854a50f673fSIsaku Yamahata 		}
855a50f673fSIsaku Yamahata 		kfree(tdx->vp.tdcx_pages);
856a50f673fSIsaku Yamahata 		tdx->vp.tdcx_pages = NULL;
857a50f673fSIsaku Yamahata 	}
858a50f673fSIsaku Yamahata 	if (tdx->vp.tdvpr_page) {
859a50f673fSIsaku Yamahata 		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
860a50f673fSIsaku Yamahata 		tdx->vp.tdvpr_page = 0;
861a50f673fSIsaku Yamahata 	}
862a50f673fSIsaku Yamahata 
863a50f673fSIsaku Yamahata 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
8649002f8cfSIsaku Yamahata }
8659002f8cfSIsaku Yamahata 
tdx_vcpu_pre_run(struct kvm_vcpu * vcpu)86681bf912bSIsaku Yamahata int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
86781bf912bSIsaku Yamahata {
86881bf912bSIsaku Yamahata 	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
86981bf912bSIsaku Yamahata 		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
87081bf912bSIsaku Yamahata 		return -EINVAL;
87181bf912bSIsaku Yamahata 
87281bf912bSIsaku Yamahata 	return 1;
87381bf912bSIsaku Yamahata }
87481bf912bSIsaku Yamahata 
tdcall_to_vmx_exit_reason(struct kvm_vcpu * vcpu)875c42856afSIsaku Yamahata static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
876c42856afSIsaku Yamahata {
877c42856afSIsaku Yamahata 	switch (tdvmcall_leaf(vcpu)) {
8783bf31b57SIsaku Yamahata 	case EXIT_REASON_CPUID:
8795cf7239bSIsaku Yamahata 	case EXIT_REASON_HLT:
88033608aafSIsaku Yamahata 	case EXIT_REASON_IO_INSTRUCTION:
881081385dbSIsaku Yamahata 	case EXIT_REASON_MSR_READ:
882081385dbSIsaku Yamahata 	case EXIT_REASON_MSR_WRITE:
88333608aafSIsaku Yamahata 		return tdvmcall_leaf(vcpu);
884bb723bebSSean Christopherson 	case EXIT_REASON_EPT_VIOLATION:
885bb723bebSSean Christopherson 		return EXIT_REASON_EPT_MISCONFIG;
886c42856afSIsaku Yamahata 	default:
887c42856afSIsaku Yamahata 		break;
888c42856afSIsaku Yamahata 	}
889c42856afSIsaku Yamahata 
890c42856afSIsaku Yamahata 	return EXIT_REASON_TDCALL;
891c42856afSIsaku Yamahata }
892c42856afSIsaku Yamahata 
/*
 * Translate the raw TDH.VP.ENTER return value into a VMX exit reason for
 * the common exit-handling code.  Returns -1u for statuses/exits that have
 * no usable VMX equivalent (handled specially by the callers).
 */
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u32 exit_reason;

	/* Only these statuses carry a meaningful exit reason in the low bits. */
	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
	case TDX_SUCCESS:
	case TDX_NON_RECOVERABLE_VCPU:
	case TDX_NON_RECOVERABLE_TD:
	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
		break;
	default:
		return -1u;
	}

	exit_reason = tdx->vp_enter_ret;

	switch (exit_reason) {
	case EXIT_REASON_TDCALL:
		/* Distinguish KVM hypercalls from TDG.VP.VMCALL leaves. */
		if (tdvmcall_exit_type(vcpu))
			return EXIT_REASON_VMCALL;

		return tdcall_to_vmx_exit_reason(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
		 * non-instrumentable code with interrupts disabled.
		 */
		return -1u;
	default:
		break;
	}

	return exit_reason;
}
929095b71a0SIsaku Yamahata 
/*
 * Enter the TD guest via TDH.VP.ENTER and record exit information.
 * noinstr: executes inside the IRQ-off guest-entry critical section.
 */
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	/* Exit details are returned by the TDX module in vp_enter_args. */
	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	/* NMI handling must happen while still in the IRQ-off section. */
	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}
95081bf912bSIsaku Yamahata 
tdx_failed_vmentry(struct kvm_vcpu * vcpu)951095b71a0SIsaku Yamahata static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
952095b71a0SIsaku Yamahata {
953095b71a0SIsaku Yamahata 	return vmx_get_exit_reason(vcpu).failed_vmentry &&
954095b71a0SIsaku Yamahata 	       vmx_get_exit_reason(vcpu).full != -1u;
955095b71a0SIsaku Yamahata }
956095b71a0SIsaku Yamahata 
/*
 * Fastpath disposition for a TD exit.  Currently the only fastpath-relevant
 * condition is TDX_OPERAND_BUSY, which must leave the fastpath (see below).
 */
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;

	/*
	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
	 *
	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
	 * requester may be blocked endlessly.
	 */
	if (unlikely(tdx_operand_busy(vp_enter_ret)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	return EXIT_FASTPATH_NONE;
}
977095b71a0SIsaku Yamahata 
/*
 * Registers whose cached values are valid after TDH.VP.ENTER;
 * vcpu->arch.regs_avail is masked with this after each entry
 * (see tdx_vcpu_run()).
 */
#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))
99581bf912bSIsaku Yamahata 
/* Restore host XSAVE-related state (PKRU, XCR0, XSS) after running the TD. */
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	/* Only rewrite XCR0 when the TD's xfam could have changed it. */
	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX hosts didn't support XSS both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
10188af09903SAdrian Hunter 
/*
 * DEBUGCTL bits assumed to survive TD entry unmodified; tdx_vcpu_run()
 * restores host DEBUGCTL only when the host value has bits outside this
 * mask set.
 */
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)
10226bfa6d85SIsaku Yamahata 
/*
 * Run the TD vCPU: enter via tdx_vcpu_enter_exit(), then perform post-exit
 * host-state fixups and decide the fastpath disposition.
 */
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * force_immediate_exit requires vCPU entering for events injection with
	 * an immediately exit followed. But The TDX module doesn't guarantee
	 * entry, it's already possible for KVM to _think_ it completely entry
	 * to the guest without actually having done so.
	 * Since KVM never needs to force an immediate exit for TDX, and can't
	 * do direct injection, just warn on force_immediate_exit.
	 */
	WARN_ON_ONCE(force_immediate_exit);

	/*
	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
	 * TDCALLs.
	 */
	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	trace_kvm_entry(vcpu, force_immediate_exit);

	if (pi_test_on(&vt->pi_desc)) {
		/* Self-IPI so outstanding posted interrupts get processed. */
		apic->send_IPI_self(POSTED_INTR_VECTOR);

		/*
		 * If the lapic timer's vector is pending in the PIR, wait for
		 * the programmed expiry (see kvm_wait_lapic_expire()).
		 */
		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
			       APIC_VECTOR_MASK, &vt->pi_desc))
			kvm_wait_lapic_expire(vcpu);
	}

	tdx_vcpu_enter_exit(vcpu);

	/* Restore host DEBUGCTL if it has bits outside the preserved set. */
	if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
		update_debugctlmsr(vt->host_debugctlmsr);

	tdx_load_host_xsave_state(vcpu);
	tdx->guest_entered = true;

	/* Only the registers in TDX_REGS_AVAIL_SET are valid after entry. */
	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
		return EXIT_FASTPATH_NONE;

	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
		return EXIT_FASTPATH_NONE;

	if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
		kvm_machine_check();

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(tdx_failed_vmentry(vcpu)))
		return EXIT_FASTPATH_NONE;

	return tdx_exit_handlers_fastpath(vcpu);
}
108287e3f45eSSean Christopherson 
/*
 * Inject an NMI by setting PEND_NMI in the TDX-module-managed vCPU state;
 * actual delivery to the guest is the TDX module's responsibility.
 */
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.nmi_injections;
	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
	/*
	 * From KVM's perspective, NMI injection is completed right after
	 * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
	 * the TDX module or not.
	 */
	vcpu->arch.nmi_injected = false;
	/*
	 * TDX doesn't support KVM to request NMI window exit.  If there is
	 * still a pending vNMI, KVM is not able to inject it along with the
	 * one pending in TDX module in a back-to-back way.  Since the previous
	 * vNMI is still pending in TDX module, i.e. it has not been delivered
	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
	 * previous one.  The guest is expected to handle all the NMI sources
	 * when handling the first vNMI.
	 */
	vcpu->arch.nmi_pending = 0;
}
1104acc64eb4SIsaku Yamahata 
tdx_handle_exception_nmi(struct kvm_vcpu * vcpu)1105f30cb642SIsaku Yamahata static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1106f30cb642SIsaku Yamahata {
1107f30cb642SIsaku Yamahata 	u32 intr_info = vmx_get_intr_info(vcpu);
1108f30cb642SIsaku Yamahata 
1109f30cb642SIsaku Yamahata 	/*
1110f30cb642SIsaku Yamahata 	 * Machine checks are handled by handle_exception_irqoff(), or by
1111f30cb642SIsaku Yamahata 	 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1112f30cb642SIsaku Yamahata 	 * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
1113f30cb642SIsaku Yamahata 	 */
1114f30cb642SIsaku Yamahata 	if (is_nmi(intr_info) || is_machine_check(intr_info))
1115f30cb642SIsaku Yamahata 		return 1;
1116f30cb642SIsaku Yamahata 
1117f30cb642SIsaku Yamahata 	vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1118f30cb642SIsaku Yamahata 	vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1119f30cb642SIsaku Yamahata 	vcpu->run->ex.error_code = 0;
1120f30cb642SIsaku Yamahata 
1121f30cb642SIsaku Yamahata 	return 0;
1122f30cb642SIsaku Yamahata }
1123f30cb642SIsaku Yamahata 
/* Completion callback: relay userspace's hypercall result to the guest. */
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
{
	tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
	return 1;
}
1129d5998c02SIsaku Yamahata 
tdx_emulate_vmcall(struct kvm_vcpu * vcpu)1130d5998c02SIsaku Yamahata static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1131d5998c02SIsaku Yamahata {
1132d5998c02SIsaku Yamahata 	kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1133d5998c02SIsaku Yamahata 	kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1134d5998c02SIsaku Yamahata 	kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1135d5998c02SIsaku Yamahata 	kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1136d5998c02SIsaku Yamahata 	kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1137d5998c02SIsaku Yamahata 
1138d5998c02SIsaku Yamahata 	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1139d5998c02SIsaku Yamahata }
1140d5998c02SIsaku Yamahata 
11412c304880SBinbin Wu /*
11422c304880SBinbin Wu  * Split into chunks and check interrupt pending between chunks.  This allows
11432c304880SBinbin Wu  * for timely injection of interrupts to prevent issues with guest lockup
11442c304880SBinbin Wu  * detection.
11452c304880SBinbin Wu  */
11462c304880SBinbin Wu #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
11472c304880SBinbin Wu static void __tdx_map_gpa(struct vcpu_tdx *tdx);
11482c304880SBinbin Wu 
tdx_complete_vmcall_map_gpa(struct kvm_vcpu * vcpu)11492c304880SBinbin Wu static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
11502c304880SBinbin Wu {
11512c304880SBinbin Wu 	struct vcpu_tdx *tdx = to_tdx(vcpu);
11522c304880SBinbin Wu 
11532c304880SBinbin Wu 	if (vcpu->run->hypercall.ret) {
11542c304880SBinbin Wu 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
11552c304880SBinbin Wu 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
11562c304880SBinbin Wu 		return 1;
11572c304880SBinbin Wu 	}
11582c304880SBinbin Wu 
11592c304880SBinbin Wu 	tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
11602c304880SBinbin Wu 	if (tdx->map_gpa_next >= tdx->map_gpa_end)
11612c304880SBinbin Wu 		return 1;
11622c304880SBinbin Wu 
11632c304880SBinbin Wu 	/*
11642c304880SBinbin Wu 	 * Stop processing the remaining part if there is a pending interrupt,
11652c304880SBinbin Wu 	 * which could be qualified to deliver.  Skip checking pending RVI for
11665cf7239bSIsaku Yamahata 	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
11672c304880SBinbin Wu 	 */
11682c304880SBinbin Wu 	if (kvm_vcpu_has_events(vcpu)) {
11692c304880SBinbin Wu 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
11702c304880SBinbin Wu 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
11712c304880SBinbin Wu 		return 1;
11722c304880SBinbin Wu 	}
11732c304880SBinbin Wu 
11742c304880SBinbin Wu 	__tdx_map_gpa(tdx);
11752c304880SBinbin Wu 	return 0;
11762c304880SBinbin Wu }
11772c304880SBinbin Wu 
__tdx_map_gpa(struct vcpu_tdx * tdx)11782c304880SBinbin Wu static void __tdx_map_gpa(struct vcpu_tdx *tdx)
11792c304880SBinbin Wu {
11802c304880SBinbin Wu 	u64 gpa = tdx->map_gpa_next;
11812c304880SBinbin Wu 	u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
11822c304880SBinbin Wu 
11832c304880SBinbin Wu 	if (size > TDX_MAP_GPA_MAX_LEN)
11842c304880SBinbin Wu 		size = TDX_MAP_GPA_MAX_LEN;
11852c304880SBinbin Wu 
11862c304880SBinbin Wu 	tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
11872c304880SBinbin Wu 	tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
11882c304880SBinbin Wu 	/*
11892c304880SBinbin Wu 	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
11902c304880SBinbin Wu 	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
11912c304880SBinbin Wu 	 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
11922c304880SBinbin Wu 	 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
11932c304880SBinbin Wu 	 */
11942c304880SBinbin Wu 	tdx->vcpu.run->hypercall.ret = 0;
11952c304880SBinbin Wu 	tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
11962c304880SBinbin Wu 	tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
11972c304880SBinbin Wu 	tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
11982c304880SBinbin Wu 					   KVM_MAP_GPA_RANGE_ENCRYPTED :
11992c304880SBinbin Wu 					   KVM_MAP_GPA_RANGE_DECRYPTED;
12002c304880SBinbin Wu 	tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;
12012c304880SBinbin Wu 
12022c304880SBinbin Wu 	tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
12032c304880SBinbin Wu }
12042c304880SBinbin Wu 
tdx_map_gpa(struct kvm_vcpu * vcpu)12052c304880SBinbin Wu static int tdx_map_gpa(struct kvm_vcpu *vcpu)
12062c304880SBinbin Wu {
12072c304880SBinbin Wu 	struct vcpu_tdx *tdx = to_tdx(vcpu);
12082c304880SBinbin Wu 	u64 gpa = tdx->vp_enter_args.r12;
12092c304880SBinbin Wu 	u64 size = tdx->vp_enter_args.r13;
12102c304880SBinbin Wu 	u64 ret;
12112c304880SBinbin Wu 
12122c304880SBinbin Wu 	/*
12132c304880SBinbin Wu 	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
12142c304880SBinbin Wu 	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1215b5aafcb4SBinbin Wu 	 * bit set.  This is a base call so it should always be supported, but
1216b5aafcb4SBinbin Wu 	 * KVM has no way to ensure that userspace implements the GHCI correctly.
1217b5aafcb4SBinbin Wu 	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1218b5aafcb4SBinbin Wu 	 * to the guest.
12192c304880SBinbin Wu 	 */
12202c304880SBinbin Wu 	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1221b5aafcb4SBinbin Wu 		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
12222c304880SBinbin Wu 		goto error;
12232c304880SBinbin Wu 	}
12242c304880SBinbin Wu 
12252c304880SBinbin Wu 	if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
12262c304880SBinbin Wu 	    !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
12272c304880SBinbin Wu 	    (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
12282c304880SBinbin Wu 	     vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
12292c304880SBinbin Wu 		ret = TDVMCALL_STATUS_INVALID_OPERAND;
12302c304880SBinbin Wu 		goto error;
12312c304880SBinbin Wu 	}
12322c304880SBinbin Wu 
12332c304880SBinbin Wu 	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
12342c304880SBinbin Wu 		ret = TDVMCALL_STATUS_ALIGN_ERROR;
12352c304880SBinbin Wu 		goto error;
12362c304880SBinbin Wu 	}
12372c304880SBinbin Wu 
12382c304880SBinbin Wu 	tdx->map_gpa_end = gpa + size;
12392c304880SBinbin Wu 	tdx->map_gpa_next = gpa;
12402c304880SBinbin Wu 
12412c304880SBinbin Wu 	__tdx_map_gpa(tdx);
12422c304880SBinbin Wu 	return 0;
12432c304880SBinbin Wu 
12442c304880SBinbin Wu error:
12452c304880SBinbin Wu 	tdvmcall_set_return_code(vcpu, ret);
12462c304880SBinbin Wu 	tdx->vp_enter_args.r11 = gpa;
12472c304880SBinbin Wu 	return 1;
12482c304880SBinbin Wu }
12492c304880SBinbin Wu 
tdx_report_fatal_error(struct kvm_vcpu * vcpu)125079462faaSBinbin Wu static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
125179462faaSBinbin Wu {
125279462faaSBinbin Wu 	struct vcpu_tdx *tdx = to_tdx(vcpu);
125379462faaSBinbin Wu 	u64 *regs = vcpu->run->system_event.data;
125479462faaSBinbin Wu 	u64 *module_regs = &tdx->vp_enter_args.r8;
125579462faaSBinbin Wu 	int index = VCPU_REGS_RAX;
125679462faaSBinbin Wu 
125779462faaSBinbin Wu 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
125879462faaSBinbin Wu 	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
125979462faaSBinbin Wu 	vcpu->run->system_event.ndata = 16;
126079462faaSBinbin Wu 
126179462faaSBinbin Wu 	/* Dump 16 general-purpose registers to userspace in ascending order. */
126279462faaSBinbin Wu 	regs[index++] = tdx->vp_enter_ret;
126379462faaSBinbin Wu 	regs[index++] = tdx->vp_enter_args.rcx;
126479462faaSBinbin Wu 	regs[index++] = tdx->vp_enter_args.rdx;
126579462faaSBinbin Wu 	regs[index++] = tdx->vp_enter_args.rbx;
126679462faaSBinbin Wu 	regs[index++] = 0;
126779462faaSBinbin Wu 	regs[index++] = 0;
126879462faaSBinbin Wu 	regs[index++] = tdx->vp_enter_args.rsi;
126979462faaSBinbin Wu 	regs[index] = tdx->vp_enter_args.rdi;
127079462faaSBinbin Wu 	for (index = 0; index < 8; index++)
127179462faaSBinbin Wu 		regs[VCPU_REGS_R8 + index] = module_regs[index];
127279462faaSBinbin Wu 
127379462faaSBinbin Wu 	return 0;
127479462faaSBinbin Wu }
127579462faaSBinbin Wu 
tdx_emulate_cpuid(struct kvm_vcpu * vcpu)12763bf31b57SIsaku Yamahata static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
12773bf31b57SIsaku Yamahata {
12783bf31b57SIsaku Yamahata 	u32 eax, ebx, ecx, edx;
12793bf31b57SIsaku Yamahata 	struct vcpu_tdx *tdx = to_tdx(vcpu);
12803bf31b57SIsaku Yamahata 
12813bf31b57SIsaku Yamahata 	/* EAX and ECX for cpuid is stored in R12 and R13. */
12823bf31b57SIsaku Yamahata 	eax = tdx->vp_enter_args.r12;
12833bf31b57SIsaku Yamahata 	ecx = tdx->vp_enter_args.r13;
12843bf31b57SIsaku Yamahata 
12853bf31b57SIsaku Yamahata 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
12863bf31b57SIsaku Yamahata 
12873bf31b57SIsaku Yamahata 	tdx->vp_enter_args.r12 = eax;
12883bf31b57SIsaku Yamahata 	tdx->vp_enter_args.r13 = ebx;
12893bf31b57SIsaku Yamahata 	tdx->vp_enter_args.r14 = ecx;
12903bf31b57SIsaku Yamahata 	tdx->vp_enter_args.r15 = edx;
12913bf31b57SIsaku Yamahata 
12923bf31b57SIsaku Yamahata 	return 1;
12933bf31b57SIsaku Yamahata }
12943bf31b57SIsaku Yamahata 
/*
 * Completion callback after userspace handled an emulated port OUT; the
 * data has already been written, just clear the in-flight PIO state.
 */
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
{
	vcpu->arch.pio.count = 0;
	return 1;
}
130033608aafSIsaku Yamahata 
tdx_complete_pio_in(struct kvm_vcpu * vcpu)130133608aafSIsaku Yamahata static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
130233608aafSIsaku Yamahata {
130333608aafSIsaku Yamahata 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
130433608aafSIsaku Yamahata 	unsigned long val = 0;
130533608aafSIsaku Yamahata 	int ret;
130633608aafSIsaku Yamahata 
130733608aafSIsaku Yamahata 	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
130833608aafSIsaku Yamahata 					 vcpu->arch.pio.port, &val, 1);
130933608aafSIsaku Yamahata 
131033608aafSIsaku Yamahata 	WARN_ON_ONCE(!ret);
131133608aafSIsaku Yamahata 
131233608aafSIsaku Yamahata 	tdvmcall_set_return_val(vcpu, val);
131333608aafSIsaku Yamahata 
131433608aafSIsaku Yamahata 	return 1;
131533608aafSIsaku Yamahata }
131633608aafSIsaku Yamahata 
tdx_emulate_io(struct kvm_vcpu * vcpu)131733608aafSIsaku Yamahata static int tdx_emulate_io(struct kvm_vcpu *vcpu)
131833608aafSIsaku Yamahata {
131933608aafSIsaku Yamahata 	struct vcpu_tdx *tdx = to_tdx(vcpu);
132033608aafSIsaku Yamahata 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
132133608aafSIsaku Yamahata 	unsigned long val = 0;
132233608aafSIsaku Yamahata 	unsigned int port;
132333608aafSIsaku Yamahata 	u64 size, write;
132433608aafSIsaku Yamahata 	int ret;
132533608aafSIsaku Yamahata 
132633608aafSIsaku Yamahata 	++vcpu->stat.io_exits;
132733608aafSIsaku Yamahata 
132833608aafSIsaku Yamahata 	size = tdx->vp_enter_args.r12;
132933608aafSIsaku Yamahata 	write = tdx->vp_enter_args.r13;
133033608aafSIsaku Yamahata 	port = tdx->vp_enter_args.r14;
133133608aafSIsaku Yamahata 
133233608aafSIsaku Yamahata 	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
133333608aafSIsaku Yamahata 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
133433608aafSIsaku Yamahata 		return 1;
133533608aafSIsaku Yamahata 	}
133633608aafSIsaku Yamahata 
133733608aafSIsaku Yamahata 	if (write) {
133833608aafSIsaku Yamahata 		val = tdx->vp_enter_args.r15;
133933608aafSIsaku Yamahata 		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
134033608aafSIsaku Yamahata 	} else {
134133608aafSIsaku Yamahata 		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
134233608aafSIsaku Yamahata 	}
134333608aafSIsaku Yamahata 
134433608aafSIsaku Yamahata 	if (!ret)
134533608aafSIsaku Yamahata 		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
134633608aafSIsaku Yamahata 							   tdx_complete_pio_in;
134733608aafSIsaku Yamahata 	else if (!write)
134833608aafSIsaku Yamahata 		tdvmcall_set_return_val(vcpu, val);
134933608aafSIsaku Yamahata 
135033608aafSIsaku Yamahata 	return ret;
135133608aafSIsaku Yamahata }
135233608aafSIsaku Yamahata 
tdx_complete_mmio_read(struct kvm_vcpu * vcpu)1353bb723bebSSean Christopherson static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1354bb723bebSSean Christopherson {
1355bb723bebSSean Christopherson 	unsigned long val = 0;
1356bb723bebSSean Christopherson 	gpa_t gpa;
1357bb723bebSSean Christopherson 	int size;
1358bb723bebSSean Christopherson 
1359bb723bebSSean Christopherson 	gpa = vcpu->mmio_fragments[0].gpa;
1360bb723bebSSean Christopherson 	size = vcpu->mmio_fragments[0].len;
1361bb723bebSSean Christopherson 
1362bb723bebSSean Christopherson 	memcpy(&val, vcpu->run->mmio.data, size);
1363bb723bebSSean Christopherson 	tdvmcall_set_return_val(vcpu, val);
1364bb723bebSSean Christopherson 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1365bb723bebSSean Christopherson 	return 1;
1366bb723bebSSean Christopherson }
1367bb723bebSSean Christopherson 
/*
 * Try to complete an MMIO write in-kernel.  Returns 0 if a device on the
 * fast-MMIO or MMIO bus claimed the access, -EOPNOTSUPP if it must be
 * forwarded to userspace.
 */
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
				 unsigned long val)
{
	/* Fast MMIO (e.g. virtio kick) ignores the data, probe it first. */
	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return 0;
	}

	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);

	return kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val) ?
	       -EOPNOTSUPP : 0;
}
1382bb723bebSSean Christopherson 
/*
 * Try to complete an MMIO read in-kernel.  Returns 0 if a device on the
 * MMIO bus supplied the data (returned to the TD in R11), -EOPNOTSUPP if
 * the access must be forwarded to userspace.
 */
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
{
	unsigned long data;

	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &data))
		return -EOPNOTSUPP;

	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &data);
	tdvmcall_set_return_val(vcpu, data);
	return 0;
}
1394bb723bebSSean Christopherson 
/*
 * Handle TDG.VP.VMCALL<#VE.RequestMMIO>: size in R12, direction in R13
 * (0 = read, 1 = write), GPA in R14, and for writes the data in R15.
 * First try in-kernel device emulation; otherwise set up a KVM_EXIT_MMIO
 * exit so the userspace device model can complete the access.
 */
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int size, write, r;
	unsigned long val;
	gpa_t gpa;

	size = tdx->vp_enter_args.r12;
	write = tdx->vp_enter_args.r13;
	gpa = tdx->vp_enter_args.r14;
	val = write ? tdx->vp_enter_args.r15 : 0;

	if (size != 1 && size != 2 && size != 4 && size != 8)
		goto error;
	if (write != 0 && write != 1)
		goto error;

	/*
	 * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
	 * do MMIO emulation for private GPA.
	 */
	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
		goto error;

	/* Strip the shared bit before dispatching on the MMIO buses. */
	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));

	if (write)
		r = tdx_mmio_write(vcpu, gpa, size, val);
	else
		r = tdx_mmio_read(vcpu, gpa, size);
	if (!r)
		/* Kernel completed device emulation. */
		return 1;

	/* Request the device emulation to userspace device model. */
	vcpu->mmio_is_write = write;
	if (!write)
		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = size;
	vcpu->run->mmio.is_write = write;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	if (write) {
		memcpy(vcpu->run->mmio.data, &val, size);
	} else {
		/* Record where the data must land when userspace resumes us. */
		vcpu->mmio_fragments[0].gpa = gpa;
		vcpu->mmio_fragments[0].len = size;
		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
	}
	return 0;

error:
	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
	return 1;
}
1453bb723bebSSean Christopherson 
tdx_complete_get_td_vm_call_info(struct kvm_vcpu * vcpu)1454*25e8b1ddSBinbin Wu static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1455*25e8b1ddSBinbin Wu {
1456*25e8b1ddSBinbin Wu 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1457*25e8b1ddSBinbin Wu 
1458*25e8b1ddSBinbin Wu 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1459*25e8b1ddSBinbin Wu 
1460*25e8b1ddSBinbin Wu 	/*
1461*25e8b1ddSBinbin Wu 	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
1462*25e8b1ddSBinbin Wu 	 * directly without the support from userspace, just set the value
1463*25e8b1ddSBinbin Wu 	 * returned from userspace.
1464*25e8b1ddSBinbin Wu 	 */
1465*25e8b1ddSBinbin Wu 	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1466*25e8b1ddSBinbin Wu 	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1467*25e8b1ddSBinbin Wu 	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1468*25e8b1ddSBinbin Wu 	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1469*25e8b1ddSBinbin Wu 
1470*25e8b1ddSBinbin Wu 	return 1;
1471*25e8b1ddSBinbin Wu }
1472*25e8b1ddSBinbin Wu 
tdx_get_td_vm_call_info(struct kvm_vcpu * vcpu)147304733836SIsaku Yamahata static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
147404733836SIsaku Yamahata {
147504733836SIsaku Yamahata 	struct vcpu_tdx *tdx = to_tdx(vcpu);
147604733836SIsaku Yamahata 
1477*25e8b1ddSBinbin Wu 	switch (tdx->vp_enter_args.r12) {
1478*25e8b1ddSBinbin Wu 	case 0:
147904733836SIsaku Yamahata 		tdx->vp_enter_args.r11 = 0;
1480*25e8b1ddSBinbin Wu 		tdx->vp_enter_args.r12 = 0;
148104733836SIsaku Yamahata 		tdx->vp_enter_args.r13 = 0;
148204733836SIsaku Yamahata 		tdx->vp_enter_args.r14 = 0;
1483*25e8b1ddSBinbin Wu 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
148404733836SIsaku Yamahata 		return 1;
1485*25e8b1ddSBinbin Wu 	case 1:
1486*25e8b1ddSBinbin Wu 		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1487*25e8b1ddSBinbin Wu 		vcpu->run->exit_reason = KVM_EXIT_TDX;
1488*25e8b1ddSBinbin Wu 		vcpu->run->tdx.flags = 0;
1489*25e8b1ddSBinbin Wu 		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1490*25e8b1ddSBinbin Wu 		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1491*25e8b1ddSBinbin Wu 		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1492*25e8b1ddSBinbin Wu 		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1493*25e8b1ddSBinbin Wu 		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1494*25e8b1ddSBinbin Wu 		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1495*25e8b1ddSBinbin Wu 		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1496*25e8b1ddSBinbin Wu 		return 0;
1497*25e8b1ddSBinbin Wu 	default:
1498*25e8b1ddSBinbin Wu 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1499*25e8b1ddSBinbin Wu 		return 1;
1500*25e8b1ddSBinbin Wu 	}
150104733836SIsaku Yamahata }
150204733836SIsaku Yamahata 
/*
 * Generic completion callback for TDVMCALLs forwarded to userspace via
 * KVM_EXIT_TDX: propagate userspace's status code back to the TD in R10.
 */
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
{
	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
	return 1;
}
1508cf207eacSBinbin Wu 
tdx_get_quote(struct kvm_vcpu * vcpu)1509cf207eacSBinbin Wu static int tdx_get_quote(struct kvm_vcpu *vcpu)
1510cf207eacSBinbin Wu {
1511cf207eacSBinbin Wu 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1512cf207eacSBinbin Wu 	u64 gpa = tdx->vp_enter_args.r12;
1513cf207eacSBinbin Wu 	u64 size = tdx->vp_enter_args.r13;
1514cf207eacSBinbin Wu 
1515cf207eacSBinbin Wu 	/* The gpa of buffer must have shared bit set. */
1516cf207eacSBinbin Wu 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1517cf207eacSBinbin Wu 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1518cf207eacSBinbin Wu 		return 1;
1519cf207eacSBinbin Wu 	}
1520cf207eacSBinbin Wu 
1521cf207eacSBinbin Wu 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1522cf207eacSBinbin Wu 	vcpu->run->tdx.flags = 0;
1523cf207eacSBinbin Wu 	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1524cf207eacSBinbin Wu 	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1525cf207eacSBinbin Wu 	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1526cf207eacSBinbin Wu 	vcpu->run->tdx.get_quote.size = size;
1527cf207eacSBinbin Wu 
1528cf207eacSBinbin Wu 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1529cf207eacSBinbin Wu 
1530cf207eacSBinbin Wu 	return 0;
1531cf207eacSBinbin Wu }
1532cf207eacSBinbin Wu 
handle_tdvmcall(struct kvm_vcpu * vcpu)1533c42856afSIsaku Yamahata static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1534c42856afSIsaku Yamahata {
1535c42856afSIsaku Yamahata 	switch (tdvmcall_leaf(vcpu)) {
15362c304880SBinbin Wu 	case TDVMCALL_MAP_GPA:
15372c304880SBinbin Wu 		return tdx_map_gpa(vcpu);
153879462faaSBinbin Wu 	case TDVMCALL_REPORT_FATAL_ERROR:
153979462faaSBinbin Wu 		return tdx_report_fatal_error(vcpu);
154004733836SIsaku Yamahata 	case TDVMCALL_GET_TD_VM_CALL_INFO:
154104733836SIsaku Yamahata 		return tdx_get_td_vm_call_info(vcpu);
1542cf207eacSBinbin Wu 	case TDVMCALL_GET_QUOTE:
1543cf207eacSBinbin Wu 		return tdx_get_quote(vcpu);
1544c42856afSIsaku Yamahata 	default:
1545c42856afSIsaku Yamahata 		break;
1546c42856afSIsaku Yamahata 	}
1547c42856afSIsaku Yamahata 
1548b5aafcb4SBinbin Wu 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1549c42856afSIsaku Yamahata 	return 1;
1550c42856afSIsaku Yamahata }
1551c42856afSIsaku Yamahata 
/*
 * Load the shared-EPT root for a TD vCPU.  The private EPT is owned by the
 * TDX module; KVM only installs the shared half via the TD VMCS.
 */
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
	u64 shared_bit;

	/* The shared bit's position depends on the paging-walk level. */
	shared_bit = pgd_level == 5 ? TDX_SHARED_BIT_PWL_5 : TDX_SHARED_BIT_PWL_4;

	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
		return;

	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
156287e3f45eSSean Christopherson 
/* Drop the page reference taken in tdx_sept_set_private_spte(). */
static void tdx_unpin(struct kvm *kvm, struct page *page)
{
	put_page(page);
}
156702ab5770SIsaku Yamahata 
/*
 * AUG @page into the TD at @gfn via TDH.MEM.PAGE.AUG.  Returns 0 on
 * success, -EBUSY on SEAMCALL contention (caller may retry), -EIO on
 * unexpected failure (the VM is bugged).  The page pin is dropped on error.
 */
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, struct page *page)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	int tdx_level = pg_level_to_tdx_sept_level(level);
	u64 entry, level_state, err;

	err = tdh_mem_page_aug(&kvm_tdx->td, gfn_to_gpa(gfn), tdx_level, page,
			       &entry, &level_state);
	if (unlikely(tdx_operand_busy(err))) {
		tdx_unpin(kvm, page);
		return -EBUSY;
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
		tdx_unpin(kvm, page);
		return -EIO;
	}

	return 0;
}
159102ab5770SIsaku Yamahata 
1592012426d6SIsaku Yamahata /*
1593012426d6SIsaku Yamahata  * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
 * callback tdx_gmem_post_populate() then maps pages into private memory
 * through the SEAMCALL TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
1596012426d6SIsaku Yamahata  * private EPT structures for the page to have been built before, which is
1597012426d6SIsaku Yamahata  * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1598012426d6SIsaku Yamahata  * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1599012426d6SIsaku Yamahata  * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1600012426d6SIsaku Yamahata  * are no half-initialized shared EPT pages.
1601012426d6SIsaku Yamahata  */
/*
 * Account a page that was mapped into the EPT structures before the TD is
 * finalized, i.e. before TDH.MEM.PAGE.ADD has been issued for it.  Only
 * KVM_TDX_INIT_MEM_REGION may create such mappings; pre-faulting is
 * disallowed until the TD is finalized, hence the KVM_BUG_ON().
 */
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
					  enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
		return -EINVAL;

	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
	atomic64_inc(&kvm_tdx->nr_premapped);
	return 0;
}
1614012426d6SIsaku Yamahata 
/*
 * Map a private guest page at @gfn: AUG it into a runnable TD, or, before
 * the TD is finalized, only account it as premapped (the page will be
 * ADDed later via KVM_TDX_INIT_MEM_REGION).
 */
int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct page *page = pfn_to_page(pfn);

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	/*
	 * Because guest_memfd doesn't support page migration with
	 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
	 * migration.  Until guest_memfd supports page migration, prevent page
	 * migration.
	 * TODO: Once guest_memfd introduces callback on page migration,
	 * implement it and remove get_page/put_page().
	 */
	get_page(page);

	/*
	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
	 * barrier in tdx_td_finalize().
	 */
	smp_rmb();
	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
		return tdx_mem_page_aug(kvm, gfn, level, page);

	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
}
164502ab5770SIsaku Yamahata 
/*
 * Remove a private 4K page from a live TD (HKID still assigned): tear down
 * the mapping with TDH.MEM.PAGE.REMOVE, then scrub the page with
 * TDH.PHYMEM.PAGE.WBINVD under the TD's HKID before releasing the pin.
 * Returns 0 on success, -EIO on unexpected SEAMCALL failure (VM is bugged).
 */
static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
				      enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 err, entry, level_state;

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
		return -EINVAL;

	/*
	 * When zapping private page, write lock is held. So no race condition
	 * with other vcpu sept operation.
	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
	 */
	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
				  &level_state);

	if (unlikely(tdx_operand_busy(err))) {
		/*
		 * The second retry is expected to succeed after kicking off all
		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
		 */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
					  &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
		return -EIO;
	}

	/* Flush cachelines tagged with the TD's HKID before reuse by the host. */
	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
		return -EIO;
	}
	tdx_clear_page(page);
	tdx_unpin(kvm, page);
	return 0;
}
169502ab5770SIsaku Yamahata 
/*
 * Install a new non-leaf S-EPT page for @gfn at @level via TDH.MEM.SEPT.ADD.
 *
 * Returns 0 on success, -EBUSY if the TDX module reports operand contention
 * (caller retries), or -EIO (with the VM bugged) on any other SEAMCALL error.
 */
int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, void *private_spt)
{
	struct page *sept_page = virt_to_page(private_spt);
	u64 err, entry, level_state;

	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gfn_to_gpa(gfn),
			       pg_level_to_tdx_sept_level(level), sept_page,
			       &entry, &level_state);
	if (unlikely(tdx_operand_busy(err)))
		return -EBUSY;

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
		return -EIO;
	}

	return 0;
}
171602ab5770SIsaku Yamahata 
1717eac0b72fSYan Zhao /*
1718eac0b72fSYan Zhao  * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
1719eac0b72fSYan Zhao  * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
1720eac0b72fSYan Zhao  * successfully.
1721eac0b72fSYan Zhao  *
1722eac0b72fSYan Zhao  * Since tdh_mem_sept_add() must have been invoked successfully before a
1723eac0b72fSYan Zhao  * non-leaf entry present in the mirrored page table, the SEPT ZAP related
1724eac0b72fSYan Zhao  * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
1725eac0b72fSYan Zhao  * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
1726eac0b72fSYan Zhao  * SEPT.
1727eac0b72fSYan Zhao  *
1728eac0b72fSYan Zhao  * Further check if the returned entry from SEPT walking is with RWX permissions
1729eac0b72fSYan Zhao  * to filter out anything unexpected.
1730eac0b72fSYan Zhao  *
1731eac0b72fSYan Zhao  * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1732eac0b72fSYan Zhao  * level_state returned from a SEAMCALL error is the same as that passed into
1733eac0b72fSYan Zhao  * the SEAMCALL.
1734eac0b72fSYan Zhao  */
tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx * kvm_tdx,u64 err,u64 entry,int level)1735eac0b72fSYan Zhao static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1736eac0b72fSYan Zhao 					     u64 entry, int level)
1737eac0b72fSYan Zhao {
1738eac0b72fSYan Zhao 	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1739eac0b72fSYan Zhao 		return false;
1740eac0b72fSYan Zhao 
1741eac0b72fSYan Zhao 	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1742eac0b72fSYan Zhao 		return false;
1743eac0b72fSYan Zhao 
1744eac0b72fSYan Zhao 	if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1745eac0b72fSYan Zhao 		return false;
1746eac0b72fSYan Zhao 
1747eac0b72fSYan Zhao 	return true;
1748eac0b72fSYan Zhao }
1749eac0b72fSYan Zhao 
/*
 * Block a private GPA mapping in the S-EPT via TDH.MEM.RANGE.BLOCK.
 *
 * Returns 1 when the entry was blocked (caller must follow up with TLB
 * tracking and page removal), 0 when the error indicates the page was only
 * pre-mapped by KVM_TDX_INIT_MEM_REGION and never added (page is unpinned
 * here), or -EIO (VM bugged) on an unexpected SEAMCALL failure.
 */
static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, struct page *page)
{
	int tdx_level = pg_level_to_tdx_sept_level(level);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	/* Align the GPA down to the base of the mapping at @level. */
	gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
	u64 err, entry, level_state;

	/* For now large page isn't supported yet. */
	WARN_ON_ONCE(level != PG_LEVEL_4K);

	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);

	if (unlikely(tdx_operand_busy(err))) {
		/* After no vCPUs enter, the second retry is expected to succeed */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}
	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
		/* Page was never mapped by the TDX module; just unpin it. */
		atomic64_dec(&kvm_tdx->nr_premapped);
		tdx_unpin(kvm, page);
		return 0;
	}

	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
		return -EIO;
	}
	return 1;
}
178202ab5770SIsaku Yamahata 
/*
 * Ensure shared and private EPTs to be flushed on all vCPUs.
 * tdh_mem_track() is the only caller that increases TD epoch. An increase in
 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
 * running in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * being increased to "N + 1".
 *
 * Kicking off all vCPUs after that further results in no vCPUs can run in guest
 * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
 * to increase TD epoch to "N + 2").
 *
 * TDX module will flush EPT on the next TD enter and make vCPUs to run in
 * guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
 * waiting empty IPI handler ack_kick().
 *
 * No action is required to the vCPUs being kicked off since the kicking off
 * occurs certainly after TD epoch increment and before the next
 * tdh_mem_track().
 */
static void tdx_track(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;

	/* If TD isn't finalized, it's before any vcpu running. */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return;

	lockdep_assert_held_write(&kvm->mmu_lock);

	err = tdh_mem_track(&kvm_tdx->td);
	if (unlikely(tdx_operand_busy(err))) {
		/* After no vCPUs enter, the second retry is expected to succeed */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_track(&kvm_tdx->td);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm))
		pr_tdx_error(TDH_MEM_TRACK, err);

	/* Kick all vCPUs out of guest mode so none still runs with the old epoch. */
	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
183122836e1dSIsaku Yamahata 
/*
 * Reclaim a non-leaf S-EPT page back to the host.
 *
 * free_external_spt() is only called after the HKID has been freed while the
 * TD is being torn down.  KVM doesn't (yet) zap page table pages in the mirror
 * page table while the TD is active, although guest pages mapped in the mirror
 * page table can be zapped while the TD is active, e.g. for shared <-> private
 * conversion and slot move/deletion.
 */
int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
			      enum pg_level level, void *private_spt)
{
	if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
		return -EINVAL;

	/*
	 * The HKID assigned to this TD was already freed and the cache was
	 * already flushed, so no additional flush is needed here.
	 */
	return tdx_reclaim_page(virt_to_page(private_spt));
}
185402ab5770SIsaku Yamahata 
/*
 * Remove a private guest page from the S-EPT: block the entry, perform TLB
 * tracking, then drop the page.  Returns 0 on success, negative on error.
 */
int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
				 enum pg_level level, kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);
	int zapped;

	/*
	 * HKID is released after all private pages have been removed, and set
	 * before any might be populated. Warn if zapping is attempted when
	 * there can't be anything populated in the private EPT.
	 */
	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
		return -EINVAL;

	zapped = tdx_sept_zap_private_spte(kvm, gfn, level, page);
	if (zapped <= 0)
		return zapped;

	/*
	 * TDX requires TLB tracking before dropping private page.  Do
	 * it here, although it is also done later.
	 */
	tdx_track(kvm);

	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
}
188102ab5770SIsaku Yamahata 
tdx_deliver_interrupt(struct kvm_lapic * apic,int delivery_mode,int trig_mode,int vector)188224c12911SIsaku Yamahata void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
188324c12911SIsaku Yamahata 			   int trig_mode, int vector)
188424c12911SIsaku Yamahata {
188524c12911SIsaku Yamahata 	struct kvm_vcpu *vcpu = apic->vcpu;
188624c12911SIsaku Yamahata 	struct vcpu_tdx *tdx = to_tdx(vcpu);
188724c12911SIsaku Yamahata 
188824c12911SIsaku Yamahata 	/* TDX supports only posted interrupt.  No lapic emulation. */
188924c12911SIsaku Yamahata 	__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
189024c12911SIsaku Yamahata 
189124c12911SIsaku Yamahata 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
189224c12911SIsaku Yamahata }
189324c12911SIsaku Yamahata 
tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu * vcpu)1894e6a85781SYan Zhao static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1895e6a85781SYan Zhao {
1896e6a85781SYan Zhao 	u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1897e6a85781SYan Zhao 	u64 eq = vmx_get_exit_qual(vcpu);
1898e6a85781SYan Zhao 
1899e6a85781SYan Zhao 	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1900e6a85781SYan Zhao 		return false;
1901e6a85781SYan Zhao 
1902fd02aa45SPaolo Bonzini 	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1903e6a85781SYan Zhao }
1904e6a85781SYan Zhao 
/*
 * Handle an EPT violation exit from a TD vCPU.  Private-GPA faults are always
 * treated as write faults and retried locally (see the long comment below);
 * shared-GPA faults use the reported exit qualification.  Returns the usual
 * exit-handler convention: 1 to resume the guest, 0 to exit to userspace,
 * negative on error.
 */
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qual;
	gpa_t gpa = to_tdx(vcpu)->exit_gpa;
	bool local_retry = false;
	int ret;

	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
		/* Guest accessed un-accepted memory: non-recoverable, kill the VM. */
		if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
				gpa, vcpu->vcpu_id);
			kvm_vm_dead(vcpu->kvm);
			return -EIO;
		}
		/*
		 * Always treat SEPT violations as write faults.  Ignore the
		 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
		 * TD private pages are always RWX in the SEPT tables,
		 * i.e. they're always mapped writable.  Just as importantly,
		 * treating SEPT violations as write faults is necessary to
		 * avoid COW allocations, which will cause TDAUGPAGE failures
		 * due to aliasing a single HPA to multiple GPAs.
		 */
		exit_qual = EPT_VIOLATION_ACC_WRITE;

		/* Only private GPA triggers zero-step mitigation */
		local_retry = true;
	} else {
		exit_qual = vmx_get_exit_qual(vcpu);
		/*
		 * EPT violation due to instruction fetch should never be
		 * triggered from shared memory in TDX guest.  If such EPT
		 * violation occurs, treat it as broken hardware.
		 */
		if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
			return -EIO;
	}

	trace_kvm_page_fault(vcpu, gpa, exit_qual);

	/*
	 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
	 * mapping in TDX.
	 *
	 * KVM may return RET_PF_RETRY for private GPA due to
	 * - contentions when atomically updating SPTEs of the mirror page table
	 * - in-progress GFN invalidation or memslot removal.
	 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
	 *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
	 *   or certain TDCALLs.
	 *
	 * If TDH.VP.ENTER is invoked more times than the threshold set by the
	 * TDX module before KVM resolves the private GPA mapping, the TDX
	 * module will activate zero-step mitigation during TDH.VP.ENTER. This
	 * process acquires an SEPT tree lock in the TDX module, leading to
	 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
	 * operations on other vCPUs.
	 *
	 * Breaking out of local retries for kvm_vcpu_has_events() is for
	 * interrupt injection. kvm_vcpu_has_events() should not see pending
	 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
	 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
	 * the guest even if the IRQ/NMI can't be delivered.
	 *
	 * Note: even without breaking out of local retries, zero-step
	 * mitigation may still occur due to
	 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
	 * - a single RIP causing EPT violations for more GFNs than the
	 *   threshold count.
	 * This is safe, as triggering zero-step mitigation only introduces
	 * contentions to page installation SEAMCALLs on other vCPUs, which will
	 * handle retries locally in their EPT violation handlers.
	 */
	while (1) {
		ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);

		if (ret != RET_PF_RETRY || !local_retry)
			break;

		if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
			break;

		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
			ret = -EIO;
			break;
		}

		cond_resched();
	}
	return ret;
}
1996da407fe4SIsaku Yamahata 
tdx_complete_emulated_msr(struct kvm_vcpu * vcpu,int err)1997081385dbSIsaku Yamahata int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1998081385dbSIsaku Yamahata {
1999081385dbSIsaku Yamahata 	if (err) {
2000081385dbSIsaku Yamahata 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2001081385dbSIsaku Yamahata 		return 1;
2002081385dbSIsaku Yamahata 	}
2003081385dbSIsaku Yamahata 
2004081385dbSIsaku Yamahata 	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2005081385dbSIsaku Yamahata 		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2006081385dbSIsaku Yamahata 
2007081385dbSIsaku Yamahata 	return 1;
2008081385dbSIsaku Yamahata }
2009081385dbSIsaku Yamahata 
2010081385dbSIsaku Yamahata 
/*
 * Top-level TD-exit handler: classify the TDH.VP.ENTER return value and
 * dispatch on the VMX exit reason.  Returns 1 to resume the guest, 0 to exit
 * to userspace, or a negative error code.
 */
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u64 vp_enter_ret = tdx->vp_enter_ret;
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);

	/* Exit was fully handled in the fastpath; just re-enter the guest. */
	if (fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
		KVM_BUG_ON(1, vcpu->kvm);
		return -EIO;
	}

	/*
	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
	 * TDX_SEAMCALL_VMFAILINVALID.
	 */
	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
		KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
		goto unhandled_exit;
	}

	if (unlikely(tdx_failed_vmentry(vcpu))) {
		/*
		 * If the guest state is protected, that means off-TD debug is
		 * not enabled, TDX_NON_RECOVERABLE must be set.
		 */
		WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
				!(vp_enter_ret & TDX_NON_RECOVERABLE));
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/* Triple fault is handled below even if flagged as an error. */
	if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
		exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
		kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
		goto unhandled_exit;
	}

	WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
		     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);

	switch (exit_reason.basic) {
	case EXIT_REASON_TRIPLE_FAULT:
		vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
		vcpu->mmio_needed = 0;
		return 0;
	case EXIT_REASON_EXCEPTION_NMI:
		return tdx_handle_exception_nmi(vcpu);
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		++vcpu->stat.irq_exits;
		return 1;
	case EXIT_REASON_CPUID:
		return tdx_emulate_cpuid(vcpu);
	case EXIT_REASON_HLT:
		return kvm_emulate_halt_noskip(vcpu);
	case EXIT_REASON_TDCALL:
		return handle_tdvmcall(vcpu);
	case EXIT_REASON_VMCALL:
		return tdx_emulate_vmcall(vcpu);
	case EXIT_REASON_IO_INSTRUCTION:
		return tdx_emulate_io(vcpu);
	case EXIT_REASON_MSR_READ:
		/* TDVMCALL passes the MSR index in r12. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		return kvm_emulate_rdmsr(vcpu);
	case EXIT_REASON_MSR_WRITE:
		/* TDVMCALL: index in r12, 64-bit value in r13 split into EAX:EDX. */
		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
		return kvm_emulate_wrmsr(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		return tdx_emulate_mmio(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return tdx_handle_ept_violation(vcpu);
	case EXIT_REASON_OTHER_SMI:
		/*
		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
		 * TD guest vCPU is running) will cause VM exit to TDX module,
		 * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
		 * and handled by kernel handler right away.
		 *
		 * The Other SMI exit can also be caused by the SEAM non-root
		 * machine check delivered via Machine Check System Management
		 * Interrupt (MSMI), but it has already been handled by the
		 * kernel machine check handler, i.e., the memory page has been
		 * marked as poisoned and it won't be freed to the free list
		 * when the TDX guest is terminated (the TDX module marks the
		 * guest as dead and prevent it from further running when
		 * machine check happens in SEAM non-root).
		 *
		 * - A MSMI will not reach here, it's handled as non_recoverable
		 *   case above.
		 * - If it's not an MSMI, no need to do anything here.
		 */
		return 1;
	default:
		break;
	}

unhandled_exit:
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
	vcpu->run->internal.ndata = 2;
	vcpu->run->internal.data[0] = vp_enter_ret;
	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
	return 0;
}
2121095b71a0SIsaku Yamahata 
/*
 * Report exit information for tracing/debug.  A full exit reason of -1u
 * indicates no valid exit data, in which case all info fields are zeroed.
 * The error code is always reported as zero for TDX.
 */
void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	*reason = tdx->vt.exit_reason.full;
	*error_code = 0;

	if (*reason == -1u) {
		*info1 = 0;
		*info2 = 0;
		*intr_info = 0;
		return;
	}

	*info1 = vmx_get_exit_qual(vcpu);
	*info2 = tdx->ext_exit_qualification;
	*intr_info = vmx_get_intr_info(vcpu);
}
2140095b71a0SIsaku Yamahata 
tdx_has_emulated_msr(u32 index)2141dd50294fSIsaku Yamahata bool tdx_has_emulated_msr(u32 index)
2142dd50294fSIsaku Yamahata {
2143dd50294fSIsaku Yamahata 	switch (index) {
2144dd50294fSIsaku Yamahata 	case MSR_IA32_UCODE_REV:
2145dd50294fSIsaku Yamahata 	case MSR_IA32_ARCH_CAPABILITIES:
2146dd50294fSIsaku Yamahata 	case MSR_IA32_POWER_CTL:
2147dd50294fSIsaku Yamahata 	case MSR_IA32_CR_PAT:
214826eab9aeSBinbin Wu 	case MSR_MTRRcap:
214926eab9aeSBinbin Wu 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
215026eab9aeSBinbin Wu 	case MSR_MTRRdefType:
2151dd50294fSIsaku Yamahata 	case MSR_IA32_TSC_DEADLINE:
2152dd50294fSIsaku Yamahata 	case MSR_IA32_MISC_ENABLE:
2153dd50294fSIsaku Yamahata 	case MSR_PLATFORM_INFO:
2154dd50294fSIsaku Yamahata 	case MSR_MISC_FEATURES_ENABLES:
2155dd50294fSIsaku Yamahata 	case MSR_IA32_APICBASE:
2156dd50294fSIsaku Yamahata 	case MSR_EFER:
21579fc3402aSIsaku Yamahata 	case MSR_IA32_FEAT_CTL:
2158dd50294fSIsaku Yamahata 	case MSR_IA32_MCG_CAP:
2159dd50294fSIsaku Yamahata 	case MSR_IA32_MCG_STATUS:
2160dd50294fSIsaku Yamahata 	case MSR_IA32_MCG_CTL:
2161dd50294fSIsaku Yamahata 	case MSR_IA32_MCG_EXT_CTL:
2162dd50294fSIsaku Yamahata 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2163dd50294fSIsaku Yamahata 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2164dd50294fSIsaku Yamahata 		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2165dd50294fSIsaku Yamahata 	case MSR_KVM_POLL_CONTROL:
2166dd50294fSIsaku Yamahata 		return true;
2167dd50294fSIsaku Yamahata 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2168dd50294fSIsaku Yamahata 		/*
2169dd50294fSIsaku Yamahata 		 * x2APIC registers that are virtualized by the CPU can't be
2170dd50294fSIsaku Yamahata 		 * emulated, KVM doesn't have access to the virtual APIC page.
2171dd50294fSIsaku Yamahata 		 */
2172dd50294fSIsaku Yamahata 		switch (index) {
2173dd50294fSIsaku Yamahata 		case X2APIC_MSR(APIC_TASKPRI):
2174dd50294fSIsaku Yamahata 		case X2APIC_MSR(APIC_PROCPRI):
2175dd50294fSIsaku Yamahata 		case X2APIC_MSR(APIC_EOI):
2176dd50294fSIsaku Yamahata 		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2177dd50294fSIsaku Yamahata 		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2178dd50294fSIsaku Yamahata 		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2179dd50294fSIsaku Yamahata 			return false;
2180dd50294fSIsaku Yamahata 		default:
2181dd50294fSIsaku Yamahata 			return true;
2182dd50294fSIsaku Yamahata 		}
2183dd50294fSIsaku Yamahata 	default:
2184dd50294fSIsaku Yamahata 		return false;
2185dd50294fSIsaku Yamahata 	}
2186dd50294fSIsaku Yamahata }
2187dd50294fSIsaku Yamahata 
tdx_is_read_only_msr(u32 index)2188dd50294fSIsaku Yamahata static bool tdx_is_read_only_msr(u32 index)
2189dd50294fSIsaku Yamahata {
21909fc3402aSIsaku Yamahata 	return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
21919fc3402aSIsaku Yamahata 		index == MSR_IA32_FEAT_CTL;
2192dd50294fSIsaku Yamahata }
2193dd50294fSIsaku Yamahata 
tdx_get_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)2194dd50294fSIsaku Yamahata int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2195dd50294fSIsaku Yamahata {
21969fc3402aSIsaku Yamahata 	switch (msr->index) {
21979fc3402aSIsaku Yamahata 	case MSR_IA32_FEAT_CTL:
21989fc3402aSIsaku Yamahata 		/*
21999fc3402aSIsaku Yamahata 		 * MCE and MCA are advertised via cpuid. Guest kernel could
22009fc3402aSIsaku Yamahata 		 * check if LMCE is enabled or not.
22019fc3402aSIsaku Yamahata 		 */
22029fc3402aSIsaku Yamahata 		msr->data = FEAT_CTL_LOCKED;
22039fc3402aSIsaku Yamahata 		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
22049fc3402aSIsaku Yamahata 			msr->data |= FEAT_CTL_LMCE_ENABLED;
22059fc3402aSIsaku Yamahata 		return 0;
22069fc3402aSIsaku Yamahata 	case MSR_IA32_MCG_EXT_CTL:
22079fc3402aSIsaku Yamahata 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
22089fc3402aSIsaku Yamahata 			return 1;
22099fc3402aSIsaku Yamahata 		msr->data = vcpu->arch.mcg_ext_ctl;
22109fc3402aSIsaku Yamahata 		return 0;
22119fc3402aSIsaku Yamahata 	default:
2212dd50294fSIsaku Yamahata 		if (!tdx_has_emulated_msr(msr->index))
2213dd50294fSIsaku Yamahata 			return 1;
2214dd50294fSIsaku Yamahata 
2215dd50294fSIsaku Yamahata 		return kvm_get_msr_common(vcpu, msr);
2216dd50294fSIsaku Yamahata 	}
22179fc3402aSIsaku Yamahata }
2218dd50294fSIsaku Yamahata 
tdx_set_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)2219dd50294fSIsaku Yamahata int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2220dd50294fSIsaku Yamahata {
22219fc3402aSIsaku Yamahata 	switch (msr->index) {
22229fc3402aSIsaku Yamahata 	case MSR_IA32_MCG_EXT_CTL:
22239fc3402aSIsaku Yamahata 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
22249fc3402aSIsaku Yamahata 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
22259fc3402aSIsaku Yamahata 			return 1;
22269fc3402aSIsaku Yamahata 		vcpu->arch.mcg_ext_ctl = msr->data;
22279fc3402aSIsaku Yamahata 		return 0;
22289fc3402aSIsaku Yamahata 	default:
2229dd50294fSIsaku Yamahata 		if (tdx_is_read_only_msr(msr->index))
2230dd50294fSIsaku Yamahata 			return 1;
2231dd50294fSIsaku Yamahata 
2232dd50294fSIsaku Yamahata 		if (!tdx_has_emulated_msr(msr->index))
2233dd50294fSIsaku Yamahata 			return 1;
2234dd50294fSIsaku Yamahata 
2235dd50294fSIsaku Yamahata 		return kvm_set_msr_common(vcpu, msr);
2236dd50294fSIsaku Yamahata 	}
22379fc3402aSIsaku Yamahata }
2238dd50294fSIsaku Yamahata 
tdx_get_capabilities(struct kvm_tdx_cmd * cmd)223961bb2827SIsaku Yamahata static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
224061bb2827SIsaku Yamahata {
224161bb2827SIsaku Yamahata 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
224261bb2827SIsaku Yamahata 	struct kvm_tdx_capabilities __user *user_caps;
224361bb2827SIsaku Yamahata 	struct kvm_tdx_capabilities *caps = NULL;
224461bb2827SIsaku Yamahata 	int ret = 0;
224561bb2827SIsaku Yamahata 
224661bb2827SIsaku Yamahata 	/* flags is reserved for future use */
224761bb2827SIsaku Yamahata 	if (cmd->flags)
224861bb2827SIsaku Yamahata 		return -EINVAL;
224961bb2827SIsaku Yamahata 
225061bb2827SIsaku Yamahata 	caps = kmalloc(sizeof(*caps) +
225161bb2827SIsaku Yamahata 		       sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
225261bb2827SIsaku Yamahata 		       GFP_KERNEL);
225361bb2827SIsaku Yamahata 	if (!caps)
225461bb2827SIsaku Yamahata 		return -ENOMEM;
225561bb2827SIsaku Yamahata 
225661bb2827SIsaku Yamahata 	user_caps = u64_to_user_ptr(cmd->data);
225761bb2827SIsaku Yamahata 	if (copy_from_user(caps, user_caps, sizeof(*caps))) {
225861bb2827SIsaku Yamahata 		ret = -EFAULT;
225961bb2827SIsaku Yamahata 		goto out;
226061bb2827SIsaku Yamahata 	}
226161bb2827SIsaku Yamahata 
226261bb2827SIsaku Yamahata 	if (caps->cpuid.nent < td_conf->num_cpuid_config) {
226361bb2827SIsaku Yamahata 		ret = -E2BIG;
226461bb2827SIsaku Yamahata 		goto out;
226561bb2827SIsaku Yamahata 	}
226661bb2827SIsaku Yamahata 
226761bb2827SIsaku Yamahata 	ret = init_kvm_tdx_caps(td_conf, caps);
226861bb2827SIsaku Yamahata 	if (ret)
226961bb2827SIsaku Yamahata 		goto out;
227061bb2827SIsaku Yamahata 
227161bb2827SIsaku Yamahata 	if (copy_to_user(user_caps, caps, sizeof(*caps))) {
227261bb2827SIsaku Yamahata 		ret = -EFAULT;
227361bb2827SIsaku Yamahata 		goto out;
227461bb2827SIsaku Yamahata 	}
227561bb2827SIsaku Yamahata 
227661bb2827SIsaku Yamahata 	if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
227761bb2827SIsaku Yamahata 			 caps->cpuid.nent *
227861bb2827SIsaku Yamahata 			 sizeof(caps->cpuid.entries[0])))
227961bb2827SIsaku Yamahata 		ret = -EFAULT;
228061bb2827SIsaku Yamahata 
228161bb2827SIsaku Yamahata out:
228261bb2827SIsaku Yamahata 	/* kfree() accepts NULL. */
228361bb2827SIsaku Yamahata 	kfree(caps);
228461bb2827SIsaku Yamahata 	return ret;
228561bb2827SIsaku Yamahata }
228661bb2827SIsaku Yamahata 
22870186dd29SIsaku Yamahata /*
22880186dd29SIsaku Yamahata  * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
22890186dd29SIsaku Yamahata  * similar to TDX's GPAW. Use this field as the interface for userspace to
22900186dd29SIsaku Yamahata  * configure the GPAW and EPT level for TDs.
22910186dd29SIsaku Yamahata  *
22920186dd29SIsaku Yamahata  * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
22930186dd29SIsaku Yamahata  * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
22940186dd29SIsaku Yamahata  * supported. Value 52 is only supported when the platform supports 5 level
22950186dd29SIsaku Yamahata  * EPT.
22960186dd29SIsaku Yamahata  */
setup_tdparams_eptp_controls(struct kvm_cpuid2 * cpuid,struct td_params * td_params)22970186dd29SIsaku Yamahata static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
22980186dd29SIsaku Yamahata 					struct td_params *td_params)
22990186dd29SIsaku Yamahata {
23000186dd29SIsaku Yamahata 	const struct kvm_cpuid_entry2 *entry;
23010186dd29SIsaku Yamahata 	int guest_pa;
23020186dd29SIsaku Yamahata 
23030186dd29SIsaku Yamahata 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
23040186dd29SIsaku Yamahata 	if (!entry)
23050186dd29SIsaku Yamahata 		return -EINVAL;
23060186dd29SIsaku Yamahata 
23070186dd29SIsaku Yamahata 	guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
23080186dd29SIsaku Yamahata 
23090186dd29SIsaku Yamahata 	if (guest_pa != 48 && guest_pa != 52)
23100186dd29SIsaku Yamahata 		return -EINVAL;
23110186dd29SIsaku Yamahata 
23120186dd29SIsaku Yamahata 	if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
23130186dd29SIsaku Yamahata 		return -EINVAL;
23140186dd29SIsaku Yamahata 
23150186dd29SIsaku Yamahata 	td_params->eptp_controls = VMX_EPTP_MT_WB;
23160186dd29SIsaku Yamahata 	if (guest_pa == 52) {
23170186dd29SIsaku Yamahata 		td_params->eptp_controls |= VMX_EPTP_PWL_5;
23180186dd29SIsaku Yamahata 		td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
23190186dd29SIsaku Yamahata 	} else {
23200186dd29SIsaku Yamahata 		td_params->eptp_controls |= VMX_EPTP_PWL_4;
23210186dd29SIsaku Yamahata 	}
23220186dd29SIsaku Yamahata 
23230186dd29SIsaku Yamahata 	return 0;
23240186dd29SIsaku Yamahata }
23250186dd29SIsaku Yamahata 
setup_tdparams_cpuids(struct kvm_cpuid2 * cpuid,struct td_params * td_params)23260186dd29SIsaku Yamahata static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
23270186dd29SIsaku Yamahata 				 struct td_params *td_params)
23280186dd29SIsaku Yamahata {
23290186dd29SIsaku Yamahata 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
23300186dd29SIsaku Yamahata 	const struct kvm_cpuid_entry2 *entry;
23310186dd29SIsaku Yamahata 	struct tdx_cpuid_value *value;
23320186dd29SIsaku Yamahata 	int i, copy_cnt = 0;
23330186dd29SIsaku Yamahata 
23340186dd29SIsaku Yamahata 	/*
23350186dd29SIsaku Yamahata 	 * td_params.cpuid_values: The number and the order of cpuid_value must
23360186dd29SIsaku Yamahata 	 * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
23370186dd29SIsaku Yamahata 	 * It's assumed that td_params was zeroed.
23380186dd29SIsaku Yamahata 	 */
23390186dd29SIsaku Yamahata 	for (i = 0; i < td_conf->num_cpuid_config; i++) {
23400186dd29SIsaku Yamahata 		struct kvm_cpuid_entry2 tmp;
23410186dd29SIsaku Yamahata 
23420186dd29SIsaku Yamahata 		td_init_cpuid_entry2(&tmp, i);
23430186dd29SIsaku Yamahata 
23440186dd29SIsaku Yamahata 		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
23450186dd29SIsaku Yamahata 					      tmp.function, tmp.index);
23460186dd29SIsaku Yamahata 		if (!entry)
23470186dd29SIsaku Yamahata 			continue;
23480186dd29SIsaku Yamahata 
23496d415778SAdrian Hunter 		if (tdx_unsupported_cpuid(entry))
23506d415778SAdrian Hunter 			return -EINVAL;
23516d415778SAdrian Hunter 
23520186dd29SIsaku Yamahata 		copy_cnt++;
23530186dd29SIsaku Yamahata 
23540186dd29SIsaku Yamahata 		value = &td_params->cpuid_values[i];
23550186dd29SIsaku Yamahata 		value->eax = entry->eax;
23560186dd29SIsaku Yamahata 		value->ebx = entry->ebx;
23570186dd29SIsaku Yamahata 		value->ecx = entry->ecx;
23580186dd29SIsaku Yamahata 		value->edx = entry->edx;
23590186dd29SIsaku Yamahata 
23600186dd29SIsaku Yamahata 		/*
23610186dd29SIsaku Yamahata 		 * TDX module does not accept nonzero bits 16..23 for the
23620186dd29SIsaku Yamahata 		 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
23630186dd29SIsaku Yamahata 		 */
23640186dd29SIsaku Yamahata 		if (tmp.function == 0x80000008)
23650186dd29SIsaku Yamahata 			value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
23660186dd29SIsaku Yamahata 	}
23670186dd29SIsaku Yamahata 
23680186dd29SIsaku Yamahata 	/*
23690186dd29SIsaku Yamahata 	 * Rely on the TDX module to reject invalid configuration, but it can't
23700186dd29SIsaku Yamahata 	 * check of leafs that don't have a proper slot in td_params->cpuid_values
23710186dd29SIsaku Yamahata 	 * to stick then. So fail if there were entries that didn't get copied to
23720186dd29SIsaku Yamahata 	 * td_params.
23730186dd29SIsaku Yamahata 	 */
23740186dd29SIsaku Yamahata 	if (copy_cnt != cpuid->nent)
23750186dd29SIsaku Yamahata 		return -EINVAL;
23760186dd29SIsaku Yamahata 
23770186dd29SIsaku Yamahata 	return 0;
23780186dd29SIsaku Yamahata }
23790186dd29SIsaku Yamahata 
setup_tdparams(struct kvm * kvm,struct td_params * td_params,struct kvm_tdx_init_vm * init_vm)23800186dd29SIsaku Yamahata static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
23810186dd29SIsaku Yamahata 			struct kvm_tdx_init_vm *init_vm)
23820186dd29SIsaku Yamahata {
23830186dd29SIsaku Yamahata 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
23840186dd29SIsaku Yamahata 	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
23850186dd29SIsaku Yamahata 	int ret;
23860186dd29SIsaku Yamahata 
23870186dd29SIsaku Yamahata 	if (kvm->created_vcpus)
23880186dd29SIsaku Yamahata 		return -EBUSY;
23890186dd29SIsaku Yamahata 
23900186dd29SIsaku Yamahata 	if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
23910186dd29SIsaku Yamahata 		return -EINVAL;
23920186dd29SIsaku Yamahata 
23930186dd29SIsaku Yamahata 	if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
23940186dd29SIsaku Yamahata 		return -EINVAL;
23950186dd29SIsaku Yamahata 
23960186dd29SIsaku Yamahata 	td_params->max_vcpus = kvm->max_vcpus;
23970186dd29SIsaku Yamahata 	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
23980186dd29SIsaku Yamahata 	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
23990186dd29SIsaku Yamahata 
24000186dd29SIsaku Yamahata 	td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
24010186dd29SIsaku Yamahata 	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
24020186dd29SIsaku Yamahata 
24030186dd29SIsaku Yamahata 	ret = setup_tdparams_eptp_controls(cpuid, td_params);
24040186dd29SIsaku Yamahata 	if (ret)
24050186dd29SIsaku Yamahata 		return ret;
24060186dd29SIsaku Yamahata 
24070186dd29SIsaku Yamahata 	ret = setup_tdparams_cpuids(cpuid, td_params);
24080186dd29SIsaku Yamahata 	if (ret)
24090186dd29SIsaku Yamahata 		return ret;
24100186dd29SIsaku Yamahata 
24110186dd29SIsaku Yamahata #define MEMCPY_SAME_SIZE(dst, src)				\
24120186dd29SIsaku Yamahata 	do {							\
24130186dd29SIsaku Yamahata 		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
24140186dd29SIsaku Yamahata 		memcpy((dst), (src), sizeof(dst));		\
24150186dd29SIsaku Yamahata 	} while (0)
24160186dd29SIsaku Yamahata 
24170186dd29SIsaku Yamahata 	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
24180186dd29SIsaku Yamahata 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
24190186dd29SIsaku Yamahata 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
24200186dd29SIsaku Yamahata 
24210186dd29SIsaku Yamahata 	return 0;
24220186dd29SIsaku Yamahata }
24230186dd29SIsaku Yamahata 
/*
 * Create and initialize the TD control structures via SEAMCALLs:
 * allocate an HKID, create the TDR, program the key on every package,
 * add the TDCS pages, then run TDH.MNG.INIT with @td_params.
 *
 * On TDX_OPERAND_INVALID from TDH.MNG.INIT the raw SEAMCALL status is
 * returned through @seamcall_err as a hint to userspace about which
 * operand was rejected.  Returns 0 on success, -EAGAIN when the module
 * ran out of entropy (userspace may retry), negative errno otherwise.
 */
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
			 u64 *seamcall_err)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages;
	struct page **tdcs_pages = NULL;
	struct page *tdr_page;
	int ret, i;
	u64 err, rcx;

	*seamcall_err = 0;
	ret = tdx_guest_keyid_alloc();
	if (ret < 0)
		return ret;
	kvm_tdx->hkid = ret;
	kvm_tdx->misc_cg = get_current_misc_cg();
	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	if (ret)
		goto free_hkid;

	ret = -ENOMEM;

	/*
	 * NOTE(review): incremented only after the misc cgroup charge
	 * succeeds; confirm the free_hkid path does not unconditionally
	 * decrement nr_configured_hkid, else a failed charge underflows it.
	 */
	atomic_inc(&nr_configured_hkid);

	tdr_page = alloc_page(GFP_KERNEL);
	if (!tdr_page)
		goto free_hkid;

	kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
	/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
	kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
	tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
			     GFP_KERNEL | __GFP_ZERO);
	if (!tdcs_pages)
		goto free_tdr;

	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		tdcs_pages[i] = alloc_page(GFP_KERNEL);
		if (!tdcs_pages[i])
			goto free_tdcs;
	}

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		goto free_tdcs;

	cpus_read_lock();

	/*
	 * Need at least one CPU of the package to be online in order to
	 * program all packages for host key id.  Check it.
	 */
	for_each_present_cpu(i)
		cpumask_set_cpu(topology_physical_package_id(i), packages);
	for_each_online_cpu(i)
		cpumask_clear_cpu(topology_physical_package_id(i), packages);
	if (!cpumask_empty(packages)) {
		ret = -EIO;
		/*
		 * Because it's hard for human operator to figure out the
		 * reason, warn it.
		 */
#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
		pr_warn_ratelimited(MSG_ALLPKG);
		goto free_packages;
	}

	/*
	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
	 * lock to prevent it from failure.
	 */
	mutex_lock(&tdx_lock);
	kvm_tdx->td.tdr_page = tdr_page;
	err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
	mutex_unlock(&tdx_lock);

	if (err == TDX_RND_NO_ENTROPY) {
		/* Entropy exhaustion is transient; let userspace retry. */
		ret = -EAGAIN;
		goto free_packages;
	}

	if (WARN_ON_ONCE(err)) {
		pr_tdx_error(TDH_MNG_CREATE, err);
		ret = -EIO;
		goto free_packages;
	}

	for_each_online_cpu(i) {
		int pkg = topology_physical_package_id(i);

		/* Only one CPU per package needs to run the key config. */
		if (cpumask_test_and_set_cpu(pkg, packages))
			continue;

		/*
		 * Program the memory controller in the package with an
		 * encryption key associated to a TDX private host key id
		 * assigned to this TDR.  Concurrent operations on same memory
		 * controller results in TDX_OPERAND_BUSY. No locking needed
		 * beyond the cpus_read_lock() above as it serializes against
		 * hotplug and the first online CPU of the package is always
		 * used. We never have two CPUs in the same socket trying to
		 * program the key.
		 */
		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
				      kvm_tdx, true);
		if (ret)
			break;
	}
	cpus_read_unlock();
	free_cpumask_var(packages);
	if (ret) {
		/* No TDCS page has been added yet; free them all in teardown. */
		i = 0;
		goto teardown;
	}

	kvm_tdx->td.tdcs_pages = tdcs_pages;
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
		if (err == TDX_RND_NO_ENTROPY) {
			/* Here it's hard to allow userspace to retry. */
			ret = -EAGAIN;
			goto teardown;
		}
		if (WARN_ON_ONCE(err)) {
			pr_tdx_error(TDH_MNG_ADDCX, err);
			ret = -EIO;
			goto teardown;
		}
	}

	err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
		/*
		 * Because a user gives operands, don't warn.
		 * Return a hint to the user because it's sometimes hard for the
		 * user to figure out which operand is invalid.  SEAMCALL status
		 * code includes which operand caused invalid operand error.
		 */
		*seamcall_err = err;
		ret = -EINVAL;
		goto teardown;
	} else if (WARN_ON_ONCE(err)) {
		pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
		ret = -EIO;
		goto teardown;
	}

	return 0;

	/*
	 * The sequence for freeing resources from a partially initialized TD
	 * varies based on where in the initialization flow failure occurred.
	 * Simply use the full teardown and destroy, which naturally play nice
	 * with partial initialization.
	 */
teardown:
	/* Only free pages not yet added, so start at 'i' */
	for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i]) {
			__free_page(tdcs_pages[i]);
			tdcs_pages[i] = NULL;
		}
	}
	/*
	 * If td.tdcs_pages was never assigned, the array itself isn't owned
	 * by the TD and must be freed here; otherwise
	 * tdx_reclaim_td_control_pages() owns it.
	 */
	if (!kvm_tdx->td.tdcs_pages)
		kfree(tdcs_pages);

	tdx_mmu_release_hkid(kvm);
	tdx_reclaim_td_control_pages(kvm);

	return ret;

free_packages:
	cpus_read_unlock();
	free_cpumask_var(packages);

free_tdcs:
	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
		if (tdcs_pages[i])
			__free_page(tdcs_pages[i]);
	}
	kfree(tdcs_pages);
	kvm_tdx->td.tdcs_pages = NULL;

free_tdr:
	if (tdr_page)
		__free_page(tdr_page);
	kvm_tdx->td.tdr_page = NULL;	/* was '= 0': use NULL for pointers */

free_hkid:
	tdx_hkid_free(kvm_tdx);

	return ret;
}
26178d032b68SIsaku Yamahata 
/* Thin wrapper over the TDH.MNG.RD SEAMCALL; returns the raw SEAMCALL status. */
static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
				      u64 *data)
{
	return tdh_mng_rd(&tdx->td, field_id, data);
}
2627488808e6SXiaoyao Li 
2628488808e6SXiaoyao Li #define TDX_MD_UNREADABLE_LEAF_MASK	GENMASK(30, 7)
2629488808e6SXiaoyao Li #define TDX_MD_UNREADABLE_SUBLEAF_MASK	GENMASK(31, 7)
2630488808e6SXiaoyao Li 
/*
 * Read one CPUID leaf/sub-leaf out of the TD's metadata and fill @out with
 * it, advancing *@entry_index on success.  Only leaves whose number fits the
 * metadata field encoding (bits 30:7 of leaf and 31:7 of sub-leaf must be
 * zero) are readable.  On any read failure @out's register values are zeroed
 * and -EIO is returned; @out->function/index/flags are left untouched in
 * that case.
 */
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
			  bool sub_leaf_set, int *entry_index,
			  struct kvm_cpuid_entry2 *out)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
	u64 ebx_eax, edx_ecx;
	u64 err = 0;

	/* Only 7 bits of sub-leaf are representable in the field ID. */
	if (sub_leaf > 0b1111111)
		return -EINVAL;

	if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
		return -EINVAL;

	if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
	    sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
		return -EINVAL;

	/*
	 * Layout of the CPUID_VALUES metadata field ID suffix:
	 * bit 23:17, RESERVED: reserved, must be 0;
	 * bit 16,    LEAF_31: leaf number bit 31;
	 * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
	 *                      implicitly 0;
	 * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
	 * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
	 *                         the SUBLEAF_6_0 is all-1.
	 *                         sub-leaf bits 31:7 are implicitly 0;
	 * bit 0,     ELEMENT_I: Element index within field;
	 */
	field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
	field_id |= (leaf & 0x7f) << 9;
	if (sub_leaf_set)
		field_id |= (sub_leaf & 0x7f) << 1;
	else
		field_id |= 0x1fe;	/* SUBLEAF_NA=1, SUBLEAF_6_0=all-1 */

	/* Element 0 holds EBX:EAX packed into one 64-bit value. */
	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
	if (err) //TODO check for specific errors
		goto err_out;

	out->eax = (u32) ebx_eax;
	out->ebx = (u32) (ebx_eax >> 32);

	/* Element 1 (ELEMENT_I bit) holds EDX:ECX. */
	field_id++;
	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
	/*
	 * It's weird that reading edx_ecx fails while reading ebx_eax
	 * succeeded.
	 */
	if (WARN_ON_ONCE(err))
		goto err_out;

	out->ecx = (u32) edx_ecx;
	out->edx = (u32) (edx_ecx >> 32);

	out->function = leaf;
	out->index = sub_leaf;
	out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;

	/*
	 * Work around missing support on old TDX modules, fetch
	 * guest maxpa from gfn_direct_bits.
	 */
	if (leaf == 0x80000008) {
		gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
		unsigned int g_maxpa = __ffs(gpa_bits) + 1;

		out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
	}

	(*entry_index)++;

	return 0;

err_out:
	out->eax = 0;
	out->ebx = 0;
	out->ecx = 0;
	out->edx = 0;

	return -EIO;
}
2714488808e6SXiaoyao Li 
/*
 * KVM_TDX_INIT_VM: copy the variable-sized kvm_tdx_init_vm blob from
 * userspace, validate it, translate it into td_params and initialize the TD
 * via __tdx_td_init().  On success the VM moves to TD_STATE_INITIALIZED and
 * cached TD configuration (TSC offset/multiplier, attributes, xfam, GPAW)
 * is recorded.  cmd->hw_error receives the raw SEAMCALL status when
 * TDH.MNG.INIT rejects an operand.
 */
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_vm *init_vm;
	struct td_params *td_params = NULL;
	int ret;

	/* Both layouts are fixed ABI shared with userspace / the TDX module. */
	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
	BUILD_BUG_ON(sizeof(struct td_params) != 1024);

	/* The TD can only be initialized once. */
	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
		return -EINVAL;

	/* No flags are defined for this command yet. */
	if (cmd->flags)
		return -EINVAL;

	/* Allocate for the worst case: KVM_MAX_CPUID_ENTRIES trailing entries. */
	init_vm = kmalloc(sizeof(*init_vm) +
			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
			  GFP_KERNEL);
	if (!init_vm)
		return -ENOMEM;

	/* Copy the fixed-size header first; it contains cpuid.nent. */
	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
		ret = -EFAULT;
		goto out;
	}

	/* Validate nent before using it to size the second copy. */
	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
		ret = -E2BIG;
		goto out;
	}

	if (copy_from_user(init_vm->cpuid.entries,
			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
		ret = -EFAULT;
		goto out;
	}

	/* Reserved fields and padding must be zero for forward compatibility. */
	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
		ret = -EINVAL;
		goto out;
	}

	if (init_vm->cpuid.padding) {
		ret = -EINVAL;
		goto out;
	}

	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
	if (!td_params) {
		ret = -ENOMEM;
		goto out;
	}

	ret = setup_tdparams(kvm, td_params, init_vm);
	if (ret)
		goto out;

	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
	if (ret)
		goto out;

	/* Cache TD-wide values that KVM needs on hot paths. */
	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
	kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
	kvm_tdx->attributes = td_params->attributes;
	kvm_tdx->xfam = td_params->xfam;

	/* Shared-bit position follows the GPAW chosen in setup_tdparams(). */
	if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
	else
		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;

	kvm_tdx->state = TD_STATE_INITIALIZED;
out:
	/* kfree() accepts NULL. */
	kfree(init_vm);
	kfree(td_params);

	return ret;
}
27960186dd29SIsaku Yamahata 
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	/*
	 * Called the first time a vCPU runs and whenever the shared EPT root
	 * is invalidated.  Only the shared EPT needs flushing from KVM; the
	 * TDX module flushes private-EPT translations in tdh_vp_enter().
	 *
	 * A single-context invalidation would suffice, but it must be keyed
	 * by the private EPTP (the shared EPT borrows the private EPTP as its
	 * TLB ASID).  Rather than read the private EPTP back just for that,
	 * issue a global invalidation and keep this path trivial.
	 */
	ept_sync_global();
}
281522836e1dSIsaku Yamahata 
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * tdx_track() has already been issued from
	 * tdx_sept_remove_private_spte(), so private EPT is guaranteed to be
	 * flushed on the next TD entry; no second tdx_track() is needed here
	 * even when this callback results from zapping private EPT.
	 *
	 * There is no context available to tell which EPT the zap touched,
	 * so invept everything (shared and private) for simplicity, even
	 * though the private side doesn't strictly need it.
	 */
	ept_sync_global();
}
283122836e1dSIsaku Yamahata 
/*
 * KVM_TDX_FINALIZE_VM: issue TDH.MR.FINALIZE to seal the TD's initial
 * measurement.  On success the TD becomes runnable and no further initial
 * memory can be added.
 */
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	/* slots_lock serializes against KVM_TDX_INIT_MEM_REGION, which also takes it. */
	guard(mutex)(&kvm->slots_lock);

	/* The TD must hold an HKID (initialized) and not already be finalized. */
	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;
	/*
	 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
	 * TDH.MEM.PAGE.ADD().
	 */
	if (atomic64_read(&kvm_tdx->nr_premapped))
		return -EINVAL;

	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
	/* Transient contention inside the TDX module: let userspace retry. */
	if (tdx_operand_busy(cmd->hw_error))
		return -EBUSY;
	if (KVM_BUG_ON(cmd->hw_error, kvm)) {
		pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
		return -EIO;
	}

	kvm_tdx->state = TD_STATE_RUNNABLE;
	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
	smp_wmb();
	kvm->arch.pre_fault_allowed = true;
	return 0;
}
2861012426d6SIsaku Yamahata 
/*
 * Dispatch a KVM_MEMORY_ENCRYPT_OP ioctl issued on a TDX VM.  The command
 * struct is copied back to userspace so the kernel-filled hw_error field
 * is visible to the caller.
 */
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
	struct kvm_tdx_cmd tdx_cmd;
	int r;

	if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
		return -EFAULT;

	/*
	 * Userspace should never set hw_error. It is used to fill
	 * hardware-defined error by the kernel.
	 */
	if (tdx_cmd.hw_error)
		return -EINVAL;

	/* Released automatically on every return path below. */
	guard(mutex)(&kvm->lock);

	switch (tdx_cmd.id) {
	case KVM_TDX_CAPABILITIES:
		r = tdx_get_capabilities(&tdx_cmd);
		break;
	case KVM_TDX_INIT_VM:
		r = tdx_td_init(kvm, &tdx_cmd);
		break;
	case KVM_TDX_FINALIZE_VM:
		r = tdx_td_finalize(kvm, &tdx_cmd);
		break;
	default:
		/* Unknown sub-command: nothing to copy back. */
		return -EINVAL;
	}

	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
		r = -EFAULT;

	return r;
}
2901b2aaf38cSIsaku Yamahata 
2902a50f673fSIsaku Yamahata /* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
tdx_td_vcpu_init(struct kvm_vcpu * vcpu,u64 vcpu_rcx)2903a50f673fSIsaku Yamahata static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2904a50f673fSIsaku Yamahata {
2905a50f673fSIsaku Yamahata 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2906a50f673fSIsaku Yamahata 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2907a50f673fSIsaku Yamahata 	struct page *page;
2908a50f673fSIsaku Yamahata 	int ret, i;
2909a50f673fSIsaku Yamahata 	u64 err;
2910a50f673fSIsaku Yamahata 
2911a50f673fSIsaku Yamahata 	page = alloc_page(GFP_KERNEL);
2912a50f673fSIsaku Yamahata 	if (!page)
2913a50f673fSIsaku Yamahata 		return -ENOMEM;
2914a50f673fSIsaku Yamahata 	tdx->vp.tdvpr_page = page;
2915a50f673fSIsaku Yamahata 
2916a50f673fSIsaku Yamahata 	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2917a50f673fSIsaku Yamahata 			       	     GFP_KERNEL);
2918a50f673fSIsaku Yamahata 	if (!tdx->vp.tdcx_pages) {
2919a50f673fSIsaku Yamahata 		ret = -ENOMEM;
2920a50f673fSIsaku Yamahata 		goto free_tdvpr;
2921a50f673fSIsaku Yamahata 	}
2922a50f673fSIsaku Yamahata 
2923a50f673fSIsaku Yamahata 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2924a50f673fSIsaku Yamahata 		page = alloc_page(GFP_KERNEL);
2925a50f673fSIsaku Yamahata 		if (!page) {
2926a50f673fSIsaku Yamahata 			ret = -ENOMEM;
2927a50f673fSIsaku Yamahata 			goto free_tdcx;
2928a50f673fSIsaku Yamahata 		}
2929a50f673fSIsaku Yamahata 		tdx->vp.tdcx_pages[i] = page;
2930a50f673fSIsaku Yamahata 	}
2931a50f673fSIsaku Yamahata 
2932a50f673fSIsaku Yamahata 	err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2933a50f673fSIsaku Yamahata 	if (KVM_BUG_ON(err, vcpu->kvm)) {
2934a50f673fSIsaku Yamahata 		ret = -EIO;
2935a50f673fSIsaku Yamahata 		pr_tdx_error(TDH_VP_CREATE, err);
2936a50f673fSIsaku Yamahata 		goto free_tdcx;
2937a50f673fSIsaku Yamahata 	}
2938a50f673fSIsaku Yamahata 
2939a50f673fSIsaku Yamahata 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2940a50f673fSIsaku Yamahata 		err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2941a50f673fSIsaku Yamahata 		if (KVM_BUG_ON(err, vcpu->kvm)) {
2942a50f673fSIsaku Yamahata 			pr_tdx_error(TDH_VP_ADDCX, err);
2943a50f673fSIsaku Yamahata 			/*
2944a50f673fSIsaku Yamahata 			 * Pages already added are reclaimed by the vcpu_free
2945a50f673fSIsaku Yamahata 			 * method, but the rest are freed here.
2946a50f673fSIsaku Yamahata 			 */
2947a50f673fSIsaku Yamahata 			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2948a50f673fSIsaku Yamahata 				__free_page(tdx->vp.tdcx_pages[i]);
2949a50f673fSIsaku Yamahata 				tdx->vp.tdcx_pages[i] = NULL;
2950a50f673fSIsaku Yamahata 			}
2951a50f673fSIsaku Yamahata 			return -EIO;
2952a50f673fSIsaku Yamahata 		}
2953a50f673fSIsaku Yamahata 	}
2954a50f673fSIsaku Yamahata 
2955a50f673fSIsaku Yamahata 	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2956a50f673fSIsaku Yamahata 	if (KVM_BUG_ON(err, vcpu->kvm)) {
2957a50f673fSIsaku Yamahata 		pr_tdx_error(TDH_VP_INIT, err);
2958a50f673fSIsaku Yamahata 		return -EIO;
2959a50f673fSIsaku Yamahata 	}
2960a50f673fSIsaku Yamahata 
2961a50f673fSIsaku Yamahata 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2962a50f673fSIsaku Yamahata 
2963a50f673fSIsaku Yamahata 	return 0;
2964a50f673fSIsaku Yamahata 
2965a50f673fSIsaku Yamahata free_tdcx:
2966a50f673fSIsaku Yamahata 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2967a50f673fSIsaku Yamahata 		if (tdx->vp.tdcx_pages[i])
2968a50f673fSIsaku Yamahata 			__free_page(tdx->vp.tdcx_pages[i]);
2969a50f673fSIsaku Yamahata 		tdx->vp.tdcx_pages[i] = NULL;
2970a50f673fSIsaku Yamahata 	}
2971a50f673fSIsaku Yamahata 	kfree(tdx->vp.tdcx_pages);
2972a50f673fSIsaku Yamahata 	tdx->vp.tdcx_pages = NULL;
2973a50f673fSIsaku Yamahata 
2974a50f673fSIsaku Yamahata free_tdvpr:
2975a50f673fSIsaku Yamahata 	if (tdx->vp.tdvpr_page)
2976a50f673fSIsaku Yamahata 		__free_page(tdx->vp.tdvpr_page);
2977a50f673fSIsaku Yamahata 	tdx->vp.tdvpr_page = 0;
2978a50f673fSIsaku Yamahata 
2979a50f673fSIsaku Yamahata 	return ret;
2980a50f673fSIsaku Yamahata }
2981a50f673fSIsaku Yamahata 
2982488808e6SXiaoyao Li /* Sometimes reads multipple subleafs. Return how many enties were written. */
tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu * vcpu,u32 leaf,int * entry_index,struct kvm_cpuid_entry2 * output_e)2983488808e6SXiaoyao Li static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2984488808e6SXiaoyao Li 				   struct kvm_cpuid_entry2 *output_e)
2985488808e6SXiaoyao Li {
2986488808e6SXiaoyao Li 	int sub_leaf = 0;
2987488808e6SXiaoyao Li 	int ret;
2988488808e6SXiaoyao Li 
2989488808e6SXiaoyao Li 	/* First try without a subleaf */
2990488808e6SXiaoyao Li 	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2991488808e6SXiaoyao Li 
2992488808e6SXiaoyao Li 	/* If success, or invalid leaf, just give up */
2993488808e6SXiaoyao Li 	if (ret != -EIO)
2994488808e6SXiaoyao Li 		return ret;
2995488808e6SXiaoyao Li 
2996488808e6SXiaoyao Li 	/*
2997488808e6SXiaoyao Li 	 * If the try without a subleaf failed, try reading subleafs until
2998488808e6SXiaoyao Li 	 * failure. The TDX module only supports 6 bits of subleaf index.
2999488808e6SXiaoyao Li 	 */
3000488808e6SXiaoyao Li 	while (1) {
3001488808e6SXiaoyao Li 		/* Keep reading subleafs until there is a failure. */
3002488808e6SXiaoyao Li 		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3003488808e6SXiaoyao Li 			return !sub_leaf;
3004488808e6SXiaoyao Li 
3005488808e6SXiaoyao Li 		sub_leaf++;
3006488808e6SXiaoyao Li 		output_e++;
3007488808e6SXiaoyao Li 	}
3008488808e6SXiaoyao Li 
3009488808e6SXiaoyao Li 	return 0;
3010488808e6SXiaoyao Li }
3011488808e6SXiaoyao Li 
tdx_vcpu_get_cpuid(struct kvm_vcpu * vcpu,struct kvm_tdx_cmd * cmd)3012488808e6SXiaoyao Li static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3013488808e6SXiaoyao Li {
3014488808e6SXiaoyao Li 	struct kvm_cpuid2 __user *output, *td_cpuid;
3015488808e6SXiaoyao Li 	int r = 0, i = 0, leaf;
3016488808e6SXiaoyao Li 	u32 level;
3017488808e6SXiaoyao Li 
3018488808e6SXiaoyao Li 	output = u64_to_user_ptr(cmd->data);
3019488808e6SXiaoyao Li 	td_cpuid = kzalloc(sizeof(*td_cpuid) +
3020488808e6SXiaoyao Li 			sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3021488808e6SXiaoyao Li 			GFP_KERNEL);
3022488808e6SXiaoyao Li 	if (!td_cpuid)
3023488808e6SXiaoyao Li 		return -ENOMEM;
3024488808e6SXiaoyao Li 
3025488808e6SXiaoyao Li 	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3026488808e6SXiaoyao Li 		r = -EFAULT;
3027488808e6SXiaoyao Li 		goto out;
3028488808e6SXiaoyao Li 	}
3029488808e6SXiaoyao Li 
3030488808e6SXiaoyao Li 	/* Read max CPUID for normal range */
3031488808e6SXiaoyao Li 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3032488808e6SXiaoyao Li 		r = -EIO;
3033488808e6SXiaoyao Li 		goto out;
3034488808e6SXiaoyao Li 	}
3035488808e6SXiaoyao Li 	level = td_cpuid->entries[0].eax;
3036488808e6SXiaoyao Li 
3037488808e6SXiaoyao Li 	for (leaf = 1; leaf <= level; leaf++)
3038488808e6SXiaoyao Li 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3039488808e6SXiaoyao Li 
3040488808e6SXiaoyao Li 	/* Read max CPUID for extended range */
3041488808e6SXiaoyao Li 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3042488808e6SXiaoyao Li 		r = -EIO;
3043488808e6SXiaoyao Li 		goto out;
3044488808e6SXiaoyao Li 	}
3045488808e6SXiaoyao Li 	level = td_cpuid->entries[i - 1].eax;
3046488808e6SXiaoyao Li 
3047488808e6SXiaoyao Li 	for (leaf = 0x80000001; leaf <= level; leaf++)
3048488808e6SXiaoyao Li 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3049488808e6SXiaoyao Li 
3050488808e6SXiaoyao Li 	if (td_cpuid->nent < i)
3051488808e6SXiaoyao Li 		r = -E2BIG;
3052488808e6SXiaoyao Li 	td_cpuid->nent = i;
3053488808e6SXiaoyao Li 
3054488808e6SXiaoyao Li 	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3055488808e6SXiaoyao Li 		r = -EFAULT;
3056488808e6SXiaoyao Li 		goto out;
3057488808e6SXiaoyao Li 	}
3058488808e6SXiaoyao Li 
3059488808e6SXiaoyao Li 	if (r == -E2BIG)
3060488808e6SXiaoyao Li 		goto out;
3061488808e6SXiaoyao Li 
3062488808e6SXiaoyao Li 	if (copy_to_user(output->entries, td_cpuid->entries,
3063488808e6SXiaoyao Li 			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3064488808e6SXiaoyao Li 		r = -EFAULT;
3065488808e6SXiaoyao Li 
3066488808e6SXiaoyao Li out:
3067488808e6SXiaoyao Li 	kfree(td_cpuid);
3068488808e6SXiaoyao Li 
3069488808e6SXiaoyao Li 	return r;
3070488808e6SXiaoyao Li }
3071488808e6SXiaoyao Li 
/* KVM_TDX_INIT_VCPU: create and initialize this vCPU's TDX state. */
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	u64 apic_base;
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int ret;

	/* No sub-command flags are defined for KVM_TDX_INIT_VCPU. */
	if (cmd->flags)
		return -EINVAL;

	/* Reject double initialization. */
	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
		return -EINVAL;

	/*
	 * TDX requires X2APIC, userspace is responsible for configuring guest
	 * CPUID accordingly.
	 */
	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
	if (kvm_apic_set_base(vcpu, apic_base, true))
		return -EINVAL;

	/* cmd->data is forwarded to the guest in RCX by tdx_td_vcpu_init(). */
	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
	if (ret)
		return ret;

	/* Enable posted interrupts for this vCPU in the TD VMCS. */
	td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
	td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
	td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);

	tdx->state = VCPU_TD_STATE_INITIALIZED;

	return 0;
}
3105a50f673fSIsaku Yamahata 
/* vcpu_reset hook: a no-op for TDX other than flagging unexpected INITs. */
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	/*
	 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
	 * INIT events.
	 *
	 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
	 * userspace needs to define the vCPU model before KVM can initialize
	 * vCPU state, e.g. to enable x2APIC.
	 */
	WARN_ON_ONCE(init_event);
}
31184cdf243eSIsaku Yamahata 
/* Context passed from tdx_vcpu_init_mem_region() to tdx_gmem_post_populate(). */
struct tdx_gmem_post_populate_arg {
	struct kvm_vcpu *vcpu;	/* vCPU issuing KVM_TDX_INIT_MEM_REGION */
	__u32 flags;		/* cmd->flags, e.g. KVM_TDX_MEASURE_MEMORY_REGION */
};
3123c846b451SIsaku Yamahata 
/*
 * kvm_gmem_populate() callback: map one private 4K page, seed it from the
 * userspace source page via TDH.MEM.PAGE.ADD, and optionally extend the TD
 * measurement over it with TDH.MR.EXTEND.
 */
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				  void __user *src, int order, void *_arg)
{
	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct tdx_gmem_post_populate_arg *arg = _arg;
	struct kvm_vcpu *vcpu = arg->vcpu;
	gpa_t gpa = gfn_to_gpa(gfn);
	u8 level = PG_LEVEL_4K;
	struct page *src_page;
	int ret, i;
	u64 err, entry, level_state;

	/*
	 * Get the source page if it has been faulted in. Return failure if the
	 * source page has been swapped out or unmapped in primary memory.
	 */
	ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
	if (ret < 0)
		return ret;
	if (ret != 1)
		return -ENOMEM;

	/* Pre-fault the private GPA so the S-EPT mapping exists for PAGE.ADD. */
	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
	if (ret < 0)
		goto out;

	/*
	 * The private mem cannot be zapped after kvm_tdp_map_page()
	 * because all paths are covered by slots_lock and the
	 * filemap invalidate lock.  Check that they are indeed enough.
	 */
	if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
		scoped_guard(read_lock, &kvm->mmu_lock) {
			if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
				ret = -EIO;
				goto out;
			}
		}
	}

	ret = 0;
	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
			       src_page, &entry, &level_state);
	if (err) {
		/* Busy means transient contention; anything else is fatal. */
		ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
		goto out;
	}

	/* PAGE.ADD consumed one pending pre-mapped page; account for it. */
	if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
		atomic64_dec(&kvm_tdx->nr_premapped);

	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
		/* TDH.MR.EXTEND measures the page in TDX_EXTENDMR_CHUNKSIZE chunks. */
		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
			err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
					    &level_state);
			if (err) {
				ret = -EIO;
				break;
			}
		}
	}

out:
	put_page(src_page);
	return ret;
}
3191c846b451SIsaku Yamahata 
/*
 * KVM_TDX_INIT_MEM_REGION: add initial guest memory to a not-yet-finalized
 * TD, one 4K page at a time.  The (possibly partially updated) region struct
 * is copied back to userspace so an interrupted call can be resumed.
 */
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_mem_region region;
	struct tdx_gmem_post_populate_arg arg;
	long gmem_ret;
	int ret;

	/* The vCPU must have gone through KVM_TDX_INIT_VCPU first. */
	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
		return -EINVAL;

	/* Serializes against tdx_td_finalize() and memslot changes. */
	guard(mutex)(&kvm->slots_lock);

	/* Once TD is finalized, the initial guest memory is fixed. */
	if (kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
		return -EINVAL;

	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
		return -EFAULT;

	/*
	 * Sanity checks: page alignment, non-empty, no GPA wrap-around, and
	 * the whole range must lie in the TD's private GPA space.
	 */
	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
	    !region.nr_pages ||
	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
		return -EINVAL;

	kvm_mmu_reload(vcpu);
	ret = 0;
	while (region.nr_pages) {
		/* Bail out on signals; userspace can retry with the updated region. */
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		arg = (struct tdx_gmem_post_populate_arg) {
			.vcpu = vcpu,
			.flags = cmd->flags,
		};
		/* Populate exactly one page per iteration via the post-populate hook. */
		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
					     u64_to_user_ptr(region.source_addr),
					     1, tdx_gmem_post_populate, &arg);
		if (gmem_ret < 0) {
			ret = gmem_ret;
			break;
		}

		if (gmem_ret != 1) {
			ret = -EIO;
			break;
		}

		/* Advance to the next page; progress survives an early break. */
		region.source_addr += PAGE_SIZE;
		region.gpa += PAGE_SIZE;
		region.nr_pages--;

		cond_resched();
	}

	/* Always report progress back, even on error. */
	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
		ret = -EFAULT;
	return ret;
}
3260c846b451SIsaku Yamahata 
/* Dispatch a KVM_MEMORY_ENCRYPT_OP ioctl issued on a TDX vCPU fd. */
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct kvm_tdx_cmd cmd;

	/* vCPU sub-commands are only valid for an initialized, unfinalized TD. */
	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
		return -EINVAL;

	if (copy_from_user(&cmd, argp, sizeof(cmd)))
		return -EFAULT;

	/* hw_error is output-only; userspace must leave it clear. */
	if (cmd.hw_error)
		return -EINVAL;

	switch (cmd.id) {
	case KVM_TDX_INIT_VCPU:
		return tdx_vcpu_init(vcpu, &cmd);
	case KVM_TDX_INIT_MEM_REGION:
		return tdx_vcpu_init_mem_region(vcpu, &cmd);
	case KVM_TDX_GET_CPUID:
		return tdx_vcpu_get_cpuid(vcpu, &cmd);
	default:
		return -EINVAL;
	}
}
3293a50f673fSIsaku Yamahata 
/* TDX private memory is only mapped at 4K granularity here; no huge pages. */
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{
	return PG_LEVEL_4K;
}
32980036b87aSIsaku Yamahata 
tdx_online_cpu(unsigned int cpu)3299fcdbdf63SKai Huang static int tdx_online_cpu(unsigned int cpu)
3300fcdbdf63SKai Huang {
3301fcdbdf63SKai Huang 	unsigned long flags;
3302fcdbdf63SKai Huang 	int r;
3303fcdbdf63SKai Huang 
3304fcdbdf63SKai Huang 	/* Sanity check CPU is already in post-VMXON */
3305fcdbdf63SKai Huang 	WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3306fcdbdf63SKai Huang 
3307fcdbdf63SKai Huang 	local_irq_save(flags);
3308fcdbdf63SKai Huang 	r = tdx_cpu_enable();
3309fcdbdf63SKai Huang 	local_irq_restore(flags);
3310fcdbdf63SKai Huang 
3311fcdbdf63SKai Huang 	return r;
3312fcdbdf63SKai Huang }
3313fcdbdf63SKai Huang 
tdx_offline_cpu(unsigned int cpu)33149934d7e5SIsaku Yamahata static int tdx_offline_cpu(unsigned int cpu)
33159934d7e5SIsaku Yamahata {
33169934d7e5SIsaku Yamahata 	int i;
33179934d7e5SIsaku Yamahata 
33189934d7e5SIsaku Yamahata 	/* No TD is running.  Allow any cpu to be offline. */
33199934d7e5SIsaku Yamahata 	if (!atomic_read(&nr_configured_hkid))
33209934d7e5SIsaku Yamahata 		return 0;
33219934d7e5SIsaku Yamahata 
33229934d7e5SIsaku Yamahata 	/*
33239934d7e5SIsaku Yamahata 	 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
33249934d7e5SIsaku Yamahata 	 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
33259934d7e5SIsaku Yamahata 	 * controller with pconfig.  If we have active TDX HKID, refuse to
33269934d7e5SIsaku Yamahata 	 * offline the last online cpu.
33279934d7e5SIsaku Yamahata 	 */
33289934d7e5SIsaku Yamahata 	for_each_online_cpu(i) {
33299934d7e5SIsaku Yamahata 		/*
33309934d7e5SIsaku Yamahata 		 * Found another online cpu on the same package.
33319934d7e5SIsaku Yamahata 		 * Allow to offline.
33329934d7e5SIsaku Yamahata 		 */
33339934d7e5SIsaku Yamahata 		if (i != cpu && topology_physical_package_id(i) ==
33349934d7e5SIsaku Yamahata 				topology_physical_package_id(cpu))
33359934d7e5SIsaku Yamahata 			return 0;
33369934d7e5SIsaku Yamahata 	}
33379934d7e5SIsaku Yamahata 
33389934d7e5SIsaku Yamahata 	/*
33399934d7e5SIsaku Yamahata 	 * This is the last cpu of this package.  Don't offline it.
33409934d7e5SIsaku Yamahata 	 *
33419934d7e5SIsaku Yamahata 	 * Because it's hard for human operator to understand the
33429934d7e5SIsaku Yamahata 	 * reason, warn it.
33439934d7e5SIsaku Yamahata 	 */
33449934d7e5SIsaku Yamahata #define MSG_ALLPKG_ONLINE \
33459934d7e5SIsaku Yamahata 	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
33469934d7e5SIsaku Yamahata 	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
33479934d7e5SIsaku Yamahata 	return -EBUSY;
33489934d7e5SIsaku Yamahata }
33499934d7e5SIsaku Yamahata 
/* Tear down KVM's TDX cpuhp state; caller holds cpus_read_lock(). */
static void __do_tdx_cleanup(void)
{
	/*
	 * Once TDX module is initialized, it cannot be disabled and
	 * re-initialized again w/o runtime update (which isn't
	 * supported by kernel).  Only need to remove the cpuhp here.
	 * The TDX host core code tracks TDX status and can handle
	 * 'multiple enabling' scenario.
	 */
	WARN_ON_ONCE(!tdx_cpuhp_state);
	cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
	tdx_cpuhp_state = 0;
}
3363fcdbdf63SKai Huang 
/* Locked wrapper: __do_tdx_cleanup() requires cpus_read_lock() held. */
static void __tdx_cleanup(void)
{
	cpus_read_lock();
	__do_tdx_cleanup();
	cpus_read_unlock();
}
3370fcdbdf63SKai Huang 
/*
 * Install the TDX cpuhp callbacks and initialize the TDX module.
 * Caller holds cpus_read_lock() (the *_cpuslocked variant is used).
 */
static int __init __do_tdx_bringup(void)
{
	int r;

	/*
	 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
	 * online CPUs before calling tdx_enable(), and on any new
	 * going-online CPU to make sure it is ready for TDX guest.
	 */
	r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
					 "kvm/cpu/tdx:online",
					 tdx_online_cpu, tdx_offline_cpu);
	if (r < 0)
		return r;

	/* CPUHP_AP_ONLINE_DYN returns the dynamically allocated state slot. */
	tdx_cpuhp_state = r;

	r = tdx_enable();
	if (r)
		/* Roll back the cpuhp registration on module-init failure. */
		__do_tdx_cleanup();

	return r;
}
3394fcdbdf63SKai Huang 
__tdx_bringup(void)3395fcdbdf63SKai Huang static int __init __tdx_bringup(void)
3396fcdbdf63SKai Huang {
3397f94f4a97SIsaku Yamahata 	const struct tdx_sys_info_td_conf *td_conf;
3398e0b4f31aSIsaku Yamahata 	int r, i;
3399e0b4f31aSIsaku Yamahata 
3400e0b4f31aSIsaku Yamahata 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3401e0b4f31aSIsaku Yamahata 		/*
3402e0b4f31aSIsaku Yamahata 		 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3403e0b4f31aSIsaku Yamahata 		 * before returning to user space.
3404e0b4f31aSIsaku Yamahata 		 *
3405e0b4f31aSIsaku Yamahata 		 * this_cpu_ptr(user_return_msrs)->registered isn't checked
3406e0b4f31aSIsaku Yamahata 		 * because the registration is done at vcpu runtime by
3407e0b4f31aSIsaku Yamahata 		 * tdx_user_return_msr_update_cache().
3408e0b4f31aSIsaku Yamahata 		 */
3409e0b4f31aSIsaku Yamahata 		tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3410e0b4f31aSIsaku Yamahata 		if (tdx_uret_msrs[i].slot == -1) {
3411e0b4f31aSIsaku Yamahata 			/* If any MSR isn't supported, it is a KVM bug */
3412e0b4f31aSIsaku Yamahata 			pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3413e0b4f31aSIsaku Yamahata 				tdx_uret_msrs[i].msr);
3414e0b4f31aSIsaku Yamahata 			return -EIO;
3415e0b4f31aSIsaku Yamahata 		}
3416e0b4f31aSIsaku Yamahata 	}
3417fcdbdf63SKai Huang 
3418fcdbdf63SKai Huang 	/*
3419fcdbdf63SKai Huang 	 * Enabling TDX requires enabling hardware virtualization first,
3420fcdbdf63SKai Huang 	 * as making SEAMCALLs requires CPU being in post-VMXON state.
3421fcdbdf63SKai Huang 	 */
3422fcdbdf63SKai Huang 	r = kvm_enable_virtualization();
3423fcdbdf63SKai Huang 	if (r)
3424fcdbdf63SKai Huang 		return r;
3425fcdbdf63SKai Huang 
3426fcdbdf63SKai Huang 	cpus_read_lock();
3427fcdbdf63SKai Huang 	r = __do_tdx_bringup();
3428fcdbdf63SKai Huang 	cpus_read_unlock();
3429fcdbdf63SKai Huang 
3430fcdbdf63SKai Huang 	if (r)
3431fcdbdf63SKai Huang 		goto tdx_bringup_err;
3432fcdbdf63SKai Huang 
343345154fb0SKai Huang 	/* Get TDX global information for later use */
343445154fb0SKai Huang 	tdx_sysinfo = tdx_get_sysinfo();
343545154fb0SKai Huang 	if (WARN_ON_ONCE(!tdx_sysinfo)) {
343645154fb0SKai Huang 		r = -EINVAL;
343745154fb0SKai Huang 		goto get_sysinfo_err;
343845154fb0SKai Huang 	}
343945154fb0SKai Huang 
344061bb2827SIsaku Yamahata 	/* Check TDX module and KVM capabilities */
344161bb2827SIsaku Yamahata 	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
344261bb2827SIsaku Yamahata 	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
344361bb2827SIsaku Yamahata 		goto get_sysinfo_err;
344461bb2827SIsaku Yamahata 
344561bb2827SIsaku Yamahata 	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
344661bb2827SIsaku Yamahata 		goto get_sysinfo_err;
344761bb2827SIsaku Yamahata 
3448fcdbdf63SKai Huang 	/*
3449f94f4a97SIsaku Yamahata 	 * TDX has its own limit of maximum vCPUs it can support for all
3450f94f4a97SIsaku Yamahata 	 * TDX guests in addition to KVM_MAX_VCPUS.  Userspace needs to
3451f94f4a97SIsaku Yamahata 	 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3452f94f4a97SIsaku Yamahata 	 * extension on per-VM basis.
3453f94f4a97SIsaku Yamahata 	 *
3454f94f4a97SIsaku Yamahata 	 * TDX module reports such limit via the MAX_VCPU_PER_TD global
3455f94f4a97SIsaku Yamahata 	 * metadata.  Different modules may report different values.
3456f94f4a97SIsaku Yamahata 	 * Some old module may also not support this metadata (in which
3457f94f4a97SIsaku Yamahata 	 * case this limit is U16_MAX).
3458f94f4a97SIsaku Yamahata 	 *
3459f94f4a97SIsaku Yamahata 	 * In practice, the reported value reflects the maximum logical
3460f94f4a97SIsaku Yamahata 	 * CPUs that ALL the platforms that the module supports can
3461f94f4a97SIsaku Yamahata 	 * possibly have.
3462f94f4a97SIsaku Yamahata 	 *
3463f94f4a97SIsaku Yamahata 	 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3464f94f4a97SIsaku Yamahata 	 * result in an unpredictable ABI.  KVM instead always advertise
3465f94f4a97SIsaku Yamahata 	 * the number of logical CPUs the platform has as the maximum
3466f94f4a97SIsaku Yamahata 	 * vCPUs for TDX guests.
3467f94f4a97SIsaku Yamahata 	 *
3468f94f4a97SIsaku Yamahata 	 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3469f94f4a97SIsaku Yamahata 	 * smaller than the number of logical CPUs, otherwise KVM will
3470f94f4a97SIsaku Yamahata 	 * report an unsupported value to userspace.
3471f94f4a97SIsaku Yamahata 	 *
3472f94f4a97SIsaku Yamahata 	 * Note, a platform with TDX enabled in the BIOS cannot support
3473f94f4a97SIsaku Yamahata 	 * physical CPU hotplug, and TDX requires the BIOS has marked
3474f94f4a97SIsaku Yamahata 	 * all logical CPUs in MADT table as enabled.  Just use
3475f94f4a97SIsaku Yamahata 	 * num_present_cpus() for the number of logical CPUs.
3476f94f4a97SIsaku Yamahata 	 */
3477f94f4a97SIsaku Yamahata 	td_conf = &tdx_sysinfo->td_conf;
3478f94f4a97SIsaku Yamahata 	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3479f94f4a97SIsaku Yamahata 		pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3480f94f4a97SIsaku Yamahata 				td_conf->max_vcpus_per_td, num_present_cpus());
3481f94f4a97SIsaku Yamahata 		r = -EINVAL;
3482f94f4a97SIsaku Yamahata 		goto get_sysinfo_err;
3483f94f4a97SIsaku Yamahata 	}
3484f94f4a97SIsaku Yamahata 
34857c035beaSZhiming Hu 	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
34867c035beaSZhiming Hu 		r = -EINVAL;
34877c035beaSZhiming Hu 		goto get_sysinfo_err;
34887c035beaSZhiming Hu 	}
34897c035beaSZhiming Hu 
3490f94f4a97SIsaku Yamahata 	/*
3491fcdbdf63SKai Huang 	 * Leave hardware virtualization enabled after TDX is enabled
3492fcdbdf63SKai Huang 	 * successfully.  TDX CPU hotplug depends on this.
3493fcdbdf63SKai Huang 	 */
3494fcdbdf63SKai Huang 	return 0;
349561bb2827SIsaku Yamahata 
349645154fb0SKai Huang get_sysinfo_err:
349745154fb0SKai Huang 	__tdx_cleanup();
3498fcdbdf63SKai Huang tdx_bringup_err:
3499fcdbdf63SKai Huang 	kvm_disable_virtualization();
3500fcdbdf63SKai Huang 	return r;
3501fcdbdf63SKai Huang }
3502fcdbdf63SKai Huang 
tdx_cleanup(void)3503fcdbdf63SKai Huang void tdx_cleanup(void)
3504fcdbdf63SKai Huang {
3505fcdbdf63SKai Huang 	if (enable_tdx) {
35067c035beaSZhiming Hu 		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3507fcdbdf63SKai Huang 		__tdx_cleanup();
3508fcdbdf63SKai Huang 		kvm_disable_virtualization();
3509fcdbdf63SKai Huang 	}
3510fcdbdf63SKai Huang }
3511fcdbdf63SKai Huang 
tdx_bringup(void)3512fcdbdf63SKai Huang int __init tdx_bringup(void)
3513fcdbdf63SKai Huang {
3514d789fa6eSIsaku Yamahata 	int r, i;
3515d789fa6eSIsaku Yamahata 
3516d789fa6eSIsaku Yamahata 	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3517d789fa6eSIsaku Yamahata 	for_each_possible_cpu(i)
3518d789fa6eSIsaku Yamahata 		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3519fcdbdf63SKai Huang 
3520fcdbdf63SKai Huang 	if (!enable_tdx)
3521fcdbdf63SKai Huang 		return 0;
3522fcdbdf63SKai Huang 
3523427a6486SIsaku Yamahata 	if (!enable_ept) {
3524427a6486SIsaku Yamahata 		pr_err("EPT is required for TDX\n");
3525427a6486SIsaku Yamahata 		goto success_disable_tdx;
3526427a6486SIsaku Yamahata 	}
3527427a6486SIsaku Yamahata 
3528427a6486SIsaku Yamahata 	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3529427a6486SIsaku Yamahata 		pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3530427a6486SIsaku Yamahata 		goto success_disable_tdx;
3531427a6486SIsaku Yamahata 	}
3532427a6486SIsaku Yamahata 
3533f65916aeSIsaku Yamahata 	if (!enable_apicv) {
3534f65916aeSIsaku Yamahata 		pr_err("APICv is required for TDX\n");
3535f65916aeSIsaku Yamahata 		goto success_disable_tdx;
3536f65916aeSIsaku Yamahata 	}
3537f65916aeSIsaku Yamahata 
35386bfa6d85SIsaku Yamahata 	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
35396bfa6d85SIsaku Yamahata 		pr_err("tdx: OSXSAVE is required for TDX\n");
35406bfa6d85SIsaku Yamahata 		goto success_disable_tdx;
35416bfa6d85SIsaku Yamahata 	}
35426bfa6d85SIsaku Yamahata 
35438d032b68SIsaku Yamahata 	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
35448d032b68SIsaku Yamahata 		pr_err("tdx: MOVDIR64B is required for TDX\n");
35458d032b68SIsaku Yamahata 		goto success_disable_tdx;
35468d032b68SIsaku Yamahata 	}
35478d032b68SIsaku Yamahata 
354890fe64a9SYan Zhao 	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
354990fe64a9SYan Zhao 		pr_err("Self-snoop is required for TDX\n");
355090fe64a9SYan Zhao 		goto success_disable_tdx;
355190fe64a9SYan Zhao 	}
355290fe64a9SYan Zhao 
3553fcdbdf63SKai Huang 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3554fcdbdf63SKai Huang 		pr_err("tdx: no TDX private KeyIDs available\n");
3555fcdbdf63SKai Huang 		goto success_disable_tdx;
3556fcdbdf63SKai Huang 	}
3557fcdbdf63SKai Huang 
3558fcdbdf63SKai Huang 	if (!enable_virt_at_load) {
3559fcdbdf63SKai Huang 		pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
3560fcdbdf63SKai Huang 		goto success_disable_tdx;
3561fcdbdf63SKai Huang 	}
3562fcdbdf63SKai Huang 
3563fcdbdf63SKai Huang 	/*
3564fcdbdf63SKai Huang 	 * Ideally KVM should probe whether TDX module has been loaded
3565fcdbdf63SKai Huang 	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
3566fcdbdf63SKai Huang 	 * to probe whether the module is loaded (there is no CPUID or MSR
3567fcdbdf63SKai Huang 	 * for that), and making SEAMCALL requires enabling virtualization
3568fcdbdf63SKai Huang 	 * first, just like the rest steps of bringing up TDX module.
3569fcdbdf63SKai Huang 	 *
3570fcdbdf63SKai Huang 	 * So, for simplicity do everything in __tdx_bringup(); the first
3571fcdbdf63SKai Huang 	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
3572fcdbdf63SKai Huang 	 * only complication is having to make sure that initialization
3573fcdbdf63SKai Huang 	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3574fcdbdf63SKai Huang 	 * cases.
3575fcdbdf63SKai Huang 	 */
3576fcdbdf63SKai Huang 	r = __tdx_bringup();
3577fcdbdf63SKai Huang 	if (r) {
3578fcdbdf63SKai Huang 		/*
3579fcdbdf63SKai Huang 		 * Disable TDX only but don't fail to load module if
3580fcdbdf63SKai Huang 		 * the TDX module could not be loaded.  No need to print
3581fcdbdf63SKai Huang 		 * message saying "module is not loaded" because it was
3582fcdbdf63SKai Huang 		 * printed when the first SEAMCALL failed.
3583fcdbdf63SKai Huang 		 */
3584fcdbdf63SKai Huang 		if (r == -ENODEV)
3585fcdbdf63SKai Huang 			goto success_disable_tdx;
3586fcdbdf63SKai Huang 
3587fcdbdf63SKai Huang 		enable_tdx = 0;
3588fcdbdf63SKai Huang 	}
3589fcdbdf63SKai Huang 
3590fcdbdf63SKai Huang 	return r;
3591fcdbdf63SKai Huang 
3592fcdbdf63SKai Huang success_disable_tdx:
3593fcdbdf63SKai Huang 	enable_tdx = 0;
3594fcdbdf63SKai Huang 	return 0;
3595fcdbdf63SKai Huang }
3596