// SPDX-License-Identifier: GPL-2.0
#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <asm/cpufeature.h>
#include <asm/fpu/xcr.h>
#include <linux/misc_cgroup.h>
#include <linux/mmu_context.h>
#include <asm/tdx.h>
#include "capabilities.h"
#include "mmu.h"
#include "x86_ops.h"
#include "lapic.h"
#include "tdx.h"
#include "vmx.h"
#include "mmu/spte.h"
#include "common.h"
#include "posted_intr.h"
#include "irq.h"
#include <trace/events/kvm.h>
#include "trace.h"

#pragma GCC poison to_vmx

#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define pr_tdx_error(__fn, __err)	\
	pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)

#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...)	\
	pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)

#define pr_tdx_error_1(__fn, __err, __rcx)	\
	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)

#define pr_tdx_error_2(__fn, __err, __rcx, __rdx)	\
	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)

#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8)	\
	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)

bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);

#define TDX_SHARED_BIT_PWL_5	gpa_to_gfn(BIT_ULL(51))
#define TDX_SHARED_BIT_PWL_4	gpa_to_gfn(BIT_ULL(47))

static enum cpuhp_state tdx_cpuhp_state;

static const struct tdx_sys_info *tdx_sysinfo;

void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
{
	KVM_BUG_ON(1, tdx->vcpu.kvm);
	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
}

void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
		      u64 val, u64 err)
{
	KVM_BUG_ON(1, tdx->vcpu.kvm);
	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
}

#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)

static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_tdx, kvm);
}

static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_tdx, vcpu);
}

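/*
 * Intersect KVM's supported TD attributes with the TDX module's constraints:
 * attributes_fixed1 bits are mandatory, and only attributes_fixed0 bits may
 * be set. Returning 0 signals that KVM can't satisfy the module's fixed1
 * requirements.
 */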
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
{
	u64 val = KVM_SUPPORTED_TD_ATTRS;

	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
		return 0;

	val &= td_conf->attributes_fixed0;

	return val;
}

static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
{
	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;

	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
		return 0;

	val &= td_conf->xfam_fixed0;

	return val;
}

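/*
 * CPUID.0x80000008:EAX[23:16] holds the guest physical address width.
 * These helpers read and overwrite that field; KVM repurposes it as the
 * userspace interface for configuring the TD's GPAW (see below).
 */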
static int tdx_get_guest_phys_addr_bits(const u32 eax)
{
	return (eax & GENMASK(23, 16)) >> 16;
}

static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
{
	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
}

#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))

static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
{
	return entry->function == 7 && entry->index == 0 &&
	       (entry->ebx & TDX_FEATURE_TSX);
}

static void clear_tsx(struct kvm_cpuid_entry2 *entry)
{
	entry->ebx &= ~TDX_FEATURE_TSX;
}

static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
{
	return entry->function == 7 && entry->index == 0 &&
	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
}

static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
{
	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
}

static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
{
	if (has_tsx(entry))
		clear_tsx(entry);

	if (has_waitpkg(entry))
		clear_waitpkg(entry);
}

static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
{
	return has_tsx(entry) || has_waitpkg(entry);
}

#define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)

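/*
 * The TDX module reports each directly-configurable CPUID leaf as a packed
 * u64 (function in bits 31:0, subleaf in bits 63:32) plus two u64 values
 * holding EAX/EBX and ECX/EDX; unpack one such entry into a
 * struct kvm_cpuid_entry2.
 */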
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
{
	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;

	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;

	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
		entry->index = 0;

	/*
	 * The TDX module doesn't allow configuring the guest phys addr bits
	 * (EAX[23:16]). However, KVM uses them as the interface for userspace
	 * to configure the GPAW, so report these bits as configurable.
	 */
	if (entry->function == 0x80000008)
		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);

	tdx_clear_unsupported_cpuid(entry);
}

static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
			     struct kvm_tdx_capabilities *caps)
{
	int i;

	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
	if (!caps->supported_attrs)
		return -EIO;

	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
	if (!caps->supported_xfam)
		return -EIO;

	caps->cpuid.nent = td_conf->num_cpuid_config;

	for (i = 0; i < td_conf->num_cpuid_config; i++)
		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);

	return 0;
}

/*
 * Some SEAMCALLs acquire the TDX module globally, and can fail with
 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
 */
static DEFINE_MUTEX(tdx_lock);

static atomic_t nr_configured_hkid;

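/*
 * The low bits of a SEAMCALL error code encode which operand was busy;
 * mask them off so any "busy" error matches TDX_OPERAND_BUSY.
 */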
static bool tdx_operand_busy(u64 err)
{
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}

/*
 * A per-CPU list of TD vCPUs associated with a given CPU.
 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
 * list.
 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
 *   the old CPU during the IPI callback running on the old CPU, and then added
 *   to the per-CPU list of the new CPU.
 * - When a TD is tearing down, all vCPUs are disassociated from their current
 *   running CPUs and removed from the per-CPU list during the IPI callback
 *   running on those CPUs.
 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
 *   associated TD vCPUs and remove them from the per-CPU list.
 */
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);

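/*
 * Per the TDX GHCI spec, a TDVMCALL carries its exit type in R10 and the
 * requested leaf in R11. On the way back into the guest, R10 holds the
 * return code and R11 the return value.
 */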
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r10;
}

static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
{
	return to_tdx(vcpu)->vp_enter_args.r11;
}

static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
						     long val)
{
	to_tdx(vcpu)->vp_enter_args.r10 = val;
}

static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
						    unsigned long val)
{
	to_tdx(vcpu)->vp_enter_args.r11 = val;
}

static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
	tdx_guest_keyid_free(kvm_tdx->hkid);
	kvm_tdx->hkid = -1;
	atomic_dec(&nr_configured_hkid);
	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
	put_misc_cg(kvm_tdx->misc_cg);
	kvm_tdx->misc_cg = NULL;
}

static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
{
	return kvm_tdx->hkid > 0;
}

static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
	lockdep_assert_irqs_disabled();

	list_del(&to_tdx(vcpu)->cpu_list);

	/*
	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
	 * to its list before it's deleted from this CPU's list.
	 */
	smp_wmb();

	vcpu->cpu = -1;
}

static void tdx_clear_page(struct page *page)
{
	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
	void *dest = page_to_virt(page);
	unsigned long i;

	/*
	 * The page could have been poisoned. MOVDIR64B also clears
	 * the poison bit so the kernel can safely use the page again.
	 */
	for (i = 0; i < PAGE_SIZE; i += 64)
		movdir64b(dest + i, zero_page);
	/*
	 * MOVDIR64B store uses WC buffer. Prevent following memory reads
	 * from seeing potentially poisoned cache.
	 */
	__mb();
}

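/*
 * Block vCPU entry while an S-EPT-zap SEAMCALL is being retried: set the
 * flag, then kick all vCPUs out of guest mode; the stop side clears the
 * flag once the retry has completed.
 */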
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
}

/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
{
	u64 err, rcx, rdx, r8;

	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);

	/*
	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
	 * before the HKID is released and control pages have also been
	 * released at this point, so there is no possibility of contention.
	 */
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
		return -EIO;
	}
	return 0;
}

static int tdx_reclaim_page(struct page *page)
{
	int r;

	r = __tdx_reclaim_page(page);
	if (!r)
		tdx_clear_page(page);
	return r;
}

/*
 * Reclaim the TD control page(s) which are crypto-protected by the TDX
 * guest's private KeyID. Assume the cache associated with the TDX private
 * KeyID has been flushed.
 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
{
	/*
	 * Leak the page if the kernel failed to reclaim it.
	 * The kernel cannot use it safely anymore.
	 */
	if (tdx_reclaim_page(ctrl_page))
		return;

	__free_page(ctrl_page);
}

struct tdx_flush_vp_arg {
	struct kvm_vcpu *vcpu;
	u64 err;
};

static void tdx_flush_vp(void *_arg)
{
	struct tdx_flush_vp_arg *arg = _arg;
	struct kvm_vcpu *vcpu = arg->vcpu;
	u64 err;

	arg->err = 0;
	lockdep_assert_irqs_disabled();

	/* Task migration can race with CPU offlining. */
	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
		return;

	/*
	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
	 * list tracking still needs to be updated so that it's correct if/when
	 * the vCPU does get initialized.
	 */
	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
		/*
		 * No need to retry. The TDX resources needed for TDH.VP.FLUSH
		 * are: TDVPR as exclusive, TDR as shared, and TDCS as shared.
		 * This flush function is called when destroying a vCPU/TD or
		 * during vCPU migration. No other thread uses TDVPR in those
		 * cases.
		 */
		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
			/*
			 * This function is called in IPI context. Do not use
			 * printk to avoid taking the console semaphore; the
			 * caller prints the error message instead.
			 */
			arg->err = err;
		}
	}

	tdx_disassociate_vp(vcpu);
}

static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	struct tdx_flush_vp_arg arg = {
		.vcpu = vcpu,
	};
	int cpu = vcpu->cpu;

	if (unlikely(cpu == -1))
		return;

	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
	if (KVM_BUG_ON(arg.err, vcpu->kvm))
		pr_tdx_error(TDH_VP_FLUSH, arg.err);
}

void tdx_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
	struct tdx_flush_vp_arg arg;
	struct vcpu_tdx *tdx, *tmp;
	unsigned long flags;

	local_irq_save(flags);
	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
		arg.vcpu = &tdx->vcpu;
		tdx_flush_vp(&arg);
	}
	local_irq_restore(flags);
}

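/*
 * Arbitrary retry budget for interruptible SEAMCALLs such as
 * TDH.PHYMEM.CACHE.WB; large enough that exhausting it indicates a real
 * problem rather than ordinary interruption.
 */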
#define TDX_SEAMCALL_RETRIES 10000

static void smp_func_do_phymem_cache_wb(void *unused)
{
	u64 err = 0;
	bool resume;
	int i;

	/*
	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
	 * KeyID on the package or core. The TDX module may not finish the
	 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead. The
	 * kernel should retry it until it returns success, without
	 * rescheduling.
	 */
	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
		resume = !!err;
		err = tdh_phymem_cache_wb(resume);
		switch (err) {
		case TDX_INTERRUPTED_RESUMABLE:
			continue;
		case TDX_NO_HKID_READY_TO_WBCACHE:
			err = TDX_SUCCESS; /* Already done by other thread */
			fallthrough;
		default:
			goto out;
		}
	}

out:
	if (WARN_ON_ONCE(err))
		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
}

void tdx_mmu_release_hkid(struct kvm *kvm)
{
	bool packages_allocated, targets_allocated;
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	cpumask_var_t packages, targets;
	struct kvm_vcpu *vcpu;
	unsigned long j;
	int i;
	u64 err;

	if (!is_hkid_assigned(kvm_tdx))
		return;

	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
	cpus_read_lock();

	kvm_for_each_vcpu(j, vcpu, kvm)
		tdx_flush_vp_on_cpu(vcpu);

	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
	 * Multiple TDX guests can be destroyed simultaneously; take the
	 * mutex to prevent those calls from failing.
	 */
	mutex_lock(&tdx_lock);

	/*
	 * Releasing the HKID is done in vm_destroy(). After flushing the
	 * vCPUs above, there should be no more vCPU associations, as all
	 * vCPU fds have been released at this stage.
	 */
	err = tdh_mng_vpflushdone(&kvm_tdx->td);
	if (err == TDX_FLUSHVP_NOT_DONE)
		goto out;
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		goto out;
	}

	for_each_online_cpu(i) {
		if (packages_allocated &&
		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
					     packages))
			continue;
		if (targets_allocated)
			cpumask_set_cpu(i, targets);
	}
	if (targets_allocated)
		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
	else
		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
	/*
	 * If smp_func_do_phymem_cache_wb() fails, the following
	 * tdh_mng_key_freeid() will fail as well.
	 */
	err = tdh_mng_key_freeid(&kvm_tdx->td);
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_MNG_KEY_FREEID, err);
		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
	} else {
		tdx_hkid_free(kvm_tdx);
	}

out:
	mutex_unlock(&tdx_lock);
	cpus_read_unlock();
	free_cpumask_var(targets);
	free_cpumask_var(packages);
}

static void tdx_reclaim_td_control_pages(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;
	int i;

	/*
	 * tdx_mmu_release_hkid() failed to reclaim the HKID. Something has
	 * gone seriously wrong with the TDX module. Give up freeing the TD
	 * pages. That function already warned, so don't warn again here.
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (kvm_tdx->td.tdcs_pages) {
		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
			if (!kvm_tdx->td.tdcs_pages[i])
				continue;

			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
		}
		kfree(kvm_tdx->td.tdcs_pages);
		kvm_tdx->td.tdcs_pages = NULL;
	}

	if (!kvm_tdx->td.tdr_page)
		return;

	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
		return;

	/*
	 * Use a SEAMCALL to ask the TDX module to flush the cache based on
	 * the KeyID. The TDX module may access the TDR while operating on
	 * the TD (especially when it is reclaiming the TDCS).
	 */
	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
		return;
	}
	tdx_clear_page(kvm_tdx->td.tdr_page);

	__free_page(kvm_tdx->td.tdr_page);
	kvm_tdx->td.tdr_page = NULL;
}

void tdx_vm_destroy(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	tdx_reclaim_td_control_pages(kvm);

	kvm_tdx->state = TD_STATE_UNINITIALIZED;
}

static int tdx_do_tdh_mng_key_config(void *param)
{
	struct kvm_tdx *kvm_tdx = param;
	u64 err;

	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
	err = tdh_mng_key_config(&kvm_tdx->td);

	if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
		pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
		return -EIO;
	}

	return 0;
}

int tdx_vm_init(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	kvm->arch.has_protected_state = true;
	kvm->arch.has_private_mem = true;
	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;

	/*
	 * Because the guest TD is protected, the VMM can't parse instructions
	 * executed in the TD. Instead, the guest uses the MMIO hypercall. For
	 * unmodified device drivers, a #VE needs to be injected for MMIO, and
	 * the #VE handler in the TD converts the MMIO instruction into an
	 * MMIO hypercall.
	 *
	 * The SPTE value for MMIO needs to be set up so that a #VE is
	 * injected into the TD instead of triggering an EPT misconfig:
	 * - RWX=0 so that an EPT violation is triggered.
	 * - the suppress-#VE bit is cleared to inject #VE.
	 */
	kvm_mmu_set_mmio_spte_value(kvm, 0);

	/*
	 * TDX has its own limit on the maximum number of vCPUs it can support
	 * for all TDX guests, in addition to KVM_MAX_VCPUS. The TDX module
	 * reports this limit via the MAX_VCPU_PER_TD global metadata. In
	 * practice, it reflects the number of logical CPUs that ALL platforms
	 * supported by the TDX module can possibly have.
	 *
	 * Limit the TDX guest's maximum vCPUs to the number of logical CPUs
	 * the platform has. Simply forwarding MAX_VCPU_PER_TD to userspace
	 * would result in an unpredictable ABI.
	 */
	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());

	kvm_tdx->state = TD_STATE_UNINITIALIZED;

	return 0;
}

int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (kvm_tdx->state != TD_STATE_INITIALIZED)
		return -EIO;

	/*
	 * TDX module mandates APICv, which requires an in-kernel local APIC.
	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
	 */
	if (!irqchip_split(vcpu->kvm))
		return -EINVAL;

	fpstate_set_confidential(&vcpu->arch.guest_fpu);
	vcpu->arch.apic->guest_apic_protected = true;
	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);

	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;

	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
	vcpu->arch.cr0_guest_owned_bits = -1ul;
	vcpu->arch.cr4_guest_owned_bits = -1ul;

	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
	vcpu->arch.guest_tsc_protected = true;
	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

	vcpu->arch.guest_state_protected =
		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);

	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
		vcpu->arch.xfd_no_write_intercept = true;

	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&tdx->vt.pi_desc);

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;

	return 0;
}

void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	vmx_vcpu_pi_load(vcpu, cpu);
	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
		return;

	tdx_flush_vp_on_cpu(vcpu);

	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
	local_irq_disable();
	/*
	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
	 * vcpu->cpu is read before tdx->cpu_list.
	 */
	smp_rmb();

	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();
}

bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	/*
	 * KVM can't get the interrupt status of a TDX guest, so it assumes
	 * interrupts are always allowed unless the TDX guest called TDVMCALL
	 * with HLT, which passes the interrupt-blocked flag.
	 */
	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	       !to_tdx(vcpu)->vp_enter_args.r12;
}

bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	u64 vcpu_state_details;

	if (pi_has_pending_interrupt(vcpu))
		return true;

	/*
	 * Only check RVI pending for the HLT case with IRQs enabled.
	 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
	 * interrupt was pending before TD exit, then it _must_ be blocked,
	 * otherwise the interrupt would have been serviced at the instruction
	 * boundary.
	 */
	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
	    to_tdx(vcpu)->vp_enter_args.r12)
		return false;

	vcpu_state_details =
		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);

	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}

/*
 * Compared to vmx_prepare_switch_to_guest(), there is not much to do
 * as the SEAMCALL/SEAMRET transitions take care of most of the save and
 * restore.
 */
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);

	if (vt->guest_state_loaded)
		return;

	if (likely(is_64bit_mm(current->mm)))
		vt->msr_host_kernel_gs_base = current->thread.gsbase;
	else
		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);

	vt->host_debugctlmsr = get_debugctlmsr();

	vt->guest_state_loaded = true;
}

struct tdx_uret_msr {
	u32 msr;
	unsigned int slot;
	u64 defval;
};

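/*
 * User-return MSRs that hold the fixed reset values below after a TD exit.
 * Once a TD has been entered, refresh KVM's user-return MSR cache with
 * those values so the user-return notifier knows to restore the real user
 * values.
 */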
static struct tdx_uret_msr tdx_uret_msrs[] = {
	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
	{.msr = MSR_STAR,},
	{.msr = MSR_LSTAR,},
	{.msr = MSR_TSC_AUX,},
};

static void tdx_user_return_msr_update_cache(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
		kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
						 tdx_uret_msrs[i].defval);
}

static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
	struct vcpu_vt *vt = to_vt(vcpu);
	struct vcpu_tdx *tdx = to_tdx(vcpu);

	if (!vt->guest_state_loaded)
		return;

	++vcpu->stat.host_state_reload;
	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);

	if (tdx->guest_entered) {
		tdx_user_return_msr_update_cache();
		tdx->guest_entered = false;
	}

	vt->guest_state_loaded = false;
}

void tdx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);
	tdx_prepare_switch_to_host(vcpu);
}

void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	int i;

	/*
	 * It is not possible to reclaim pages while the HKID is assigned.
	 * It might still be assigned if:
	 * 1. the TD VM is being destroyed but freeing the HKID failed, in
	 *    which case the pages are leaked
	 * 2. TD vCPU creation failed and this is the error path, in which
	 *    case there is nothing to do anyway
	 */
	if (is_hkid_assigned(kvm_tdx))
		return;

	if (tdx->vp.tdcx_pages) {
		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
			if (tdx->vp.tdcx_pages[i])
				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
		}
		kfree(tdx->vp.tdcx_pages);
		tdx->vp.tdcx_pages = NULL;
	}
	if (tdx->vp.tdvpr_page) {
		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
		tdx->vp.tdvpr_page = NULL;
	}

	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}

int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
		return -EINVAL;

	return 1;
}

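/*
 * Map a TDVMCALL leaf onto the VMX exit reason whose handler KVM reuses
 * for it; anything unrecognized is surfaced as a plain TDCALL exit.
 */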
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	switch (tdvmcall_leaf(vcpu)) {
	case EXIT_REASON_CPUID:
	case EXIT_REASON_HLT:
	case EXIT_REASON_IO_INSTRUCTION:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return tdvmcall_leaf(vcpu);
	case EXIT_REASON_EPT_VIOLATION:
		return EXIT_REASON_EPT_MISCONFIG;
	default:
		break;
	}

	return EXIT_REASON_TDCALL;
}

static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	u32 exit_reason;

	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
	case TDX_SUCCESS:
	case TDX_NON_RECOVERABLE_VCPU:
	case TDX_NON_RECOVERABLE_TD:
	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
		break;
	default:
		return -1u;
	}

	exit_reason = tdx->vp_enter_ret;

	switch (exit_reason) {
	case EXIT_REASON_TDCALL:
		if (tdvmcall_exit_type(vcpu))
			return EXIT_REASON_VMCALL;

		return tdcall_to_vmx_exit_reason(vcpu);
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
		 * non-instrumentable code with interrupts disabled.
		 */
		return -1u;
	default:
		break;
	}

	return exit_reason;
}

static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	guest_state_enter_irqoff();

	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

	vt->exit_qualification = tdx->vp_enter_args.rcx;
	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
	tdx->exit_gpa = tdx->vp_enter_args.r8;
	vt->exit_intr_info = tdx->vp_enter_args.r9;

	vmx_handle_nmi(vcpu);

	guest_state_exit_irqoff();
}

static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
{
	return vmx_get_exit_reason(vcpu).failed_vmentry &&
	       vmx_get_exit_reason(vcpu).full != -1u;
}

static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;

	/*
	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
	 *
	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
	 * vCPUs leaving fastpath so that interrupts can be enabled to ensure
	 * the IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead
	 * of EXIT_FASTPATH_REENTER_GUEST to exit the fastpath, otherwise the
	 * requester may be blocked endlessly.
	 */
	if (unlikely(tdx_operand_busy(vp_enter_ret)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	return EXIT_FASTPATH_NONE;
}

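/*
 * Registers that are valid after a TD exit: the GPRs shared with the TDX
 * module via the TDH.VP.ENTER arguments plus the synthesized exit-info
 * fields. RSP and RIP are absent because the guest's state is protected
 * and never shared with the VMM.
 */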
#define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
				 BIT_ULL(VCPU_REGS_RAX) | \
				 BIT_ULL(VCPU_REGS_RBX) | \
				 BIT_ULL(VCPU_REGS_RCX) | \
				 BIT_ULL(VCPU_REGS_RDX) | \
				 BIT_ULL(VCPU_REGS_RBP) | \
				 BIT_ULL(VCPU_REGS_RSI) | \
				 BIT_ULL(VCPU_REGS_RDI) | \
				 BIT_ULL(VCPU_REGS_R8) | \
				 BIT_ULL(VCPU_REGS_R9) | \
				 BIT_ULL(VCPU_REGS_R10) | \
				 BIT_ULL(VCPU_REGS_R11) | \
				 BIT_ULL(VCPU_REGS_R12) | \
				 BIT_ULL(VCPU_REGS_R13) | \
				 BIT_ULL(VCPU_REGS_R14) | \
				 BIT_ULL(VCPU_REGS_R15))

static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * All TDX hosts support PKRU; but even if they didn't,
	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
	 * skipped.
	 */
	if (vcpu->arch.host_pkru != 0)
		wrpkru(vcpu->arch.host_pkru);

	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

	/*
	 * Likewise, even if a TDX host didn't support XSS, both arms of
	 * the comparison would be 0 and the wrmsrl would be skipped.
	 */
	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}

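/*
 * DEBUGCTL bits that survive TD entry/exit. If the host had any other bits
 * set, the full host value is restored after returning from the TD.
 */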
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
				DEBUGCTLMSR_FREEZE_IN_SMM)

fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_tdx *tdx = to_tdx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);

	/*
	 * force_immediate_exit requires vCPU entry for event injection,
	 * immediately followed by an exit. But the TDX module doesn't
	 * guarantee entry; it's already possible for KVM to _think_ it
	 * completely entered the guest without actually having done so.
	 * Since KVM never needs to force an immediate exit for TDX, and
	 * can't do direct injection, just warn on force_immediate_exit.
	 */
	WARN_ON_ONCE(force_immediate_exit);

	/*
	 * Wait until any retry of a SEPT-zap-related SEAMCALL completes
	 * before allowing vCPU entry, to avoid contention with
	 * tdh_vp_enter() and TDCALLs.
	 */
	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;

	trace_kvm_entry(vcpu, force_immediate_exit);

	if (pi_test_on(&vt->pi_desc)) {
		apic->send_IPI_self(POSTED_INTR_VECTOR);

		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
				APIC_VECTOR_MASK, &vt->pi_desc))
			kvm_wait_lapic_expire(vcpu);
	}

	tdx_vcpu_enter_exit(vcpu);

	if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
		update_debugctlmsr(vt->host_debugctlmsr);

	tdx_load_host_xsave_state(vcpu);
	tdx->guest_entered = true;

	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
		return EXIT_FASTPATH_NONE;

	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
		return EXIT_FASTPATH_NONE;

	if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
		kvm_machine_check();

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(tdx_failed_vmentry(vcpu)))
		return EXIT_FASTPATH_NONE;

	return tdx_exit_handlers_fastpath(vcpu);
}

1083acc64eb4SIsaku Yamahata void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1084acc64eb4SIsaku Yamahata {
1085acc64eb4SIsaku Yamahata ++vcpu->stat.nmi_injections;
1086acc64eb4SIsaku Yamahata td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1087acc64eb4SIsaku Yamahata /*
1088acc64eb4SIsaku Yamahata * From KVM's perspective, NMI injection is completed right after
1089acc64eb4SIsaku Yamahata * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1090acc64eb4SIsaku Yamahata * the TDX module or not.
1091acc64eb4SIsaku Yamahata */
1092acc64eb4SIsaku Yamahata vcpu->arch.nmi_injected = false;
1093acc64eb4SIsaku Yamahata /*
1094acc64eb4SIsaku Yamahata * TDX doesn't support KVM to request NMI window exit. If there is
1095acc64eb4SIsaku Yamahata * still a pending vNMI, KVM is not able to inject it along with the
1096acc64eb4SIsaku Yamahata * one pending in TDX module in a back-to-back way. Since the previous
1097acc64eb4SIsaku Yamahata * vNMI is still pending in TDX module, i.e. it has not been delivered
1098acc64eb4SIsaku Yamahata * to TDX guest yet, it's OK to collapse the pending vNMI into the
1099acc64eb4SIsaku Yamahata * previous one. The guest is expected to handle all the NMI sources
1100acc64eb4SIsaku Yamahata * when handling the first vNMI.
1101acc64eb4SIsaku Yamahata */
1102acc64eb4SIsaku Yamahata vcpu->arch.nmi_pending = 0;
1103acc64eb4SIsaku Yamahata }
1104acc64eb4SIsaku Yamahata
1105f30cb642SIsaku Yamahata static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1106f30cb642SIsaku Yamahata {
1107f30cb642SIsaku Yamahata u32 intr_info = vmx_get_intr_info(vcpu);
1108f30cb642SIsaku Yamahata
1109f30cb642SIsaku Yamahata /*
1110f30cb642SIsaku Yamahata * Machine checks are handled by handle_exception_irqoff(), or by
1111f30cb642SIsaku Yamahata * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1112f30cb642SIsaku Yamahata * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1113f30cb642SIsaku Yamahata */
1114f30cb642SIsaku Yamahata if (is_nmi(intr_info) || is_machine_check(intr_info))
1115f30cb642SIsaku Yamahata return 1;
1116f30cb642SIsaku Yamahata
1117f30cb642SIsaku Yamahata vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1118f30cb642SIsaku Yamahata vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1119f30cb642SIsaku Yamahata vcpu->run->ex.error_code = 0;
1120f30cb642SIsaku Yamahata
1121f30cb642SIsaku Yamahata return 0;
1122f30cb642SIsaku Yamahata }
1123f30cb642SIsaku Yamahata
1124d5998c02SIsaku Yamahata static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1125d5998c02SIsaku Yamahata {
1126d5998c02SIsaku Yamahata tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1127d5998c02SIsaku Yamahata return 1;
1128d5998c02SIsaku Yamahata }
1129d5998c02SIsaku Yamahata
1130d5998c02SIsaku Yamahata static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1131d5998c02SIsaku Yamahata {
1132d5998c02SIsaku Yamahata kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1133d5998c02SIsaku Yamahata kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1134d5998c02SIsaku Yamahata kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1135d5998c02SIsaku Yamahata kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1136d5998c02SIsaku Yamahata kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1137d5998c02SIsaku Yamahata
1138d5998c02SIsaku Yamahata return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1139d5998c02SIsaku Yamahata }
1140d5998c02SIsaku Yamahata
11412c304880SBinbin Wu /*
11422c304880SBinbin Wu * Split into chunks and check for pending interrupts between chunks. This
11432c304880SBinbin Wu * allows timely injection of interrupts and prevents issues with guest
11442c304880SBinbin Wu * lockup detection (see the worked example below).
11452c304880SBinbin Wu */
11462c304880SBinbin Wu #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
11472c304880SBinbin Wu static void __tdx_map_gpa(struct vcpu_tdx *tdx);
11482c304880SBinbin Wu
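/*
 * Worked example of the chunking (illustrative): a TDVMCALL_MAP_GPA with
 * r12 = gpa and r13 = 10 MiB is serviced as five 2 MiB KVM_EXIT_HYPERCALL
 * round trips to userspace:
 *
 *	__tdx_map_gpa()               exit for [map_gpa_next, +2 MiB)
 *	tdx_complete_vmcall_map_gpa() map_gpa_next += 2 MiB, repeat
 *	...                           until map_gpa_next reaches map_gpa_end
 *
 * If an interrupt becomes pending between chunks, KVM returns
 * TDVMCALL_STATUS_RETRY with r11 = map_gpa_next so the guest can resume
 * where processing stopped.
 */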
11492c304880SBinbin Wu static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
11502c304880SBinbin Wu {
11512c304880SBinbin Wu struct vcpu_tdx *tdx = to_tdx(vcpu);
11522c304880SBinbin Wu
11532c304880SBinbin Wu if (vcpu->run->hypercall.ret) {
11542c304880SBinbin Wu tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
11552c304880SBinbin Wu tdx->vp_enter_args.r11 = tdx->map_gpa_next;
11562c304880SBinbin Wu return 1;
11572c304880SBinbin Wu }
11582c304880SBinbin Wu
11592c304880SBinbin Wu tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
11602c304880SBinbin Wu if (tdx->map_gpa_next >= tdx->map_gpa_end)
11612c304880SBinbin Wu return 1;
11622c304880SBinbin Wu
11632c304880SBinbin Wu /*
11642c304880SBinbin Wu * Stop processing the remaining part if there is a pending interrupt
11652c304880SBinbin Wu * that could qualify for delivery. Skip checking pending RVI for
11665cf7239bSIsaku Yamahata * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
11672c304880SBinbin Wu */
11682c304880SBinbin Wu if (kvm_vcpu_has_events(vcpu)) {
11692c304880SBinbin Wu tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
11702c304880SBinbin Wu tdx->vp_enter_args.r11 = tdx->map_gpa_next;
11712c304880SBinbin Wu return 1;
11722c304880SBinbin Wu }
11732c304880SBinbin Wu
11742c304880SBinbin Wu __tdx_map_gpa(tdx);
11752c304880SBinbin Wu return 0;
11762c304880SBinbin Wu }
11772c304880SBinbin Wu
11782c304880SBinbin Wu static void __tdx_map_gpa(struct vcpu_tdx *tdx)
11792c304880SBinbin Wu {
11802c304880SBinbin Wu u64 gpa = tdx->map_gpa_next;
11812c304880SBinbin Wu u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
11822c304880SBinbin Wu
11832c304880SBinbin Wu if (size > TDX_MAP_GPA_MAX_LEN)
11842c304880SBinbin Wu size = TDX_MAP_GPA_MAX_LEN;
11852c304880SBinbin Wu
11862c304880SBinbin Wu tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
11872c304880SBinbin Wu tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
11882c304880SBinbin Wu /*
11892c304880SBinbin Wu * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
11902c304880SBinbin Wu * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
11912c304880SBinbin Wu * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
11922c304880SBinbin Wu * vcpu->run->hypercall.ret, ensure that it is zero to not break QEMU.
11932c304880SBinbin Wu */
11942c304880SBinbin Wu tdx->vcpu.run->hypercall.ret = 0;
11952c304880SBinbin Wu tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
11962c304880SBinbin Wu tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
11972c304880SBinbin Wu tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
11982c304880SBinbin Wu KVM_MAP_GPA_RANGE_ENCRYPTED :
11992c304880SBinbin Wu KVM_MAP_GPA_RANGE_DECRYPTED;
12002c304880SBinbin Wu tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
12012c304880SBinbin Wu
12022c304880SBinbin Wu tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
12032c304880SBinbin Wu }
12042c304880SBinbin Wu
12052c304880SBinbin Wu static int tdx_map_gpa(struct kvm_vcpu *vcpu)
12062c304880SBinbin Wu {
12072c304880SBinbin Wu struct vcpu_tdx *tdx = to_tdx(vcpu);
12082c304880SBinbin Wu u64 gpa = tdx->vp_enter_args.r12;
12092c304880SBinbin Wu u64 size = tdx->vp_enter_args.r13;
12102c304880SBinbin Wu u64 ret;
12112c304880SBinbin Wu
12122c304880SBinbin Wu /*
12132c304880SBinbin Wu * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
12142c304880SBinbin Wu * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1215b5aafcb4SBinbin Wu * bit set. This is a base call so it should always be supported, but
1216b5aafcb4SBinbin Wu * KVM has no way to ensure that userspace implements the GHCI correctly.
1217b5aafcb4SBinbin Wu * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1218b5aafcb4SBinbin Wu * to the guest.
12192c304880SBinbin Wu */
12202c304880SBinbin Wu if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1221b5aafcb4SBinbin Wu ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
12222c304880SBinbin Wu goto error;
12232c304880SBinbin Wu }
12242c304880SBinbin Wu
12252c304880SBinbin Wu if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
12262c304880SBinbin Wu !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
12272c304880SBinbin Wu (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
12282c304880SBinbin Wu vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
12292c304880SBinbin Wu ret = TDVMCALL_STATUS_INVALID_OPERAND;
12302c304880SBinbin Wu goto error;
12312c304880SBinbin Wu }
12322c304880SBinbin Wu
12332c304880SBinbin Wu if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
12342c304880SBinbin Wu ret = TDVMCALL_STATUS_ALIGN_ERROR;
12352c304880SBinbin Wu goto error;
12362c304880SBinbin Wu }
12372c304880SBinbin Wu
12382c304880SBinbin Wu tdx->map_gpa_end = gpa + size;
12392c304880SBinbin Wu tdx->map_gpa_next = gpa;
12402c304880SBinbin Wu
12412c304880SBinbin Wu __tdx_map_gpa(tdx);
12422c304880SBinbin Wu return 0;
12432c304880SBinbin Wu
12442c304880SBinbin Wu error:
12452c304880SBinbin Wu tdvmcall_set_return_code(vcpu, ret);
12462c304880SBinbin Wu tdx->vp_enter_args.r11 = gpa;
12472c304880SBinbin Wu return 1;
12482c304880SBinbin Wu }
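
/*
 * Userspace side, a minimal sketch (assumes a QEMU-like VMM, omits error
 * handling; not upstream code): KVM_CAP_EXIT_HYPERCALL must be enabled
 * with the KVM_HC_MAP_GPA_RANGE bit for tdx_map_gpa() to forward
 * conversions instead of failing them with
 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_EXIT_HYPERCALL,
 *		.args[0] = 1ULL << KVM_HC_MAP_GPA_RANGE,
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */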
12492c304880SBinbin Wu
125079462faaSBinbin Wu static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
125179462faaSBinbin Wu {
125279462faaSBinbin Wu struct vcpu_tdx *tdx = to_tdx(vcpu);
125379462faaSBinbin Wu u64 *regs = vcpu->run->system_event.data;
125479462faaSBinbin Wu u64 *module_regs = &tdx->vp_enter_args.r8;
125579462faaSBinbin Wu int index = VCPU_REGS_RAX;
125679462faaSBinbin Wu
125779462faaSBinbin Wu vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
125879462faaSBinbin Wu vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
125979462faaSBinbin Wu vcpu->run->system_event.ndata = 16;
126079462faaSBinbin Wu
126179462faaSBinbin Wu /* Dump 16 general-purpose registers to userspace in ascending order. */
126279462faaSBinbin Wu regs[index++] = tdx->vp_enter_ret;
126379462faaSBinbin Wu regs[index++] = tdx->vp_enter_args.rcx;
126479462faaSBinbin Wu regs[index++] = tdx->vp_enter_args.rdx;
126579462faaSBinbin Wu regs[index++] = tdx->vp_enter_args.rbx;
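	/* VCPU_REGS_RSP/RBP slots: not part of the TDVMCALL ABI, report 0. */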
126679462faaSBinbin Wu regs[index++] = 0;
126779462faaSBinbin Wu regs[index++] = 0;
126879462faaSBinbin Wu regs[index++] = tdx->vp_enter_args.rsi;
126979462faaSBinbin Wu regs[index] = tdx->vp_enter_args.rdi;
127079462faaSBinbin Wu for (index = 0; index < 8; index++)
127179462faaSBinbin Wu regs[VCPU_REGS_R8 + index] = module_regs[index];
127279462faaSBinbin Wu
127379462faaSBinbin Wu return 0;
127479462faaSBinbin Wu }
127579462faaSBinbin Wu
12763bf31b57SIsaku Yamahata static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
12773bf31b57SIsaku Yamahata {
12783bf31b57SIsaku Yamahata u32 eax, ebx, ecx, edx;
12793bf31b57SIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
12803bf31b57SIsaku Yamahata
12813bf31b57SIsaku Yamahata /* EAX and ECX for CPUID are stored in R12 and R13. */
12823bf31b57SIsaku Yamahata eax = tdx->vp_enter_args.r12;
12833bf31b57SIsaku Yamahata ecx = tdx->vp_enter_args.r13;
12843bf31b57SIsaku Yamahata
12853bf31b57SIsaku Yamahata kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
12863bf31b57SIsaku Yamahata
12873bf31b57SIsaku Yamahata tdx->vp_enter_args.r12 = eax;
12883bf31b57SIsaku Yamahata tdx->vp_enter_args.r13 = ebx;
12893bf31b57SIsaku Yamahata tdx->vp_enter_args.r14 = ecx;
12903bf31b57SIsaku Yamahata tdx->vp_enter_args.r15 = edx;
12913bf31b57SIsaku Yamahata
12923bf31b57SIsaku Yamahata return 1;
12933bf31b57SIsaku Yamahata }
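
/*
 * Example (illustrative): a guest TDG.VP.VMCALL<Instruction.CPUID> with
 * r12 = 0x1, r13 = 0 is answered from the vCPU's CPUID entries (as
 * configured via KVM_SET_CPUID2), with EAX/EBX/ECX/EDX returned in
 * r12/r13/r14/r15 respectively.
 */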
12943bf31b57SIsaku Yamahata
129533608aafSIsaku Yamahata static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
129633608aafSIsaku Yamahata {
129733608aafSIsaku Yamahata vcpu->arch.pio.count = 0;
129833608aafSIsaku Yamahata return 1;
129933608aafSIsaku Yamahata }
130033608aafSIsaku Yamahata
130133608aafSIsaku Yamahata static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
130233608aafSIsaku Yamahata {
130333608aafSIsaku Yamahata struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
130433608aafSIsaku Yamahata unsigned long val = 0;
130533608aafSIsaku Yamahata int ret;
130633608aafSIsaku Yamahata
130733608aafSIsaku Yamahata ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
130833608aafSIsaku Yamahata vcpu->arch.pio.port, &val, 1);
130933608aafSIsaku Yamahata
131033608aafSIsaku Yamahata WARN_ON_ONCE(!ret);
131133608aafSIsaku Yamahata
131233608aafSIsaku Yamahata tdvmcall_set_return_val(vcpu, val);
131333608aafSIsaku Yamahata
131433608aafSIsaku Yamahata return 1;
131533608aafSIsaku Yamahata }
131633608aafSIsaku Yamahata
131733608aafSIsaku Yamahata static int tdx_emulate_io(struct kvm_vcpu *vcpu)
131833608aafSIsaku Yamahata {
131933608aafSIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
132033608aafSIsaku Yamahata struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
132133608aafSIsaku Yamahata unsigned long val = 0;
132233608aafSIsaku Yamahata unsigned int port;
132333608aafSIsaku Yamahata u64 size, write;
132433608aafSIsaku Yamahata int ret;
132533608aafSIsaku Yamahata
132633608aafSIsaku Yamahata ++vcpu->stat.io_exits;
132733608aafSIsaku Yamahata
132833608aafSIsaku Yamahata size = tdx->vp_enter_args.r12;
132933608aafSIsaku Yamahata write = tdx->vp_enter_args.r13;
133033608aafSIsaku Yamahata port = tdx->vp_enter_args.r14;
133133608aafSIsaku Yamahata
133233608aafSIsaku Yamahata if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
133333608aafSIsaku Yamahata tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
133433608aafSIsaku Yamahata return 1;
133533608aafSIsaku Yamahata }
133633608aafSIsaku Yamahata
133733608aafSIsaku Yamahata if (write) {
133833608aafSIsaku Yamahata val = tdx->vp_enter_args.r15;
133933608aafSIsaku Yamahata ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
134033608aafSIsaku Yamahata } else {
134133608aafSIsaku Yamahata ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
134233608aafSIsaku Yamahata }
134333608aafSIsaku Yamahata
134433608aafSIsaku Yamahata if (!ret)
134533608aafSIsaku Yamahata vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
134633608aafSIsaku Yamahata tdx_complete_pio_in;
134733608aafSIsaku Yamahata else if (!write)
134833608aafSIsaku Yamahata tdvmcall_set_return_val(vcpu, val);
134933608aafSIsaku Yamahata
135033608aafSIsaku Yamahata return ret;
135133608aafSIsaku Yamahata }
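
/*
 * Flow note (illustrative): pio_{in,out}_emulated() returns 1 when an
 * in-kernel device satisfied the access and 0 when it must be completed
 * by userspace via KVM_EXIT_IO. In the latter case the completion
 * callbacks run on the next KVM_RUN: tdx_complete_pio_out() just clears
 * the in-flight count, while tdx_complete_pio_in() re-invokes the
 * emulation to fetch the now-available data and hands it to the guest
 * via tdvmcall_set_return_val().
 */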
135233608aafSIsaku Yamahata
1353bb723bebSSean Christopherson static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1354bb723bebSSean Christopherson {
1355bb723bebSSean Christopherson unsigned long val = 0;
1356bb723bebSSean Christopherson gpa_t gpa;
1357bb723bebSSean Christopherson int size;
1358bb723bebSSean Christopherson
1359bb723bebSSean Christopherson gpa = vcpu->mmio_fragments[0].gpa;
1360bb723bebSSean Christopherson size = vcpu->mmio_fragments[0].len;
1361bb723bebSSean Christopherson
1362bb723bebSSean Christopherson memcpy(&val, vcpu->run->mmio.data, size);
1363bb723bebSSean Christopherson tdvmcall_set_return_val(vcpu, val);
1364bb723bebSSean Christopherson trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1365bb723bebSSean Christopherson return 1;
1366bb723bebSSean Christopherson }
1367bb723bebSSean Christopherson
1368bb723bebSSean Christopherson static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1369bb723bebSSean Christopherson unsigned long val)
1370bb723bebSSean Christopherson {
1371bb723bebSSean Christopherson if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1372bb723bebSSean Christopherson trace_kvm_fast_mmio(gpa);
1373bb723bebSSean Christopherson return 0;
1374bb723bebSSean Christopherson }
1375bb723bebSSean Christopherson
1376bb723bebSSean Christopherson trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1377bb723bebSSean Christopherson if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1378bb723bebSSean Christopherson return -EOPNOTSUPP;
1379bb723bebSSean Christopherson
1380bb723bebSSean Christopherson return 0;
1381bb723bebSSean Christopherson }
1382bb723bebSSean Christopherson
1383bb723bebSSean Christopherson static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1384bb723bebSSean Christopherson {
1385bb723bebSSean Christopherson unsigned long val;
1386bb723bebSSean Christopherson
1387bb723bebSSean Christopherson if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1388bb723bebSSean Christopherson return -EOPNOTSUPP;
1389bb723bebSSean Christopherson
1390bb723bebSSean Christopherson tdvmcall_set_return_val(vcpu, val);
1391bb723bebSSean Christopherson trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1392bb723bebSSean Christopherson return 0;
1393bb723bebSSean Christopherson }
1394bb723bebSSean Christopherson
1395bb723bebSSean Christopherson static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1396bb723bebSSean Christopherson {
1397bb723bebSSean Christopherson struct vcpu_tdx *tdx = to_tdx(vcpu);
1398bb723bebSSean Christopherson int size, write, r;
1399bb723bebSSean Christopherson unsigned long val;
1400bb723bebSSean Christopherson gpa_t gpa;
1401bb723bebSSean Christopherson
1402bb723bebSSean Christopherson size = tdx->vp_enter_args.r12;
1403bb723bebSSean Christopherson write = tdx->vp_enter_args.r13;
1404bb723bebSSean Christopherson gpa = tdx->vp_enter_args.r14;
1405bb723bebSSean Christopherson val = write ? tdx->vp_enter_args.r15 : 0;
1406bb723bebSSean Christopherson
1407bb723bebSSean Christopherson if (size != 1 && size != 2 && size != 4 && size != 8)
1408bb723bebSSean Christopherson goto error;
1409bb723bebSSean Christopherson if (write != 0 && write != 1)
1410bb723bebSSean Christopherson goto error;
1411bb723bebSSean Christopherson
1412bb723bebSSean Christopherson /*
1413bb723bebSSean Christopherson * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
1414bb723bebSSean Christopherson * do MMIO emulation for a private GPA.
1415bb723bebSSean Christopherson */
1416bb723bebSSean Christopherson if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1417bb723bebSSean Christopherson vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1418bb723bebSSean Christopherson goto error;
1419bb723bebSSean Christopherson
1420bb723bebSSean Christopherson gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1421bb723bebSSean Christopherson
1422bb723bebSSean Christopherson if (write)
1423bb723bebSSean Christopherson r = tdx_mmio_write(vcpu, gpa, size, val);
1424bb723bebSSean Christopherson else
1425bb723bebSSean Christopherson r = tdx_mmio_read(vcpu, gpa, size);
1426bb723bebSSean Christopherson if (!r)
1427bb723bebSSean Christopherson /* Kernel completed device emulation. */
1428bb723bebSSean Christopherson return 1;
1429bb723bebSSean Christopherson
1430bb723bebSSean Christopherson /* Hand the device emulation off to the userspace device model. */
1431bb723bebSSean Christopherson vcpu->mmio_is_write = write;
1432bb723bebSSean Christopherson if (!write)
1433bb723bebSSean Christopherson vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1434bb723bebSSean Christopherson
1435bb723bebSSean Christopherson vcpu->run->mmio.phys_addr = gpa;
1436bb723bebSSean Christopherson vcpu->run->mmio.len = size;
1437bb723bebSSean Christopherson vcpu->run->mmio.is_write = write;
1438bb723bebSSean Christopherson vcpu->run->exit_reason = KVM_EXIT_MMIO;
1439bb723bebSSean Christopherson
1440bb723bebSSean Christopherson if (write) {
1441bb723bebSSean Christopherson memcpy(vcpu->run->mmio.data, &val, size);
1442bb723bebSSean Christopherson } else {
1443bb723bebSSean Christopherson vcpu->mmio_fragments[0].gpa = gpa;
1444bb723bebSSean Christopherson vcpu->mmio_fragments[0].len = size;
1445bb723bebSSean Christopherson trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1446bb723bebSSean Christopherson }
1447bb723bebSSean Christopherson return 0;
1448bb723bebSSean Christopherson
1449bb723bebSSean Christopherson error:
1450bb723bebSSean Christopherson tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1451bb723bebSSean Christopherson return 1;
1452bb723bebSSean Christopherson }
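
/*
 * Flow note (illustrative): a read that no in-kernel device satisfies
 * records the GPA and size in mmio_fragments[0] and exits with
 * KVM_EXIT_MMIO; once userspace fills run->mmio.data,
 * tdx_complete_mmio_read() copies the value back to the guest. A write
 * either hits the fast MMIO bus (wildcard ioeventfd, no data), an
 * in-kernel device, or is forwarded to userspace with the data already
 * copied into run->mmio.data.
 */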
1453bb723bebSSean Christopherson
1454*25e8b1ddSBinbin Wu static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1455*25e8b1ddSBinbin Wu {
1456*25e8b1ddSBinbin Wu struct vcpu_tdx *tdx = to_tdx(vcpu);
1457*25e8b1ddSBinbin Wu
1458*25e8b1ddSBinbin Wu tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1459*25e8b1ddSBinbin Wu
1460*25e8b1ddSBinbin Wu /*
1461*25e8b1ddSBinbin Wu * For now, no TDVMCALL beyond the GHCI base API is supported by KVM
1462*25e8b1ddSBinbin Wu * directly without support from userspace, so just set the values
1463*25e8b1ddSBinbin Wu * returned from userspace.
1464*25e8b1ddSBinbin Wu */
1465*25e8b1ddSBinbin Wu tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1466*25e8b1ddSBinbin Wu tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1467*25e8b1ddSBinbin Wu tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1468*25e8b1ddSBinbin Wu tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1469*25e8b1ddSBinbin Wu
1470*25e8b1ddSBinbin Wu return 1;
1471*25e8b1ddSBinbin Wu }
1472*25e8b1ddSBinbin Wu
147304733836SIsaku Yamahata static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
147404733836SIsaku Yamahata {
147504733836SIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
147604733836SIsaku Yamahata
1477*25e8b1ddSBinbin Wu switch (tdx->vp_enter_args.r12) {
1478*25e8b1ddSBinbin Wu case 0:
147904733836SIsaku Yamahata tdx->vp_enter_args.r11 = 0;
1480*25e8b1ddSBinbin Wu tdx->vp_enter_args.r12 = 0;
148104733836SIsaku Yamahata tdx->vp_enter_args.r13 = 0;
148204733836SIsaku Yamahata tdx->vp_enter_args.r14 = 0;
1483*25e8b1ddSBinbin Wu tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
148404733836SIsaku Yamahata return 1;
1485*25e8b1ddSBinbin Wu case 1:
1486*25e8b1ddSBinbin Wu vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1487*25e8b1ddSBinbin Wu vcpu->run->exit_reason = KVM_EXIT_TDX;
1488*25e8b1ddSBinbin Wu vcpu->run->tdx.flags = 0;
1489*25e8b1ddSBinbin Wu vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1490*25e8b1ddSBinbin Wu vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1491*25e8b1ddSBinbin Wu vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1492*25e8b1ddSBinbin Wu vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1493*25e8b1ddSBinbin Wu vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1494*25e8b1ddSBinbin Wu vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1495*25e8b1ddSBinbin Wu vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1496*25e8b1ddSBinbin Wu return 0;
1497*25e8b1ddSBinbin Wu default:
1498*25e8b1ddSBinbin Wu tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1499*25e8b1ddSBinbin Wu return 1;
1500*25e8b1ddSBinbin Wu }
150104733836SIsaku Yamahata }
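
/*
 * Summary of the leaves handled above (illustrative): leaf 0 reports
 * "no TDVMCALLs beyond the GHCI base API" directly from KVM; leaf 1 is
 * forwarded to userspace via KVM_EXIT_TDX so the VMM can advertise the
 * TDVMCALLs it implements, with the r11-r14 values it fills in copied
 * back by tdx_complete_get_td_vm_call_info(); any other leaf fails with
 * TDVMCALL_STATUS_INVALID_OPERAND.
 */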
150204733836SIsaku Yamahata
1503cf207eacSBinbin Wu static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1504cf207eacSBinbin Wu {
1505cf207eacSBinbin Wu tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1506cf207eacSBinbin Wu return 1;
1507cf207eacSBinbin Wu }
1508cf207eacSBinbin Wu
1509cf207eacSBinbin Wu static int tdx_get_quote(struct kvm_vcpu *vcpu)
1510cf207eacSBinbin Wu {
1511cf207eacSBinbin Wu struct vcpu_tdx *tdx = to_tdx(vcpu);
1512cf207eacSBinbin Wu u64 gpa = tdx->vp_enter_args.r12;
1513cf207eacSBinbin Wu u64 size = tdx->vp_enter_args.r13;
1514cf207eacSBinbin Wu
1515cf207eacSBinbin Wu /* The GPA of the buffer must have the shared bit set. */
1516cf207eacSBinbin Wu if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1517cf207eacSBinbin Wu tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1518cf207eacSBinbin Wu return 1;
1519cf207eacSBinbin Wu }
1520cf207eacSBinbin Wu
1521cf207eacSBinbin Wu vcpu->run->exit_reason = KVM_EXIT_TDX;
1522cf207eacSBinbin Wu vcpu->run->tdx.flags = 0;
1523cf207eacSBinbin Wu vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1524cf207eacSBinbin Wu vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1525cf207eacSBinbin Wu vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1526cf207eacSBinbin Wu vcpu->run->tdx.get_quote.size = size;
1527cf207eacSBinbin Wu
1528cf207eacSBinbin Wu vcpu->arch.complete_userspace_io = tdx_complete_simple;
1529cf207eacSBinbin Wu
1530cf207eacSBinbin Wu return 0;
1531cf207eacSBinbin Wu }
1532cf207eacSBinbin Wu
1533c42856afSIsaku Yamahata static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1534c42856afSIsaku Yamahata {
1535c42856afSIsaku Yamahata switch (tdvmcall_leaf(vcpu)) {
15362c304880SBinbin Wu case TDVMCALL_MAP_GPA:
15372c304880SBinbin Wu return tdx_map_gpa(vcpu);
153879462faaSBinbin Wu case TDVMCALL_REPORT_FATAL_ERROR:
153979462faaSBinbin Wu return tdx_report_fatal_error(vcpu);
154004733836SIsaku Yamahata case TDVMCALL_GET_TD_VM_CALL_INFO:
154104733836SIsaku Yamahata return tdx_get_td_vm_call_info(vcpu);
1542cf207eacSBinbin Wu case TDVMCALL_GET_QUOTE:
1543cf207eacSBinbin Wu return tdx_get_quote(vcpu);
1544c42856afSIsaku Yamahata default:
1545c42856afSIsaku Yamahata break;
1546c42856afSIsaku Yamahata }
1547c42856afSIsaku Yamahata
1548b5aafcb4SBinbin Wu tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1549c42856afSIsaku Yamahata return 1;
1550c42856afSIsaku Yamahata }
1551c42856afSIsaku Yamahata
155287e3f45eSSean Christopherson void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
155387e3f45eSSean Christopherson {
155487e3f45eSSean Christopherson u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
155587e3f45eSSean Christopherson TDX_SHARED_BIT_PWL_4;
155687e3f45eSSean Christopherson
155787e3f45eSSean Christopherson if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
155887e3f45eSSean Christopherson return;
155987e3f45eSSean Christopherson
156087e3f45eSSean Christopherson td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
156187e3f45eSSean Christopherson }
156287e3f45eSSean Christopherson
156302ab5770SIsaku Yamahata static void tdx_unpin(struct kvm *kvm, struct page *page)
156402ab5770SIsaku Yamahata {
156502ab5770SIsaku Yamahata put_page(page);
156602ab5770SIsaku Yamahata }
156702ab5770SIsaku Yamahata
156802ab5770SIsaku Yamahata static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
156902ab5770SIsaku Yamahata enum pg_level level, struct page *page)
157002ab5770SIsaku Yamahata {
157102ab5770SIsaku Yamahata int tdx_level = pg_level_to_tdx_sept_level(level);
157202ab5770SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
157302ab5770SIsaku Yamahata gpa_t gpa = gfn_to_gpa(gfn);
157402ab5770SIsaku Yamahata u64 entry, level_state;
157502ab5770SIsaku Yamahata u64 err;
157602ab5770SIsaku Yamahata
157702ab5770SIsaku Yamahata err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
157802ab5770SIsaku Yamahata if (unlikely(tdx_operand_busy(err))) {
157902ab5770SIsaku Yamahata tdx_unpin(kvm, page);
158002ab5770SIsaku Yamahata return -EBUSY;
158102ab5770SIsaku Yamahata }
158202ab5770SIsaku Yamahata
158302ab5770SIsaku Yamahata if (KVM_BUG_ON(err, kvm)) {
158402ab5770SIsaku Yamahata pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
158502ab5770SIsaku Yamahata tdx_unpin(kvm, page);
158602ab5770SIsaku Yamahata return -EIO;
158702ab5770SIsaku Yamahata }
158802ab5770SIsaku Yamahata
158902ab5770SIsaku Yamahata return 0;
159002ab5770SIsaku Yamahata }
159102ab5770SIsaku Yamahata
1592012426d6SIsaku Yamahata /*
1593012426d6SIsaku Yamahata * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
1594012426d6SIsaku Yamahata * callback tdx_gmem_post_populate() then maps pages into private memory
1595012426d6SIsaku Yamahata * through the SEAMCALL TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
1596012426d6SIsaku Yamahata * private EPT structures for the page to have been built before, which is
1597012426d6SIsaku Yamahata * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1598012426d6SIsaku Yamahata * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1599012426d6SIsaku Yamahata * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1600012426d6SIsaku Yamahata * are no half-initialized private EPT pages.
1601012426d6SIsaku Yamahata */
1602012426d6SIsaku Yamahata static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
1603012426d6SIsaku Yamahata enum pg_level level, kvm_pfn_t pfn)
1604012426d6SIsaku Yamahata {
1605012426d6SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1606012426d6SIsaku Yamahata
1607012426d6SIsaku Yamahata if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1608012426d6SIsaku Yamahata return -EINVAL;
1609012426d6SIsaku Yamahata
1610012426d6SIsaku Yamahata /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1611012426d6SIsaku Yamahata atomic64_inc(&kvm_tdx->nr_premapped);
1612012426d6SIsaku Yamahata return 0;
1613012426d6SIsaku Yamahata }
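
/*
 * Premap lifecycle sketch (illustrative):
 *
 *	KVM_TDX_INIT_MEM_REGION
 *	  kvm_tdp_map_page()       -> tdx_sept_set_private_spte()
 *	                              -> nr_premapped++ (above)
 *	  tdx_gmem_post_populate() -> TDH.MEM.PAGE.ADD, nr_premapped--
 *	KVM_TDX_FINALIZE_VM        -> fails unless nr_premapped == 0
 */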
1614012426d6SIsaku Yamahata
161502ab5770SIsaku Yamahata int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
161602ab5770SIsaku Yamahata enum pg_level level, kvm_pfn_t pfn)
161702ab5770SIsaku Yamahata {
161802ab5770SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
161902ab5770SIsaku Yamahata struct page *page = pfn_to_page(pfn);
162002ab5770SIsaku Yamahata
162102ab5770SIsaku Yamahata /* TODO: handle large pages. */
162202ab5770SIsaku Yamahata if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
162302ab5770SIsaku Yamahata return -EINVAL;
162402ab5770SIsaku Yamahata
162502ab5770SIsaku Yamahata /*
162602ab5770SIsaku Yamahata * Because guest_memfd doesn't support page migration with
162702ab5770SIsaku Yamahata * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
162802ab5770SIsaku Yamahata * migration. Until guest_memfd supports page migration, prevent it by
162902ab5770SIsaku Yamahata * pinning the page with get_page().
163002ab5770SIsaku Yamahata * TODO: Once guest_memfd introduces callback on page migration,
163102ab5770SIsaku Yamahata * implement it and remove get_page/put_page().
163202ab5770SIsaku Yamahata */
163302ab5770SIsaku Yamahata get_page(page);
163402ab5770SIsaku Yamahata
1635012426d6SIsaku Yamahata /*
1636012426d6SIsaku Yamahata * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1637012426d6SIsaku Yamahata * barrier in tdx_td_finalize().
1638012426d6SIsaku Yamahata */
1639012426d6SIsaku Yamahata smp_rmb();
164002ab5770SIsaku Yamahata if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
164102ab5770SIsaku Yamahata return tdx_mem_page_aug(kvm, gfn, level, page);
164202ab5770SIsaku Yamahata
1643012426d6SIsaku Yamahata return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
164402ab5770SIsaku Yamahata }
164502ab5770SIsaku Yamahata
164602ab5770SIsaku Yamahata static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
164702ab5770SIsaku Yamahata enum pg_level level, struct page *page)
164802ab5770SIsaku Yamahata {
164902ab5770SIsaku Yamahata int tdx_level = pg_level_to_tdx_sept_level(level);
165002ab5770SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
165102ab5770SIsaku Yamahata gpa_t gpa = gfn_to_gpa(gfn);
165202ab5770SIsaku Yamahata u64 err, entry, level_state;
165302ab5770SIsaku Yamahata
165402ab5770SIsaku Yamahata /* TODO: handle large pages. */
165502ab5770SIsaku Yamahata if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
165602ab5770SIsaku Yamahata return -EINVAL;
165702ab5770SIsaku Yamahata
165802ab5770SIsaku Yamahata if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
165902ab5770SIsaku Yamahata return -EINVAL;
166002ab5770SIsaku Yamahata
166102ab5770SIsaku Yamahata /*
16624b2abc49SYan Zhao * When zapping a private page, the write lock is held, so there is no
16634b2abc49SYan Zhao * race with other vCPU SEPT operations. Races with TDH.VP.ENTER (due
16644b2abc49SYan Zhao * to 0-step mitigation) and guest TDCALLs are still possible.
166502ab5770SIsaku Yamahata */
166602ab5770SIsaku Yamahata err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
166702ab5770SIsaku Yamahata &level_state);
16684b2abc49SYan Zhao
16694b2abc49SYan Zhao if (unlikely(tdx_operand_busy(err))) {
16704b2abc49SYan Zhao /*
16714b2abc49SYan Zhao * The second retry is expected to succeed after kicking off all
16724b2abc49SYan Zhao * other vCPUs and preventing them from invoking TDH.VP.ENTER.
16734b2abc49SYan Zhao */
16744b2abc49SYan Zhao tdx_no_vcpus_enter_start(kvm);
16754b2abc49SYan Zhao err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
16764b2abc49SYan Zhao &level_state);
16774b2abc49SYan Zhao tdx_no_vcpus_enter_stop(kvm);
16784b2abc49SYan Zhao }
167902ab5770SIsaku Yamahata
168002ab5770SIsaku Yamahata if (KVM_BUG_ON(err, kvm)) {
168102ab5770SIsaku Yamahata pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
168202ab5770SIsaku Yamahata return -EIO;
168302ab5770SIsaku Yamahata }
168402ab5770SIsaku Yamahata
168502ab5770SIsaku Yamahata err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
168602ab5770SIsaku Yamahata
168702ab5770SIsaku Yamahata if (KVM_BUG_ON(err, kvm)) {
168802ab5770SIsaku Yamahata pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
168902ab5770SIsaku Yamahata return -EIO;
169002ab5770SIsaku Yamahata }
169102ab5770SIsaku Yamahata tdx_clear_page(page);
169202ab5770SIsaku Yamahata tdx_unpin(kvm, page);
169302ab5770SIsaku Yamahata return 0;
169402ab5770SIsaku Yamahata }
169502ab5770SIsaku Yamahata
169602ab5770SIsaku Yamahata int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
169702ab5770SIsaku Yamahata enum pg_level level, void *private_spt)
169802ab5770SIsaku Yamahata {
169902ab5770SIsaku Yamahata int tdx_level = pg_level_to_tdx_sept_level(level);
170002ab5770SIsaku Yamahata gpa_t gpa = gfn_to_gpa(gfn);
170102ab5770SIsaku Yamahata struct page *page = virt_to_page(private_spt);
170202ab5770SIsaku Yamahata u64 err, entry, level_state;
170302ab5770SIsaku Yamahata
170402ab5770SIsaku Yamahata err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
170502ab5770SIsaku Yamahata &level_state);
170602ab5770SIsaku Yamahata if (unlikely(tdx_operand_busy(err)))
170702ab5770SIsaku Yamahata return -EBUSY;
170802ab5770SIsaku Yamahata
170902ab5770SIsaku Yamahata if (KVM_BUG_ON(err, kvm)) {
171002ab5770SIsaku Yamahata pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
171102ab5770SIsaku Yamahata return -EIO;
171202ab5770SIsaku Yamahata }
171302ab5770SIsaku Yamahata
171402ab5770SIsaku Yamahata return 0;
171502ab5770SIsaku Yamahata }
171602ab5770SIsaku Yamahata
1717eac0b72fSYan Zhao /*
1718eac0b72fSYan Zhao * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
1719eac0b72fSYan Zhao * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
1720eac0b72fSYan Zhao * successfully.
1721eac0b72fSYan Zhao *
1722eac0b72fSYan Zhao * Since tdh_mem_sept_add() must have been invoked successfully before a
1723eac0b72fSYan Zhao * non-leaf entry is present in the mirrored page table, the SEPT-zap-related
1724eac0b72fSYan Zhao * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
1725eac0b72fSYan Zhao * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
1726eac0b72fSYan Zhao * SEPT.
1727eac0b72fSYan Zhao *
1728eac0b72fSYan Zhao * Further check whether the entry returned from the SEPT walk has RWX
1729eac0b72fSYan Zhao * permissions, to filter out anything unexpected.
1730eac0b72fSYan Zhao *
1731eac0b72fSYan Zhao * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1732eac0b72fSYan Zhao * level_state returned from a SEAMCALL error is the same as that passed into
1733eac0b72fSYan Zhao * the SEAMCALL.
1734eac0b72fSYan Zhao */
1735eac0b72fSYan Zhao static bool tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1736eac0b72fSYan Zhao u64 entry, int level)
1737eac0b72fSYan Zhao {
1738eac0b72fSYan Zhao if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1739eac0b72fSYan Zhao return false;
1740eac0b72fSYan Zhao
1741eac0b72fSYan Zhao if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1742eac0b72fSYan Zhao return false;
1743eac0b72fSYan Zhao
1744eac0b72fSYan Zhao if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1745eac0b72fSYan Zhao return false;
1746eac0b72fSYan Zhao
1747eac0b72fSYan Zhao return true;
1748eac0b72fSYan Zhao }
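
/*
 * Example (illustrative): zapping a premapped GFN before the TD is
 * finalized makes TDH.MEM.RANGE.BLOCK fail with status
 * TDX_EPT_ENTRY_STATE_INCORRECT and operand ID RCX, while the walked
 * leaf entry has no RWX bits set; the helper above then returns true, so
 * the caller decrements nr_premapped instead of treating the error as
 * fatal.
 */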
1749eac0b72fSYan Zhao
175002ab5770SIsaku Yamahata static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1751eac0b72fSYan Zhao enum pg_level level, struct page *page)
175202ab5770SIsaku Yamahata {
175302ab5770SIsaku Yamahata int tdx_level = pg_level_to_tdx_sept_level(level);
175402ab5770SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
175502ab5770SIsaku Yamahata gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
175602ab5770SIsaku Yamahata u64 err, entry, level_state;
175702ab5770SIsaku Yamahata
175802ab5770SIsaku Yamahata /* For now, large pages aren't supported. */
175902ab5770SIsaku Yamahata WARN_ON_ONCE(level != PG_LEVEL_4K);
176002ab5770SIsaku Yamahata
176102ab5770SIsaku Yamahata err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1762eac0b72fSYan Zhao
17634b2abc49SYan Zhao if (unlikely(tdx_operand_busy(err))) {
17644b2abc49SYan Zhao /* Once no vCPUs can enter, the second retry is expected to succeed */
17654b2abc49SYan Zhao tdx_no_vcpus_enter_start(kvm);
17664b2abc49SYan Zhao err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
17674b2abc49SYan Zhao tdx_no_vcpus_enter_stop(kvm);
17684b2abc49SYan Zhao }
1769eac0b72fSYan Zhao if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1770eac0b72fSYan Zhao !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1771eac0b72fSYan Zhao atomic64_dec(&kvm_tdx->nr_premapped);
1772eac0b72fSYan Zhao tdx_unpin(kvm, page);
1773eac0b72fSYan Zhao return 0;
1774eac0b72fSYan Zhao }
1775eac0b72fSYan Zhao
177602ab5770SIsaku Yamahata if (KVM_BUG_ON(err, kvm)) {
177702ab5770SIsaku Yamahata pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
177802ab5770SIsaku Yamahata return -EIO;
177902ab5770SIsaku Yamahata }
1780eac0b72fSYan Zhao return 1;
178102ab5770SIsaku Yamahata }
178202ab5770SIsaku Yamahata
178322836e1dSIsaku Yamahata /*
178422836e1dSIsaku Yamahata * Ensure shared and private EPTs are flushed on all vCPUs.
178522836e1dSIsaku Yamahata * tdh_mem_track() is the only caller that increases TD epoch. An increase in
178622836e1dSIsaku Yamahata * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
178722836e1dSIsaku Yamahata * running in guest mode with the value "N - 1".
178822836e1dSIsaku Yamahata *
178922836e1dSIsaku Yamahata * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
179022836e1dSIsaku Yamahata * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
179122836e1dSIsaku Yamahata * is increased to "N + 1".
179222836e1dSIsaku Yamahata *
179322836e1dSIsaku Yamahata * Kicking off all vCPUs after that further ensures that no vCPU can run in
179422836e1dSIsaku Yamahata * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
179522836e1dSIsaku Yamahata * (e.g. to increase the TD epoch to "N + 2").
179622836e1dSIsaku Yamahata *
179722836e1dSIsaku Yamahata * The TDX module will flush the EPT on the next TD enter and make vCPUs run in
179822836e1dSIsaku Yamahata * guest mode with TD epoch value "N + 1".
179922836e1dSIsaku Yamahata *
180022836e1dSIsaku Yamahata * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
180122836e1dSIsaku Yamahata * waiting for the empty IPI handler ack_kick().
180222836e1dSIsaku Yamahata *
180322836e1dSIsaku Yamahata * No action is required from the vCPUs being kicked off, since the kick-off
180422836e1dSIsaku Yamahata * is guaranteed to occur after the TD epoch increment and before the next
180522836e1dSIsaku Yamahata * tdh_mem_track().
180622836e1dSIsaku Yamahata */
180702ab5770SIsaku Yamahata static void tdx_track(struct kvm *kvm)
180822836e1dSIsaku Yamahata {
180922836e1dSIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
181022836e1dSIsaku Yamahata u64 err;
181122836e1dSIsaku Yamahata
181222836e1dSIsaku Yamahata /* If the TD isn't finalized, no vCPU has run yet. */
181322836e1dSIsaku Yamahata if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
181422836e1dSIsaku Yamahata return;
181522836e1dSIsaku Yamahata
181622836e1dSIsaku Yamahata lockdep_assert_held_write(&kvm->mmu_lock);
181722836e1dSIsaku Yamahata
181822836e1dSIsaku Yamahata err = tdh_mem_track(&kvm_tdx->td);
18194b2abc49SYan Zhao if (unlikely(tdx_operand_busy(err))) {
18204b2abc49SYan Zhao /* Once no vCPUs can enter, the second retry is expected to succeed */
18214b2abc49SYan Zhao tdx_no_vcpus_enter_start(kvm);
18224b2abc49SYan Zhao err = tdh_mem_track(&kvm_tdx->td);
18234b2abc49SYan Zhao tdx_no_vcpus_enter_stop(kvm);
18244b2abc49SYan Zhao }
182522836e1dSIsaku Yamahata
182622836e1dSIsaku Yamahata if (KVM_BUG_ON(err, kvm))
182722836e1dSIsaku Yamahata pr_tdx_error(TDH_MEM_TRACK, err);
182822836e1dSIsaku Yamahata
182922836e1dSIsaku Yamahata kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
183022836e1dSIsaku Yamahata }
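
/*
 * Timeline sketch of the protocol described above (illustrative):
 *
 *	TD epoch == N    vCPUs that entered with epoch N-1 may still run
 *	tdh_mem_track()  epoch N -> N+1; fails while any vCPU runs with N-1
 *	kick all vCPUs   afterwards no vCPU is in guest mode with epoch N
 *	next TD enter    TDX module flushes TLBs; vCPUs run with epoch N+1
 */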
183122836e1dSIsaku Yamahata
183202ab5770SIsaku Yamahata int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
183302ab5770SIsaku Yamahata enum pg_level level, void *private_spt)
183402ab5770SIsaku Yamahata {
183502ab5770SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
183602ab5770SIsaku Yamahata
183702ab5770SIsaku Yamahata /*
183802ab5770SIsaku Yamahata * free_external_spt() is only called after the HKID is freed, when the
183902ab5770SIsaku Yamahata * TD is being torn down.
184002ab5770SIsaku Yamahata * KVM doesn't (yet) zap page table pages in the mirror page table while
184102ab5770SIsaku Yamahata * the TD is active, though guest pages mapped in the mirror page table
184202ab5770SIsaku Yamahata * could be zapped while the TD is active, e.g. for shared <-> private
184302ab5770SIsaku Yamahata * conversion and slot move/deletion.
184402ab5770SIsaku Yamahata */
184502ab5770SIsaku Yamahata if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
184602ab5770SIsaku Yamahata return -EINVAL;
184702ab5770SIsaku Yamahata
184802ab5770SIsaku Yamahata /*
184902ab5770SIsaku Yamahata * The HKID assigned to this TD was already freed and the cache was
185002ab5770SIsaku Yamahata * already flushed. We don't have to flush again.
185102ab5770SIsaku Yamahata */
185202ab5770SIsaku Yamahata return tdx_reclaim_page(virt_to_page(private_spt));
185302ab5770SIsaku Yamahata }
185402ab5770SIsaku Yamahata
185502ab5770SIsaku Yamahata int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
185602ab5770SIsaku Yamahata enum pg_level level, kvm_pfn_t pfn)
185702ab5770SIsaku Yamahata {
1858eac0b72fSYan Zhao struct page *page = pfn_to_page(pfn);
185902ab5770SIsaku Yamahata int ret;
186002ab5770SIsaku Yamahata
186102ab5770SIsaku Yamahata /*
186202ab5770SIsaku Yamahata * HKID is released after all private pages have been removed, and set
186302ab5770SIsaku Yamahata * before any might be populated. Warn if zapping is attempted when
186402ab5770SIsaku Yamahata * there can't be anything populated in the private EPT.
186502ab5770SIsaku Yamahata */
186602ab5770SIsaku Yamahata if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
186702ab5770SIsaku Yamahata return -EINVAL;
186802ab5770SIsaku Yamahata
1869eac0b72fSYan Zhao ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1870eac0b72fSYan Zhao if (ret <= 0)
187102ab5770SIsaku Yamahata return ret;
187202ab5770SIsaku Yamahata
187302ab5770SIsaku Yamahata /*
187402ab5770SIsaku Yamahata * TDX requires TLB tracking before dropping private page. Do
187502ab5770SIsaku Yamahata * it here, although it is also done later.
187602ab5770SIsaku Yamahata */
187702ab5770SIsaku Yamahata tdx_track(kvm);
187802ab5770SIsaku Yamahata
1879eac0b72fSYan Zhao return tdx_sept_drop_private_spte(kvm, gfn, level, page);
188002ab5770SIsaku Yamahata }
188102ab5770SIsaku Yamahata
188224c12911SIsaku Yamahata void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
188324c12911SIsaku Yamahata int trig_mode, int vector)
188424c12911SIsaku Yamahata {
188524c12911SIsaku Yamahata struct kvm_vcpu *vcpu = apic->vcpu;
188624c12911SIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
188724c12911SIsaku Yamahata
188824c12911SIsaku Yamahata /* TDX supports only posted interrupts. No local APIC emulation. */
188924c12911SIsaku Yamahata __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
189024c12911SIsaku Yamahata
189124c12911SIsaku Yamahata trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
189224c12911SIsaku Yamahata }
189324c12911SIsaku Yamahata
1894e6a85781SYan Zhao static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1895e6a85781SYan Zhao {
1896e6a85781SYan Zhao u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1897e6a85781SYan Zhao u64 eq = vmx_get_exit_qual(vcpu);
1898e6a85781SYan Zhao
1899e6a85781SYan Zhao if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1900e6a85781SYan Zhao return false;
1901e6a85781SYan Zhao
1902fd02aa45SPaolo Bonzini return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1903e6a85781SYan Zhao }
1904e6a85781SYan Zhao
1905da407fe4SIsaku Yamahata static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1906da407fe4SIsaku Yamahata {
1907da407fe4SIsaku Yamahata unsigned long exit_qual;
1908da407fe4SIsaku Yamahata gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1909b0327bb2SYan Zhao bool local_retry = false;
1910b0327bb2SYan Zhao int ret;
1911da407fe4SIsaku Yamahata
1912da407fe4SIsaku Yamahata if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1913e6a85781SYan Zhao if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1914e6a85781SYan Zhao pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1915e6a85781SYan Zhao gpa, vcpu->vcpu_id);
1916e6a85781SYan Zhao kvm_vm_dead(vcpu->kvm);
1917e6a85781SYan Zhao return -EIO;
1918e6a85781SYan Zhao }
1919da407fe4SIsaku Yamahata /*
1920da407fe4SIsaku Yamahata * Always treat SEPT violations as write faults. Ignore the
1921da407fe4SIsaku Yamahata * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1922da407fe4SIsaku Yamahata * TD private pages are always RWX in the SEPT tables,
1923da407fe4SIsaku Yamahata * i.e. they're always mapped writable. Just as importantly,
1924da407fe4SIsaku Yamahata * treating SEPT violations as write faults is necessary to
1925da407fe4SIsaku Yamahata * avoid COW allocations, which will cause TDAUGPAGE failures
1926da407fe4SIsaku Yamahata * due to aliasing a single HPA to multiple GPAs.
1927da407fe4SIsaku Yamahata */
1928da407fe4SIsaku Yamahata exit_qual = EPT_VIOLATION_ACC_WRITE;
1929b0327bb2SYan Zhao
1930b0327bb2SYan Zhao /* Only a private GPA triggers zero-step mitigation */
1931b0327bb2SYan Zhao local_retry = true;
1932da407fe4SIsaku Yamahata } else {
1933da407fe4SIsaku Yamahata exit_qual = vmx_get_exit_qual(vcpu);
1934da407fe4SIsaku Yamahata /*
1935da407fe4SIsaku Yamahata * EPT violation due to instruction fetch should never be
1936da407fe4SIsaku Yamahata * triggered from shared memory in a TDX guest. If such an EPT
1937da407fe4SIsaku Yamahata * violation occurs, treat it as broken hardware.
1938da407fe4SIsaku Yamahata */
1939da407fe4SIsaku Yamahata if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1940da407fe4SIsaku Yamahata return -EIO;
1941da407fe4SIsaku Yamahata }
1942da407fe4SIsaku Yamahata
1943da407fe4SIsaku Yamahata trace_kvm_page_fault(vcpu, gpa, exit_qual);
1944b0327bb2SYan Zhao
1945b0327bb2SYan Zhao /*
1946b0327bb2SYan Zhao * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1947b0327bb2SYan Zhao * mapping in TDX.
1948b0327bb2SYan Zhao *
1949b0327bb2SYan Zhao * KVM may return RET_PF_RETRY for private GPA due to
1950b0327bb2SYan Zhao * - contentions when atomically updating SPTEs of the mirror page table,
1951b0327bb2SYan Zhao * - in-progress GFN invalidation or memslot removal,
1952b0327bb2SYan Zhao * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1953b0327bb2SYan Zhao * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1954b0327bb2SYan Zhao * or certain TDCALLs.
1955b0327bb2SYan Zhao *
1956b0327bb2SYan Zhao * If TDH.VP.ENTER is invoked more times than the threshold set by the
1957b0327bb2SYan Zhao * TDX module before KVM resolves the private GPA mapping, the TDX
1958b0327bb2SYan Zhao * module will activate zero-step mitigation during TDH.VP.ENTER. This
1959b0327bb2SYan Zhao * process acquires an SEPT tree lock in the TDX module, leading to
1960b0327bb2SYan Zhao * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1961b0327bb2SYan Zhao * operations on other vCPUs.
1962b0327bb2SYan Zhao *
1963b0327bb2SYan Zhao * Breaking out of local retries for kvm_vcpu_has_events() is for
1964b0327bb2SYan Zhao * interrupt injection. kvm_vcpu_has_events() should not see pending
1965b0327bb2SYan Zhao * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1966b0327bb2SYan Zhao * blocked by TDs, false positives are inevitable, i.e., KVM may re-enter
1967b0327bb2SYan Zhao * the guest even if the IRQ/NMI can't be delivered.
1968b0327bb2SYan Zhao *
1969b0327bb2SYan Zhao * Note: even without breaking out of local retries, zero-step
1970b0327bb2SYan Zhao * mitigation may still occur due to
1971b0327bb2SYan Zhao * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1972b0327bb2SYan Zhao * - a single RIP causing EPT violations for more GFNs than the
1973b0327bb2SYan Zhao * threshold count.
1974b0327bb2SYan Zhao * This is safe, as triggering zero-step mitigation only introduces
1975b0327bb2SYan Zhao * contentions to page installation SEAMCALLs on other vCPUs, which will
1976b0327bb2SYan Zhao * handle retries locally in their EPT violation handlers.
1977b0327bb2SYan Zhao */
1978b0327bb2SYan Zhao while (1) {
1979b0327bb2SYan Zhao ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
1980b0327bb2SYan Zhao
1981b0327bb2SYan Zhao if (ret != RET_PF_RETRY || !local_retry)
1982b0327bb2SYan Zhao break;
1983b0327bb2SYan Zhao
1984b0327bb2SYan Zhao if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
1985b0327bb2SYan Zhao break;
1986b0327bb2SYan Zhao
1987b0327bb2SYan Zhao if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
1988b0327bb2SYan Zhao ret = -EIO;
1989b0327bb2SYan Zhao break;
1990b0327bb2SYan Zhao }
1991b0327bb2SYan Zhao
1992b0327bb2SYan Zhao cond_resched();
1993b0327bb2SYan Zhao }
1994b0327bb2SYan Zhao return ret;
1995da407fe4SIsaku Yamahata }
1996da407fe4SIsaku Yamahata
1997081385dbSIsaku Yamahata int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1998081385dbSIsaku Yamahata {
1999081385dbSIsaku Yamahata if (err) {
2000081385dbSIsaku Yamahata tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2001081385dbSIsaku Yamahata return 1;
2002081385dbSIsaku Yamahata }
2003081385dbSIsaku Yamahata
2004081385dbSIsaku Yamahata if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2005081385dbSIsaku Yamahata tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2006081385dbSIsaku Yamahata
2007081385dbSIsaku Yamahata return 1;
2008081385dbSIsaku Yamahata }
2009081385dbSIsaku Yamahata
2011095b71a0SIsaku Yamahata int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2012095b71a0SIsaku Yamahata {
2013095b71a0SIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
2014095b71a0SIsaku Yamahata u64 vp_enter_ret = tdx->vp_enter_ret;
2015095b71a0SIsaku Yamahata union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2016095b71a0SIsaku Yamahata
2017095b71a0SIsaku Yamahata if (fastpath != EXIT_FASTPATH_NONE)
2018095b71a0SIsaku Yamahata return 1;
2019095b71a0SIsaku Yamahata
2020da407fe4SIsaku Yamahata if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2021da407fe4SIsaku Yamahata KVM_BUG_ON(1, vcpu->kvm);
2022da407fe4SIsaku Yamahata return -EIO;
2023da407fe4SIsaku Yamahata }
2024da407fe4SIsaku Yamahata
2025095b71a0SIsaku Yamahata /*
2026095b71a0SIsaku Yamahata * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2027095b71a0SIsaku Yamahata * TDX_SEAMCALL_VMFAILINVALID.
2028095b71a0SIsaku Yamahata */
2029095b71a0SIsaku Yamahata if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2030095b71a0SIsaku Yamahata KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2031095b71a0SIsaku Yamahata goto unhandled_exit;
2032095b71a0SIsaku Yamahata }
2033095b71a0SIsaku Yamahata
2034095b71a0SIsaku Yamahata if (unlikely(tdx_failed_vmentry(vcpu))) {
2035095b71a0SIsaku Yamahata /*
2036095b71a0SIsaku Yamahata * If the guest state is protected, that means off-TD debug is
2037095b71a0SIsaku Yamahata		 * not enabled, so TDX_NON_RECOVERABLE must be set.
2038095b71a0SIsaku Yamahata */
2039095b71a0SIsaku Yamahata WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2040095b71a0SIsaku Yamahata !(vp_enter_ret & TDX_NON_RECOVERABLE));
2041095b71a0SIsaku Yamahata vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2042095b71a0SIsaku Yamahata vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2043095b71a0SIsaku Yamahata vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2044095b71a0SIsaku Yamahata return 0;
2045095b71a0SIsaku Yamahata }
2046095b71a0SIsaku Yamahata
2047095b71a0SIsaku Yamahata if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2048095b71a0SIsaku Yamahata exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2049095b71a0SIsaku Yamahata kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2050095b71a0SIsaku Yamahata goto unhandled_exit;
2051095b71a0SIsaku Yamahata }
2052095b71a0SIsaku Yamahata
2053095b71a0SIsaku Yamahata WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2054095b71a0SIsaku Yamahata (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2055095b71a0SIsaku Yamahata
2056095b71a0SIsaku Yamahata switch (exit_reason.basic) {
2057095b71a0SIsaku Yamahata case EXIT_REASON_TRIPLE_FAULT:
2058095b71a0SIsaku Yamahata vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2059095b71a0SIsaku Yamahata vcpu->mmio_needed = 0;
2060095b71a0SIsaku Yamahata return 0;
2061f30cb642SIsaku Yamahata case EXIT_REASON_EXCEPTION_NMI:
2062f30cb642SIsaku Yamahata return tdx_handle_exception_nmi(vcpu);
2063f30cb642SIsaku Yamahata case EXIT_REASON_EXTERNAL_INTERRUPT:
2064f30cb642SIsaku Yamahata ++vcpu->stat.irq_exits;
2065f30cb642SIsaku Yamahata return 1;
20663bf31b57SIsaku Yamahata case EXIT_REASON_CPUID:
20673bf31b57SIsaku Yamahata return tdx_emulate_cpuid(vcpu);
20685cf7239bSIsaku Yamahata case EXIT_REASON_HLT:
20695cf7239bSIsaku Yamahata return kvm_emulate_halt_noskip(vcpu);
2070c42856afSIsaku Yamahata case EXIT_REASON_TDCALL:
2071c42856afSIsaku Yamahata return handle_tdvmcall(vcpu);
2072d5998c02SIsaku Yamahata case EXIT_REASON_VMCALL:
2073d5998c02SIsaku Yamahata return tdx_emulate_vmcall(vcpu);
207433608aafSIsaku Yamahata case EXIT_REASON_IO_INSTRUCTION:
207533608aafSIsaku Yamahata return tdx_emulate_io(vcpu);
2076081385dbSIsaku Yamahata case EXIT_REASON_MSR_READ:
2077081385dbSIsaku Yamahata kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2078081385dbSIsaku Yamahata return kvm_emulate_rdmsr(vcpu);
2079081385dbSIsaku Yamahata case EXIT_REASON_MSR_WRITE:
2080081385dbSIsaku Yamahata kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2081081385dbSIsaku Yamahata kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2082081385dbSIsaku Yamahata kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2083081385dbSIsaku Yamahata return kvm_emulate_wrmsr(vcpu);
2084bb723bebSSean Christopherson case EXIT_REASON_EPT_MISCONFIG:
2085bb723bebSSean Christopherson return tdx_emulate_mmio(vcpu);
2086da407fe4SIsaku Yamahata case EXIT_REASON_EPT_VIOLATION:
2087da407fe4SIsaku Yamahata return tdx_handle_ept_violation(vcpu);
20886c441e4dSIsaku Yamahata case EXIT_REASON_OTHER_SMI:
20896c441e4dSIsaku Yamahata /*
20906c441e4dSIsaku Yamahata		 * Unlike VMX, an SMI in SEAM non-root mode (i.e. when the
20916c441e4dSIsaku Yamahata		 * TD guest vCPU is running) causes a VM exit to the TDX module,
20926c441e4dSIsaku Yamahata		 * then a SEAMRET to KVM. Once it exits to KVM, the SMI is
20936c441e4dSIsaku Yamahata		 * delivered and handled by the kernel handler right away.
20946c441e4dSIsaku Yamahata *
20956c441e4dSIsaku Yamahata * The Other SMI exit can also be caused by the SEAM non-root
20966c441e4dSIsaku Yamahata * machine check delivered via Machine Check System Management
20976c441e4dSIsaku Yamahata * Interrupt (MSMI), but it has already been handled by the
20986c441e4dSIsaku Yamahata * kernel machine check handler, i.e., the memory page has been
20996c441e4dSIsaku Yamahata * marked as poisoned and it won't be freed to the free list
21006c441e4dSIsaku Yamahata * when the TDX guest is terminated (the TDX module marks the
21016c441e4dSIsaku Yamahata		 * guest as dead and prevents it from further running when
21026c441e4dSIsaku Yamahata * machine check happens in SEAM non-root).
21036c441e4dSIsaku Yamahata *
21046c441e4dSIsaku Yamahata		 * - An MSMI will not reach here; it's handled as the non-recoverable
21056c441e4dSIsaku Yamahata * case above.
21066c441e4dSIsaku Yamahata * - If it's not an MSMI, no need to do anything here.
21076c441e4dSIsaku Yamahata */
21086c441e4dSIsaku Yamahata return 1;
2109095b71a0SIsaku Yamahata default:
2110095b71a0SIsaku Yamahata break;
2111095b71a0SIsaku Yamahata }
2112095b71a0SIsaku Yamahata
2113095b71a0SIsaku Yamahata unhandled_exit:
2114095b71a0SIsaku Yamahata vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2115095b71a0SIsaku Yamahata vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2116095b71a0SIsaku Yamahata vcpu->run->internal.ndata = 2;
2117095b71a0SIsaku Yamahata vcpu->run->internal.data[0] = vp_enter_ret;
2118095b71a0SIsaku Yamahata vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2119095b71a0SIsaku Yamahata return 0;
2120095b71a0SIsaku Yamahata }
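/*
 * Illustrative note (a sketch of the repacking above, not new logic): for
 * the MSR exits handled in tdx_handle_exit(), the TDVMCALL arguments are
 * r12 = MSR index and, for writes, r13 = 64-bit value. KVM repacks them
 * into the RCX/RAX/RDX layout the common RDMSR/WRMSR emulation expects:
 *
 *	kvm_rcx_write(vcpu, args->r12);		- MSR index
 *	kvm_rax_write(vcpu, args->r13 & -1u);	- value, low 32 bits
 *	kvm_rdx_write(vcpu, args->r13 >> 32);	- value, high 32 bits
 *
 * where "args" stands for tdx->vp_enter_args; the "& -1u" makes the
 * 32-bit truncation explicit.
 */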
2121095b71a0SIsaku Yamahata
2122095b71a0SIsaku Yamahata void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2123095b71a0SIsaku Yamahata u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2124095b71a0SIsaku Yamahata {
2125095b71a0SIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
2126095b71a0SIsaku Yamahata
2127095b71a0SIsaku Yamahata *reason = tdx->vt.exit_reason.full;
2128095b71a0SIsaku Yamahata if (*reason != -1u) {
2129095b71a0SIsaku Yamahata *info1 = vmx_get_exit_qual(vcpu);
2130095b71a0SIsaku Yamahata *info2 = tdx->ext_exit_qualification;
2131095b71a0SIsaku Yamahata *intr_info = vmx_get_intr_info(vcpu);
2132095b71a0SIsaku Yamahata } else {
2133095b71a0SIsaku Yamahata *info1 = 0;
2134095b71a0SIsaku Yamahata *info2 = 0;
2135095b71a0SIsaku Yamahata *intr_info = 0;
2136095b71a0SIsaku Yamahata }
2137095b71a0SIsaku Yamahata
2138095b71a0SIsaku Yamahata *error_code = 0;
2139095b71a0SIsaku Yamahata }
2140095b71a0SIsaku Yamahata
2141dd50294fSIsaku Yamahata bool tdx_has_emulated_msr(u32 index)
2142dd50294fSIsaku Yamahata {
2143dd50294fSIsaku Yamahata switch (index) {
2144dd50294fSIsaku Yamahata case MSR_IA32_UCODE_REV:
2145dd50294fSIsaku Yamahata case MSR_IA32_ARCH_CAPABILITIES:
2146dd50294fSIsaku Yamahata case MSR_IA32_POWER_CTL:
2147dd50294fSIsaku Yamahata case MSR_IA32_CR_PAT:
214826eab9aeSBinbin Wu case MSR_MTRRcap:
214926eab9aeSBinbin Wu case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
215026eab9aeSBinbin Wu case MSR_MTRRdefType:
2151dd50294fSIsaku Yamahata case MSR_IA32_TSC_DEADLINE:
2152dd50294fSIsaku Yamahata case MSR_IA32_MISC_ENABLE:
2153dd50294fSIsaku Yamahata case MSR_PLATFORM_INFO:
2154dd50294fSIsaku Yamahata case MSR_MISC_FEATURES_ENABLES:
2155dd50294fSIsaku Yamahata case MSR_IA32_APICBASE:
2156dd50294fSIsaku Yamahata case MSR_EFER:
21579fc3402aSIsaku Yamahata case MSR_IA32_FEAT_CTL:
2158dd50294fSIsaku Yamahata case MSR_IA32_MCG_CAP:
2159dd50294fSIsaku Yamahata case MSR_IA32_MCG_STATUS:
2160dd50294fSIsaku Yamahata case MSR_IA32_MCG_CTL:
2161dd50294fSIsaku Yamahata case MSR_IA32_MCG_EXT_CTL:
2162dd50294fSIsaku Yamahata case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2163dd50294fSIsaku Yamahata case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2164dd50294fSIsaku Yamahata /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2165dd50294fSIsaku Yamahata case MSR_KVM_POLL_CONTROL:
2166dd50294fSIsaku Yamahata return true;
2167dd50294fSIsaku Yamahata case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2168dd50294fSIsaku Yamahata /*
2169dd50294fSIsaku Yamahata * x2APIC registers that are virtualized by the CPU can't be
2170dd50294fSIsaku Yamahata		 * emulated, as KVM doesn't have access to the virtual APIC page.
2171dd50294fSIsaku Yamahata */
2172dd50294fSIsaku Yamahata switch (index) {
2173dd50294fSIsaku Yamahata case X2APIC_MSR(APIC_TASKPRI):
2174dd50294fSIsaku Yamahata case X2APIC_MSR(APIC_PROCPRI):
2175dd50294fSIsaku Yamahata case X2APIC_MSR(APIC_EOI):
2176dd50294fSIsaku Yamahata case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2177dd50294fSIsaku Yamahata case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2178dd50294fSIsaku Yamahata case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2179dd50294fSIsaku Yamahata return false;
2180dd50294fSIsaku Yamahata default:
2181dd50294fSIsaku Yamahata return true;
2182dd50294fSIsaku Yamahata }
2183dd50294fSIsaku Yamahata default:
2184dd50294fSIsaku Yamahata return false;
2185dd50294fSIsaku Yamahata }
2186dd50294fSIsaku Yamahata }
2187dd50294fSIsaku Yamahata
2188dd50294fSIsaku Yamahata static bool tdx_is_read_only_msr(u32 index)
2189dd50294fSIsaku Yamahata {
21909fc3402aSIsaku Yamahata return index == MSR_IA32_APICBASE || index == MSR_EFER ||
21919fc3402aSIsaku Yamahata index == MSR_IA32_FEAT_CTL;
2192dd50294fSIsaku Yamahata }
2193dd50294fSIsaku Yamahata
2194dd50294fSIsaku Yamahata int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2195dd50294fSIsaku Yamahata {
21969fc3402aSIsaku Yamahata switch (msr->index) {
21979fc3402aSIsaku Yamahata case MSR_IA32_FEAT_CTL:
21989fc3402aSIsaku Yamahata /*
21999fc3402aSIsaku Yamahata		 * MCE and MCA are advertised via CPUID. The guest kernel can
22009fc3402aSIsaku Yamahata		 * check whether LMCE is enabled.
22019fc3402aSIsaku Yamahata */
22029fc3402aSIsaku Yamahata msr->data = FEAT_CTL_LOCKED;
22039fc3402aSIsaku Yamahata if (vcpu->arch.mcg_cap & MCG_LMCE_P)
22049fc3402aSIsaku Yamahata msr->data |= FEAT_CTL_LMCE_ENABLED;
22059fc3402aSIsaku Yamahata return 0;
22069fc3402aSIsaku Yamahata case MSR_IA32_MCG_EXT_CTL:
22079fc3402aSIsaku Yamahata if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
22089fc3402aSIsaku Yamahata return 1;
22099fc3402aSIsaku Yamahata msr->data = vcpu->arch.mcg_ext_ctl;
22109fc3402aSIsaku Yamahata return 0;
22119fc3402aSIsaku Yamahata default:
2212dd50294fSIsaku Yamahata if (!tdx_has_emulated_msr(msr->index))
2213dd50294fSIsaku Yamahata return 1;
2214dd50294fSIsaku Yamahata
2215dd50294fSIsaku Yamahata return kvm_get_msr_common(vcpu, msr);
2216dd50294fSIsaku Yamahata }
22179fc3402aSIsaku Yamahata }
2218dd50294fSIsaku Yamahata
2219dd50294fSIsaku Yamahata int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2220dd50294fSIsaku Yamahata {
22219fc3402aSIsaku Yamahata switch (msr->index) {
22229fc3402aSIsaku Yamahata case MSR_IA32_MCG_EXT_CTL:
22239fc3402aSIsaku Yamahata if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
22249fc3402aSIsaku Yamahata (msr->data & ~MCG_EXT_CTL_LMCE_EN))
22259fc3402aSIsaku Yamahata return 1;
22269fc3402aSIsaku Yamahata vcpu->arch.mcg_ext_ctl = msr->data;
22279fc3402aSIsaku Yamahata return 0;
22289fc3402aSIsaku Yamahata default:
2229dd50294fSIsaku Yamahata if (tdx_is_read_only_msr(msr->index))
2230dd50294fSIsaku Yamahata return 1;
2231dd50294fSIsaku Yamahata
2232dd50294fSIsaku Yamahata if (!tdx_has_emulated_msr(msr->index))
2233dd50294fSIsaku Yamahata return 1;
2234dd50294fSIsaku Yamahata
2235dd50294fSIsaku Yamahata return kvm_set_msr_common(vcpu, msr);
2236dd50294fSIsaku Yamahata }
22379fc3402aSIsaku Yamahata }
2238dd50294fSIsaku Yamahata
223961bb2827SIsaku Yamahata static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
224061bb2827SIsaku Yamahata {
224161bb2827SIsaku Yamahata const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
224261bb2827SIsaku Yamahata struct kvm_tdx_capabilities __user *user_caps;
224361bb2827SIsaku Yamahata struct kvm_tdx_capabilities *caps = NULL;
224461bb2827SIsaku Yamahata int ret = 0;
224561bb2827SIsaku Yamahata
224661bb2827SIsaku Yamahata /* flags is reserved for future use */
224761bb2827SIsaku Yamahata if (cmd->flags)
224861bb2827SIsaku Yamahata return -EINVAL;
224961bb2827SIsaku Yamahata
225061bb2827SIsaku Yamahata caps = kmalloc(sizeof(*caps) +
225161bb2827SIsaku Yamahata sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
225261bb2827SIsaku Yamahata GFP_KERNEL);
225361bb2827SIsaku Yamahata if (!caps)
225461bb2827SIsaku Yamahata return -ENOMEM;
225561bb2827SIsaku Yamahata
225661bb2827SIsaku Yamahata user_caps = u64_to_user_ptr(cmd->data);
225761bb2827SIsaku Yamahata if (copy_from_user(caps, user_caps, sizeof(*caps))) {
225861bb2827SIsaku Yamahata ret = -EFAULT;
225961bb2827SIsaku Yamahata goto out;
226061bb2827SIsaku Yamahata }
226161bb2827SIsaku Yamahata
226261bb2827SIsaku Yamahata if (caps->cpuid.nent < td_conf->num_cpuid_config) {
226361bb2827SIsaku Yamahata ret = -E2BIG;
226461bb2827SIsaku Yamahata goto out;
226561bb2827SIsaku Yamahata }
226661bb2827SIsaku Yamahata
226761bb2827SIsaku Yamahata ret = init_kvm_tdx_caps(td_conf, caps);
226861bb2827SIsaku Yamahata if (ret)
226961bb2827SIsaku Yamahata goto out;
227061bb2827SIsaku Yamahata
227161bb2827SIsaku Yamahata if (copy_to_user(user_caps, caps, sizeof(*caps))) {
227261bb2827SIsaku Yamahata ret = -EFAULT;
227361bb2827SIsaku Yamahata goto out;
227461bb2827SIsaku Yamahata }
227561bb2827SIsaku Yamahata
227661bb2827SIsaku Yamahata if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
227761bb2827SIsaku Yamahata caps->cpuid.nent *
227861bb2827SIsaku Yamahata sizeof(caps->cpuid.entries[0])))
227961bb2827SIsaku Yamahata ret = -EFAULT;
228061bb2827SIsaku Yamahata
228161bb2827SIsaku Yamahata out:
228261bb2827SIsaku Yamahata /* kfree() accepts NULL. */
228361bb2827SIsaku Yamahata kfree(caps);
228461bb2827SIsaku Yamahata return ret;
228561bb2827SIsaku Yamahata }
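/*
 * Illustrative userspace sketch (assumed flow, not kernel code): the
 * caller sizes caps->cpuid.nent up front and issues KVM_TDX_CAPABILITIES
 * through the KVM_MEMORY_ENCRYPT_OP ioctl on the VM fd; E2BIG indicates
 * nent was smaller than the module's num_cpuid_config:
 *
 *	struct kvm_tdx_cmd cmd = {
 *		.id = KVM_TDX_CAPABILITIES,
 *		.data = (__u64)(unsigned long)caps,
 *	};
 *	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0 && errno == E2BIG)
 *		... grow caps->cpuid.nent and retry ...
 */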
228661bb2827SIsaku Yamahata
22870186dd29SIsaku Yamahata /*
22880186dd29SIsaku Yamahata  * KVM reports the guest physical address width in CPUID.0x80000008.EAX[23:16], which is
22890186dd29SIsaku Yamahata * similar to TDX's GPAW. Use this field as the interface for userspace to
22900186dd29SIsaku Yamahata * configure the GPAW and EPT level for TDs.
22910186dd29SIsaku Yamahata *
22920186dd29SIsaku Yamahata * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
22930186dd29SIsaku Yamahata  * 5, value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
22940186dd29SIsaku Yamahata  * supported. Value 52 is only supported when the platform supports 5-level
22950186dd29SIsaku Yamahata  * EPT.
22960186dd29SIsaku Yamahata */
22970186dd29SIsaku Yamahata static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
22980186dd29SIsaku Yamahata struct td_params *td_params)
22990186dd29SIsaku Yamahata {
23000186dd29SIsaku Yamahata const struct kvm_cpuid_entry2 *entry;
23010186dd29SIsaku Yamahata int guest_pa;
23020186dd29SIsaku Yamahata
23030186dd29SIsaku Yamahata entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
23040186dd29SIsaku Yamahata if (!entry)
23050186dd29SIsaku Yamahata return -EINVAL;
23060186dd29SIsaku Yamahata
23070186dd29SIsaku Yamahata guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
23080186dd29SIsaku Yamahata
23090186dd29SIsaku Yamahata if (guest_pa != 48 && guest_pa != 52)
23100186dd29SIsaku Yamahata return -EINVAL;
23110186dd29SIsaku Yamahata
23120186dd29SIsaku Yamahata if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
23130186dd29SIsaku Yamahata return -EINVAL;
23140186dd29SIsaku Yamahata
23150186dd29SIsaku Yamahata td_params->eptp_controls = VMX_EPTP_MT_WB;
23160186dd29SIsaku Yamahata if (guest_pa == 52) {
23170186dd29SIsaku Yamahata td_params->eptp_controls |= VMX_EPTP_PWL_5;
23180186dd29SIsaku Yamahata td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
23190186dd29SIsaku Yamahata } else {
23200186dd29SIsaku Yamahata td_params->eptp_controls |= VMX_EPTP_PWL_4;
23210186dd29SIsaku Yamahata }
23220186dd29SIsaku Yamahata
23230186dd29SIsaku Yamahata return 0;
23240186dd29SIsaku Yamahata }
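/*
 * Worked example: guest CPUID.0x80000008.EAX[23:16] == 52 on a host with
 * 5-level EPT yields eptp_controls = VMX_EPTP_MT_WB | VMX_EPTP_PWL_5 and
 * sets TDX_CONFIG_FLAGS_MAX_GPAW; a value of 48 yields
 * VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 with MAX_GPAW left clear.
 */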
23250186dd29SIsaku Yamahata
23260186dd29SIsaku Yamahata static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
23270186dd29SIsaku Yamahata struct td_params *td_params)
23280186dd29SIsaku Yamahata {
23290186dd29SIsaku Yamahata const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
23300186dd29SIsaku Yamahata const struct kvm_cpuid_entry2 *entry;
23310186dd29SIsaku Yamahata struct tdx_cpuid_value *value;
23320186dd29SIsaku Yamahata int i, copy_cnt = 0;
23330186dd29SIsaku Yamahata
23340186dd29SIsaku Yamahata /*
23350186dd29SIsaku Yamahata * td_params.cpuid_values: The number and the order of cpuid_value must
23360186dd29SIsaku Yamahata	 * be the same as those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
23370186dd29SIsaku Yamahata * It's assumed that td_params was zeroed.
23380186dd29SIsaku Yamahata */
23390186dd29SIsaku Yamahata for (i = 0; i < td_conf->num_cpuid_config; i++) {
23400186dd29SIsaku Yamahata struct kvm_cpuid_entry2 tmp;
23410186dd29SIsaku Yamahata
23420186dd29SIsaku Yamahata td_init_cpuid_entry2(&tmp, i);
23430186dd29SIsaku Yamahata
23440186dd29SIsaku Yamahata entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
23450186dd29SIsaku Yamahata tmp.function, tmp.index);
23460186dd29SIsaku Yamahata if (!entry)
23470186dd29SIsaku Yamahata continue;
23480186dd29SIsaku Yamahata
23496d415778SAdrian Hunter if (tdx_unsupported_cpuid(entry))
23506d415778SAdrian Hunter return -EINVAL;
23516d415778SAdrian Hunter
23520186dd29SIsaku Yamahata copy_cnt++;
23530186dd29SIsaku Yamahata
23540186dd29SIsaku Yamahata value = &td_params->cpuid_values[i];
23550186dd29SIsaku Yamahata value->eax = entry->eax;
23560186dd29SIsaku Yamahata value->ebx = entry->ebx;
23570186dd29SIsaku Yamahata value->ecx = entry->ecx;
23580186dd29SIsaku Yamahata value->edx = entry->edx;
23590186dd29SIsaku Yamahata
23600186dd29SIsaku Yamahata /*
23610186dd29SIsaku Yamahata * TDX module does not accept nonzero bits 16..23 for the
23620186dd29SIsaku Yamahata * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
23630186dd29SIsaku Yamahata */
23640186dd29SIsaku Yamahata if (tmp.function == 0x80000008)
23650186dd29SIsaku Yamahata value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
23660186dd29SIsaku Yamahata }
23670186dd29SIsaku Yamahata
23680186dd29SIsaku Yamahata /*
23690186dd29SIsaku Yamahata * Rely on the TDX module to reject invalid configuration, but it can't
23700186dd29SIsaku Yamahata	 * check leaves that don't have a proper slot in td_params->cpuid_values
23710186dd29SIsaku Yamahata	 * to stick them in. So fail if there were entries that didn't get copied to
23720186dd29SIsaku Yamahata * td_params.
23730186dd29SIsaku Yamahata */
23740186dd29SIsaku Yamahata if (copy_cnt != cpuid->nent)
23750186dd29SIsaku Yamahata return -EINVAL;
23760186dd29SIsaku Yamahata
23770186dd29SIsaku Yamahata return 0;
23780186dd29SIsaku Yamahata }
23790186dd29SIsaku Yamahata
23800186dd29SIsaku Yamahata static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
23810186dd29SIsaku Yamahata struct kvm_tdx_init_vm *init_vm)
23820186dd29SIsaku Yamahata {
23830186dd29SIsaku Yamahata const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
23840186dd29SIsaku Yamahata struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
23850186dd29SIsaku Yamahata int ret;
23860186dd29SIsaku Yamahata
23870186dd29SIsaku Yamahata if (kvm->created_vcpus)
23880186dd29SIsaku Yamahata return -EBUSY;
23890186dd29SIsaku Yamahata
23900186dd29SIsaku Yamahata if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
23910186dd29SIsaku Yamahata return -EINVAL;
23920186dd29SIsaku Yamahata
23930186dd29SIsaku Yamahata if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
23940186dd29SIsaku Yamahata return -EINVAL;
23950186dd29SIsaku Yamahata
23960186dd29SIsaku Yamahata td_params->max_vcpus = kvm->max_vcpus;
23970186dd29SIsaku Yamahata td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
23980186dd29SIsaku Yamahata td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
23990186dd29SIsaku Yamahata
24000186dd29SIsaku Yamahata td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
24010186dd29SIsaku Yamahata td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
24020186dd29SIsaku Yamahata
24030186dd29SIsaku Yamahata ret = setup_tdparams_eptp_controls(cpuid, td_params);
24040186dd29SIsaku Yamahata if (ret)
24050186dd29SIsaku Yamahata return ret;
24060186dd29SIsaku Yamahata
24070186dd29SIsaku Yamahata ret = setup_tdparams_cpuids(cpuid, td_params);
24080186dd29SIsaku Yamahata if (ret)
24090186dd29SIsaku Yamahata return ret;
24100186dd29SIsaku Yamahata
24110186dd29SIsaku Yamahata #define MEMCPY_SAME_SIZE(dst, src) \
24120186dd29SIsaku Yamahata do { \
24130186dd29SIsaku Yamahata BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
24140186dd29SIsaku Yamahata memcpy((dst), (src), sizeof(dst)); \
24150186dd29SIsaku Yamahata } while (0)
24160186dd29SIsaku Yamahata
24170186dd29SIsaku Yamahata MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
24180186dd29SIsaku Yamahata MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
24190186dd29SIsaku Yamahata MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
24200186dd29SIsaku Yamahata
24210186dd29SIsaku Yamahata return 0;
24220186dd29SIsaku Yamahata }
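/*
 * Note on the fixed-1 ORs above: attributes_fixed1/xfam_fixed1 are the
 * bits the TDX module requires to be set, so a user's request can only
 * add to, never clear, the mandatory set. Illustrative (made-up) values:
 * init_vm->xfam = 0x3 with xfam_fixed1 = 0x6 gives td_params->xfam = 0x7.
 */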
24230186dd29SIsaku Yamahata
24240186dd29SIsaku Yamahata static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
24250186dd29SIsaku Yamahata u64 *seamcall_err)
24268d032b68SIsaku Yamahata {
24278d032b68SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
24288d032b68SIsaku Yamahata cpumask_var_t packages;
24298d032b68SIsaku Yamahata struct page **tdcs_pages = NULL;
24308d032b68SIsaku Yamahata struct page *tdr_page;
24318d032b68SIsaku Yamahata int ret, i;
24320186dd29SIsaku Yamahata u64 err, rcx;
24338d032b68SIsaku Yamahata
24340186dd29SIsaku Yamahata *seamcall_err = 0;
24358d032b68SIsaku Yamahata ret = tdx_guest_keyid_alloc();
24368d032b68SIsaku Yamahata if (ret < 0)
24378d032b68SIsaku Yamahata return ret;
24388d032b68SIsaku Yamahata kvm_tdx->hkid = ret;
24397c035beaSZhiming Hu kvm_tdx->misc_cg = get_current_misc_cg();
24407c035beaSZhiming Hu ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
24417c035beaSZhiming Hu if (ret)
24427c035beaSZhiming Hu goto free_hkid;
24438d032b68SIsaku Yamahata
24448d032b68SIsaku Yamahata ret = -ENOMEM;
24458d032b68SIsaku Yamahata
24469934d7e5SIsaku Yamahata atomic_inc(&nr_configured_hkid);
24479934d7e5SIsaku Yamahata
24488d032b68SIsaku Yamahata tdr_page = alloc_page(GFP_KERNEL);
24498d032b68SIsaku Yamahata if (!tdr_page)
24508d032b68SIsaku Yamahata goto free_hkid;
24518d032b68SIsaku Yamahata
24528d032b68SIsaku Yamahata kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2453a50f673fSIsaku Yamahata /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2454a50f673fSIsaku Yamahata kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
24558d032b68SIsaku Yamahata tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
24568d032b68SIsaku Yamahata GFP_KERNEL | __GFP_ZERO);
24578d032b68SIsaku Yamahata if (!tdcs_pages)
24588d032b68SIsaku Yamahata goto free_tdr;
24598d032b68SIsaku Yamahata
24608d032b68SIsaku Yamahata for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
24618d032b68SIsaku Yamahata tdcs_pages[i] = alloc_page(GFP_KERNEL);
24628d032b68SIsaku Yamahata if (!tdcs_pages[i])
24638d032b68SIsaku Yamahata goto free_tdcs;
24648d032b68SIsaku Yamahata }
24658d032b68SIsaku Yamahata
24668d032b68SIsaku Yamahata if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
24678d032b68SIsaku Yamahata goto free_tdcs;
24688d032b68SIsaku Yamahata
24698d032b68SIsaku Yamahata cpus_read_lock();
24708d032b68SIsaku Yamahata
24718d032b68SIsaku Yamahata /*
24728d032b68SIsaku Yamahata	 * At least one CPU of each package must be online in order to
24738d032b68SIsaku Yamahata	 * program the host key id on all packages. Check it.
24748d032b68SIsaku Yamahata */
24758d032b68SIsaku Yamahata for_each_present_cpu(i)
24768d032b68SIsaku Yamahata cpumask_set_cpu(topology_physical_package_id(i), packages);
24778d032b68SIsaku Yamahata for_each_online_cpu(i)
24788d032b68SIsaku Yamahata cpumask_clear_cpu(topology_physical_package_id(i), packages);
24798d032b68SIsaku Yamahata if (!cpumask_empty(packages)) {
24808d032b68SIsaku Yamahata ret = -EIO;
24818d032b68SIsaku Yamahata /*
24828d032b68SIsaku Yamahata		 * Because it's hard for a human operator to figure out the
24838d032b68SIsaku Yamahata		 * reason, warn about it.
24848d032b68SIsaku Yamahata */
24858d032b68SIsaku Yamahata #define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
24868d032b68SIsaku Yamahata pr_warn_ratelimited(MSG_ALLPKG);
24878d032b68SIsaku Yamahata goto free_packages;
24888d032b68SIsaku Yamahata }
24898d032b68SIsaku Yamahata
24908d032b68SIsaku Yamahata /*
24918d032b68SIsaku Yamahata	 * TDH.MNG.CREATE tries to grab the global TDX module lock and fails
24928d032b68SIsaku Yamahata	 * with TDX_OPERAND_BUSY when it can't. Take the global lock to
24938d032b68SIsaku Yamahata	 * prevent it from failing.
24948d032b68SIsaku Yamahata */
24958d032b68SIsaku Yamahata mutex_lock(&tdx_lock);
24968d032b68SIsaku Yamahata kvm_tdx->td.tdr_page = tdr_page;
24978d032b68SIsaku Yamahata err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
24988d032b68SIsaku Yamahata mutex_unlock(&tdx_lock);
24998d032b68SIsaku Yamahata
25008d032b68SIsaku Yamahata if (err == TDX_RND_NO_ENTROPY) {
25018d032b68SIsaku Yamahata ret = -EAGAIN;
25028d032b68SIsaku Yamahata goto free_packages;
25038d032b68SIsaku Yamahata }
25048d032b68SIsaku Yamahata
25058d032b68SIsaku Yamahata if (WARN_ON_ONCE(err)) {
25068d032b68SIsaku Yamahata pr_tdx_error(TDH_MNG_CREATE, err);
25078d032b68SIsaku Yamahata ret = -EIO;
25088d032b68SIsaku Yamahata goto free_packages;
25098d032b68SIsaku Yamahata }
25108d032b68SIsaku Yamahata
25118d032b68SIsaku Yamahata for_each_online_cpu(i) {
25128d032b68SIsaku Yamahata int pkg = topology_physical_package_id(i);
25138d032b68SIsaku Yamahata
25148d032b68SIsaku Yamahata if (cpumask_test_and_set_cpu(pkg, packages))
25158d032b68SIsaku Yamahata continue;
25168d032b68SIsaku Yamahata
25178d032b68SIsaku Yamahata /*
25188d032b68SIsaku Yamahata * Program the memory controller in the package with an
25198d032b68SIsaku Yamahata		 * encryption key associated with the TDX private host key id
25208d032b68SIsaku Yamahata		 * assigned to this TDR. Concurrent operations on the same memory
25218d032b68SIsaku Yamahata		 * controller result in TDX_OPERAND_BUSY. No locking needed
25228d032b68SIsaku Yamahata * beyond the cpus_read_lock() above as it serializes against
25238d032b68SIsaku Yamahata * hotplug and the first online CPU of the package is always
25248d032b68SIsaku Yamahata * used. We never have two CPUs in the same socket trying to
25258d032b68SIsaku Yamahata * program the key.
25268d032b68SIsaku Yamahata */
25278d032b68SIsaku Yamahata ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
25288d032b68SIsaku Yamahata kvm_tdx, true);
25298d032b68SIsaku Yamahata if (ret)
25308d032b68SIsaku Yamahata break;
25318d032b68SIsaku Yamahata }
25328d032b68SIsaku Yamahata cpus_read_unlock();
25338d032b68SIsaku Yamahata free_cpumask_var(packages);
25348d032b68SIsaku Yamahata if (ret) {
25358d032b68SIsaku Yamahata i = 0;
25368d032b68SIsaku Yamahata goto teardown;
25378d032b68SIsaku Yamahata }
25388d032b68SIsaku Yamahata
25398d032b68SIsaku Yamahata kvm_tdx->td.tdcs_pages = tdcs_pages;
25408d032b68SIsaku Yamahata for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
25418d032b68SIsaku Yamahata err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
25428d032b68SIsaku Yamahata if (err == TDX_RND_NO_ENTROPY) {
25438d032b68SIsaku Yamahata /* Here it's hard to allow userspace to retry. */
25448d032b68SIsaku Yamahata ret = -EAGAIN;
25458d032b68SIsaku Yamahata goto teardown;
25468d032b68SIsaku Yamahata }
25478d032b68SIsaku Yamahata if (WARN_ON_ONCE(err)) {
25488d032b68SIsaku Yamahata pr_tdx_error(TDH_MNG_ADDCX, err);
25498d032b68SIsaku Yamahata ret = -EIO;
25508d032b68SIsaku Yamahata goto teardown;
25518d032b68SIsaku Yamahata }
25528d032b68SIsaku Yamahata }
25538d032b68SIsaku Yamahata
25540186dd29SIsaku Yamahata err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
25550186dd29SIsaku Yamahata if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
25568d032b68SIsaku Yamahata /*
25570186dd29SIsaku Yamahata		 * Because the user supplies the operands, don't warn.
25580186dd29SIsaku Yamahata		 * Return a hint to the user because it's sometimes hard for the
25590186dd29SIsaku Yamahata		 * user to figure out which operand is invalid. The SEAMCALL status
25600186dd29SIsaku Yamahata		 * code encodes which operand caused the invalid-operand error.
25618d032b68SIsaku Yamahata */
25620186dd29SIsaku Yamahata *seamcall_err = err;
25630186dd29SIsaku Yamahata ret = -EINVAL;
25640186dd29SIsaku Yamahata goto teardown;
25650186dd29SIsaku Yamahata } else if (WARN_ON_ONCE(err)) {
25660186dd29SIsaku Yamahata pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
25670186dd29SIsaku Yamahata ret = -EIO;
25680186dd29SIsaku Yamahata goto teardown;
25690186dd29SIsaku Yamahata }
25700186dd29SIsaku Yamahata
25718d032b68SIsaku Yamahata return 0;
25728d032b68SIsaku Yamahata
25738d032b68SIsaku Yamahata /*
25748d032b68SIsaku Yamahata * The sequence for freeing resources from a partially initialized TD
25758d032b68SIsaku Yamahata * varies based on where in the initialization flow failure occurred.
25768d032b68SIsaku Yamahata	 * Simply use the full teardown and destroy, which naturally plays nice
25778d032b68SIsaku Yamahata * with partial initialization.
25788d032b68SIsaku Yamahata */
25798d032b68SIsaku Yamahata teardown:
25808d032b68SIsaku Yamahata /* Only free pages not yet added, so start at 'i' */
25818d032b68SIsaku Yamahata for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
25828d032b68SIsaku Yamahata if (tdcs_pages[i]) {
25838d032b68SIsaku Yamahata __free_page(tdcs_pages[i]);
25848d032b68SIsaku Yamahata tdcs_pages[i] = NULL;
25858d032b68SIsaku Yamahata }
25868d032b68SIsaku Yamahata }
25878d032b68SIsaku Yamahata if (!kvm_tdx->td.tdcs_pages)
25888d032b68SIsaku Yamahata kfree(tdcs_pages);
25898d032b68SIsaku Yamahata
25908d032b68SIsaku Yamahata tdx_mmu_release_hkid(kvm);
25918d032b68SIsaku Yamahata tdx_reclaim_td_control_pages(kvm);
25928d032b68SIsaku Yamahata
25938d032b68SIsaku Yamahata return ret;
25948d032b68SIsaku Yamahata
25958d032b68SIsaku Yamahata free_packages:
25968d032b68SIsaku Yamahata cpus_read_unlock();
25978d032b68SIsaku Yamahata free_cpumask_var(packages);
25988d032b68SIsaku Yamahata
25998d032b68SIsaku Yamahata free_tdcs:
26008d032b68SIsaku Yamahata for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
26018d032b68SIsaku Yamahata if (tdcs_pages[i])
26028d032b68SIsaku Yamahata __free_page(tdcs_pages[i]);
26038d032b68SIsaku Yamahata }
26048d032b68SIsaku Yamahata kfree(tdcs_pages);
26058d032b68SIsaku Yamahata kvm_tdx->td.tdcs_pages = NULL;
26068d032b68SIsaku Yamahata
26078d032b68SIsaku Yamahata free_tdr:
26088d032b68SIsaku Yamahata if (tdr_page)
26098d032b68SIsaku Yamahata __free_page(tdr_page);
26108d032b68SIsaku Yamahata	kvm_tdx->td.tdr_page = NULL;
26118d032b68SIsaku Yamahata
26128d032b68SIsaku Yamahata free_hkid:
26138d032b68SIsaku Yamahata tdx_hkid_free(kvm_tdx);
26148d032b68SIsaku Yamahata
26158d032b68SIsaku Yamahata return ret;
26168d032b68SIsaku Yamahata }
26178d032b68SIsaku Yamahata
2618488808e6SXiaoyao Li static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2619488808e6SXiaoyao Li u64 *data)
2620488808e6SXiaoyao Li {
2621488808e6SXiaoyao Li u64 err;
2622488808e6SXiaoyao Li
2623488808e6SXiaoyao Li err = tdh_mng_rd(&tdx->td, field_id, data);
2624488808e6SXiaoyao Li
2625488808e6SXiaoyao Li return err;
2626488808e6SXiaoyao Li }
2627488808e6SXiaoyao Li
2628488808e6SXiaoyao Li #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2629488808e6SXiaoyao Li #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2630488808e6SXiaoyao Li
2631488808e6SXiaoyao Li static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2632488808e6SXiaoyao Li bool sub_leaf_set, int *entry_index,
2633488808e6SXiaoyao Li struct kvm_cpuid_entry2 *out)
2634488808e6SXiaoyao Li {
2635488808e6SXiaoyao Li struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2636488808e6SXiaoyao Li u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2637488808e6SXiaoyao Li u64 ebx_eax, edx_ecx;
2638488808e6SXiaoyao Li u64 err = 0;
2639488808e6SXiaoyao Li
2640488808e6SXiaoyao Li if (sub_leaf > 0b1111111)
2641488808e6SXiaoyao Li return -EINVAL;
2642488808e6SXiaoyao Li
2643488808e6SXiaoyao Li if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2644488808e6SXiaoyao Li return -EINVAL;
2645488808e6SXiaoyao Li
2646488808e6SXiaoyao Li if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2647488808e6SXiaoyao Li sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2648488808e6SXiaoyao Li return -EINVAL;
2649488808e6SXiaoyao Li
2650488808e6SXiaoyao Li /*
2651488808e6SXiaoyao Li	 * bit 23:17, RESERVED: reserved, must be 0;
2652488808e6SXiaoyao Li * bit 16, LEAF_31: leaf number bit 31;
2653488808e6SXiaoyao Li * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2654488808e6SXiaoyao Li * implicitly 0;
2655488808e6SXiaoyao Li * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2656488808e6SXiaoyao Li * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2657488808e6SXiaoyao Li	 *         SUBLEAF_6_0 is all-ones.
2658488808e6SXiaoyao Li	 *         Sub-leaf bits 31:7 are implicitly 0;
2659488808e6SXiaoyao Li * bit 0, ELEMENT_I: Element index within field;
2660488808e6SXiaoyao Li */
2661488808e6SXiaoyao Li field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2662488808e6SXiaoyao Li field_id |= (leaf & 0x7f) << 9;
2663488808e6SXiaoyao Li if (sub_leaf_set)
2664488808e6SXiaoyao Li field_id |= (sub_leaf & 0x7f) << 1;
2665488808e6SXiaoyao Li else
2666488808e6SXiaoyao Li field_id |= 0x1fe;
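	/*
	 * Worked example of the encoding above: leaf 0x80000008 read without
	 * a sub-leaf has bit 31 set (LEAF_31 => 1 << 16), leaf bits 6:0 =
	 * 0x08 (=> 0x08 << 9), and SUBLEAF_NA plus all-ones SUBLEAF_6_0
	 * (=> 0x1fe), i.e. field_id = TD_MD_FIELD_ID_CPUID_VALUES | 0x111fe.
	 */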
2667488808e6SXiaoyao Li
2668488808e6SXiaoyao Li err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2669488808e6SXiaoyao Li if (err) //TODO check for specific errors
2670488808e6SXiaoyao Li goto err_out;
2671488808e6SXiaoyao Li
2672488808e6SXiaoyao Li out->eax = (u32) ebx_eax;
2673488808e6SXiaoyao Li out->ebx = (u32) (ebx_eax >> 32);
2674488808e6SXiaoyao Li
2675488808e6SXiaoyao Li field_id++;
2676488808e6SXiaoyao Li err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2677488808e6SXiaoyao Li /*
2678488808e6SXiaoyao Li	 * It's weird if reading edx_ecx fails when reading ebx_eax
2679488808e6SXiaoyao Li	 * succeeded.
2680488808e6SXiaoyao Li */
2681488808e6SXiaoyao Li if (WARN_ON_ONCE(err))
2682488808e6SXiaoyao Li goto err_out;
2683488808e6SXiaoyao Li
2684488808e6SXiaoyao Li out->ecx = (u32) edx_ecx;
2685488808e6SXiaoyao Li out->edx = (u32) (edx_ecx >> 32);
2686488808e6SXiaoyao Li
2687488808e6SXiaoyao Li out->function = leaf;
2688488808e6SXiaoyao Li out->index = sub_leaf;
2689488808e6SXiaoyao Li out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2690488808e6SXiaoyao Li
2691488808e6SXiaoyao Li /*
2692488808e6SXiaoyao Li	 * To work around missing support on old TDX modules, fetch the
2693488808e6SXiaoyao Li	 * guest maxpa from gfn_direct_bits.
2694488808e6SXiaoyao Li */
2695488808e6SXiaoyao Li if (leaf == 0x80000008) {
2696488808e6SXiaoyao Li gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2697488808e6SXiaoyao Li unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2698488808e6SXiaoyao Li
2699488808e6SXiaoyao Li out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2700488808e6SXiaoyao Li }
2701488808e6SXiaoyao Li
2702488808e6SXiaoyao Li (*entry_index)++;
2703488808e6SXiaoyao Li
2704488808e6SXiaoyao Li return 0;
2705488808e6SXiaoyao Li
2706488808e6SXiaoyao Li err_out:
2707488808e6SXiaoyao Li out->eax = 0;
2708488808e6SXiaoyao Li out->ebx = 0;
2709488808e6SXiaoyao Li out->ecx = 0;
2710488808e6SXiaoyao Li out->edx = 0;
2711488808e6SXiaoyao Li
2712488808e6SXiaoyao Li return -EIO;
2713488808e6SXiaoyao Li }
2714488808e6SXiaoyao Li
27150186dd29SIsaku Yamahata static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
27160186dd29SIsaku Yamahata {
27170186dd29SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
27180186dd29SIsaku Yamahata struct kvm_tdx_init_vm *init_vm;
27190186dd29SIsaku Yamahata struct td_params *td_params = NULL;
27200186dd29SIsaku Yamahata int ret;
27210186dd29SIsaku Yamahata
27220186dd29SIsaku Yamahata BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
27230186dd29SIsaku Yamahata BUILD_BUG_ON(sizeof(struct td_params) != 1024);
27240186dd29SIsaku Yamahata
27250186dd29SIsaku Yamahata if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
27260186dd29SIsaku Yamahata return -EINVAL;
27270186dd29SIsaku Yamahata
27280186dd29SIsaku Yamahata if (cmd->flags)
27290186dd29SIsaku Yamahata return -EINVAL;
27300186dd29SIsaku Yamahata
27310186dd29SIsaku Yamahata init_vm = kmalloc(sizeof(*init_vm) +
27320186dd29SIsaku Yamahata sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
27330186dd29SIsaku Yamahata GFP_KERNEL);
27340186dd29SIsaku Yamahata if (!init_vm)
27350186dd29SIsaku Yamahata return -ENOMEM;
27360186dd29SIsaku Yamahata
27370186dd29SIsaku Yamahata if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
27380186dd29SIsaku Yamahata ret = -EFAULT;
27390186dd29SIsaku Yamahata goto out;
27400186dd29SIsaku Yamahata }
27410186dd29SIsaku Yamahata
27420186dd29SIsaku Yamahata if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
27430186dd29SIsaku Yamahata ret = -E2BIG;
27440186dd29SIsaku Yamahata goto out;
27450186dd29SIsaku Yamahata }
27460186dd29SIsaku Yamahata
27470186dd29SIsaku Yamahata if (copy_from_user(init_vm->cpuid.entries,
27480186dd29SIsaku Yamahata u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
27490186dd29SIsaku Yamahata flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
27500186dd29SIsaku Yamahata ret = -EFAULT;
27510186dd29SIsaku Yamahata goto out;
27520186dd29SIsaku Yamahata }
27530186dd29SIsaku Yamahata
27540186dd29SIsaku Yamahata if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
27550186dd29SIsaku Yamahata ret = -EINVAL;
27560186dd29SIsaku Yamahata goto out;
27570186dd29SIsaku Yamahata }
27580186dd29SIsaku Yamahata
27590186dd29SIsaku Yamahata if (init_vm->cpuid.padding) {
27600186dd29SIsaku Yamahata ret = -EINVAL;
27610186dd29SIsaku Yamahata goto out;
27620186dd29SIsaku Yamahata }
27630186dd29SIsaku Yamahata
27640186dd29SIsaku Yamahata td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
27650186dd29SIsaku Yamahata if (!td_params) {
27660186dd29SIsaku Yamahata ret = -ENOMEM;
27670186dd29SIsaku Yamahata goto out;
27680186dd29SIsaku Yamahata }
27690186dd29SIsaku Yamahata
27700186dd29SIsaku Yamahata ret = setup_tdparams(kvm, td_params, init_vm);
27710186dd29SIsaku Yamahata if (ret)
27720186dd29SIsaku Yamahata goto out;
27730186dd29SIsaku Yamahata
27740186dd29SIsaku Yamahata ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
27750186dd29SIsaku Yamahata if (ret)
27760186dd29SIsaku Yamahata goto out;
27770186dd29SIsaku Yamahata
27780186dd29SIsaku Yamahata kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
27790186dd29SIsaku Yamahata kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
27800186dd29SIsaku Yamahata kvm_tdx->attributes = td_params->attributes;
27810186dd29SIsaku Yamahata kvm_tdx->xfam = td_params->xfam;
27820186dd29SIsaku Yamahata
2783e0fbb3bbSIsaku Yamahata if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2784e0fbb3bbSIsaku Yamahata kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2785e0fbb3bbSIsaku Yamahata else
2786e0fbb3bbSIsaku Yamahata kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2787e0fbb3bbSIsaku Yamahata
27880186dd29SIsaku Yamahata kvm_tdx->state = TD_STATE_INITIALIZED;
27890186dd29SIsaku Yamahata out:
27900186dd29SIsaku Yamahata /* kfree() accepts NULL. */
27910186dd29SIsaku Yamahata kfree(init_vm);
27920186dd29SIsaku Yamahata kfree(td_params);
27930186dd29SIsaku Yamahata
27940186dd29SIsaku Yamahata return ret;
27950186dd29SIsaku Yamahata }
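/*
 * Illustrative userspace sketch (assumed flow, not kernel code):
 * KVM_TDX_INIT_VM takes a struct kvm_tdx_init_vm with the CPUID entries
 * appended immediately after the fixed-size header, matching the two
 * copy_from_user() calls above:
 *
 *	struct kvm_tdx_init_vm *init_vm = calloc(1, sizeof(*init_vm) +
 *			nent * sizeof(struct kvm_cpuid_entry2));
 *	init_vm->cpuid.nent = nent;
 *	... fill attributes, xfam, mrconfigid, mrowner, CPUID entries ...
 *	struct kvm_tdx_cmd cmd = {
 *		.id = KVM_TDX_INIT_VM,
 *		.data = (__u64)(unsigned long)init_vm,
 *	};
 *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 */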
27960186dd29SIsaku Yamahata
279822836e1dSIsaku Yamahata void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
279822836e1dSIsaku Yamahata {
279922836e1dSIsaku Yamahata /*
280022836e1dSIsaku Yamahata	 * flush_tlb_current() is invoked the first time the vCPU runs or when
280122836e1dSIsaku Yamahata	 * the root of the shared EPT is invalidated.
280222836e1dSIsaku Yamahata	 * KVM only needs to flush the shared EPT because the TDX module handles
280322836e1dSIsaku Yamahata	 * TLB invalidation for the private EPT in tdh_vp_enter().
280422836e1dSIsaku Yamahata *
280522836e1dSIsaku Yamahata * A single context invalidation for shared EPT can be performed here.
280622836e1dSIsaku Yamahata * However, this single context invalidation requires the private EPTP
280722836e1dSIsaku Yamahata * rather than the shared EPTP to flush shared EPT, as shared EPT uses
280822836e1dSIsaku Yamahata * private EPTP as its ASID for TLB invalidation.
280922836e1dSIsaku Yamahata *
281022836e1dSIsaku Yamahata * To avoid reading back private EPTP, perform a global invalidation for
281122836e1dSIsaku Yamahata * shared EPT instead to keep this function simple.
281222836e1dSIsaku Yamahata */
281322836e1dSIsaku Yamahata ept_sync_global();
281422836e1dSIsaku Yamahata }
281522836e1dSIsaku Yamahata
281622836e1dSIsaku Yamahata void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
281722836e1dSIsaku Yamahata {
281822836e1dSIsaku Yamahata /*
281922836e1dSIsaku Yamahata * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
282022836e1dSIsaku Yamahata * ensure that private EPT will be flushed on the next TD enter. No need
282122836e1dSIsaku Yamahata * to call tdx_track() here again even when this callback is a result of
282222836e1dSIsaku Yamahata * zapping private EPT.
282322836e1dSIsaku Yamahata *
282422836e1dSIsaku Yamahata	 * Due to the lack of context to determine which EPT has been
282522836e1dSIsaku Yamahata * affected by zapping, invoke invept() directly here for both shared
282622836e1dSIsaku Yamahata * EPT and private EPT for simplicity, though it's not necessary for
282722836e1dSIsaku Yamahata * private EPT.
282822836e1dSIsaku Yamahata */
282922836e1dSIsaku Yamahata ept_sync_global();
283022836e1dSIsaku Yamahata }
283122836e1dSIsaku Yamahata
2832012426d6SIsaku Yamahata static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2833012426d6SIsaku Yamahata {
2834012426d6SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2835012426d6SIsaku Yamahata
2836012426d6SIsaku Yamahata guard(mutex)(&kvm->slots_lock);
2837012426d6SIsaku Yamahata
2838012426d6SIsaku Yamahata if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2839012426d6SIsaku Yamahata return -EINVAL;
2840012426d6SIsaku Yamahata /*
2841012426d6SIsaku Yamahata * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2842012426d6SIsaku Yamahata * TDH.MEM.PAGE.ADD().
2843012426d6SIsaku Yamahata */
2844012426d6SIsaku Yamahata if (atomic64_read(&kvm_tdx->nr_premapped))
2845012426d6SIsaku Yamahata return -EINVAL;
2846012426d6SIsaku Yamahata
2847012426d6SIsaku Yamahata cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2848012426d6SIsaku Yamahata if (tdx_operand_busy(cmd->hw_error))
2849012426d6SIsaku Yamahata return -EBUSY;
2850012426d6SIsaku Yamahata if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2851012426d6SIsaku Yamahata pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2852012426d6SIsaku Yamahata return -EIO;
2853012426d6SIsaku Yamahata }
2854012426d6SIsaku Yamahata
2855012426d6SIsaku Yamahata kvm_tdx->state = TD_STATE_RUNNABLE;
2856012426d6SIsaku Yamahata /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2857012426d6SIsaku Yamahata smp_wmb();
2858012426d6SIsaku Yamahata kvm->arch.pre_fault_allowed = true;
2859012426d6SIsaku Yamahata return 0;
2860012426d6SIsaku Yamahata }
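/*
 * Illustrative lifecycle (assumed userspace ordering, not enforced here
 * beyond the checks above): KVM_TDX_INIT_VM, then KVM_TDX_INIT_VCPU for
 * each vCPU, then KVM_TDX_INIT_MEM_REGION to TDH.MEM.PAGE.ADD the initial
 * image, and finally KVM_TDX_FINALIZE_VM. Finalizing while premapped
 * pages are still pending fails with -EINVAL, and only after
 * TDH.MR.FINALIZE succeeds does the TD become runnable and pre-faulting
 * get allowed.
 */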
2861012426d6SIsaku Yamahata
2862b2aaf38cSIsaku Yamahata int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2863b2aaf38cSIsaku Yamahata {
2864b2aaf38cSIsaku Yamahata struct kvm_tdx_cmd tdx_cmd;
2865b2aaf38cSIsaku Yamahata int r;
2866b2aaf38cSIsaku Yamahata
2867b2aaf38cSIsaku Yamahata if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2868b2aaf38cSIsaku Yamahata return -EFAULT;
2869b2aaf38cSIsaku Yamahata
2870b2aaf38cSIsaku Yamahata /*
2871b2aaf38cSIsaku Yamahata	 * Userspace should never set hw_error. It is used by the kernel to
2872b2aaf38cSIsaku Yamahata	 * return a hardware-defined error.
2873b2aaf38cSIsaku Yamahata */
2874b2aaf38cSIsaku Yamahata if (tdx_cmd.hw_error)
2875b2aaf38cSIsaku Yamahata return -EINVAL;
2876b2aaf38cSIsaku Yamahata
2877b2aaf38cSIsaku Yamahata mutex_lock(&kvm->lock);
2878b2aaf38cSIsaku Yamahata
2879b2aaf38cSIsaku Yamahata switch (tdx_cmd.id) {
288061bb2827SIsaku Yamahata case KVM_TDX_CAPABILITIES:
288161bb2827SIsaku Yamahata r = tdx_get_capabilities(&tdx_cmd);
288261bb2827SIsaku Yamahata break;
28830186dd29SIsaku Yamahata case KVM_TDX_INIT_VM:
28840186dd29SIsaku Yamahata r = tdx_td_init(kvm, &tdx_cmd);
28850186dd29SIsaku Yamahata break;
2886012426d6SIsaku Yamahata case KVM_TDX_FINALIZE_VM:
2887012426d6SIsaku Yamahata r = tdx_td_finalize(kvm, &tdx_cmd);
2888012426d6SIsaku Yamahata break;
2889b2aaf38cSIsaku Yamahata default:
2890b2aaf38cSIsaku Yamahata r = -EINVAL;
2891b2aaf38cSIsaku Yamahata goto out;
2892b2aaf38cSIsaku Yamahata }
2893b2aaf38cSIsaku Yamahata
2894b2aaf38cSIsaku Yamahata if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2895b2aaf38cSIsaku Yamahata r = -EFAULT;
2896b2aaf38cSIsaku Yamahata
2897b2aaf38cSIsaku Yamahata out:
2898b2aaf38cSIsaku Yamahata mutex_unlock(&kvm->lock);
2899b2aaf38cSIsaku Yamahata return r;
2900b2aaf38cSIsaku Yamahata }
2901b2aaf38cSIsaku Yamahata
2902a50f673fSIsaku Yamahata /* The VMM can pass one 64-bit auxiliary value to the vcpu via RCX for the guest BIOS. */
2903a50f673fSIsaku Yamahata static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2904a50f673fSIsaku Yamahata {
2905a50f673fSIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2906a50f673fSIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
2907a50f673fSIsaku Yamahata struct page *page;
2908a50f673fSIsaku Yamahata int ret, i;
2909a50f673fSIsaku Yamahata u64 err;
2910a50f673fSIsaku Yamahata
2911a50f673fSIsaku Yamahata page = alloc_page(GFP_KERNEL);
2912a50f673fSIsaku Yamahata if (!page)
2913a50f673fSIsaku Yamahata return -ENOMEM;
2914a50f673fSIsaku Yamahata tdx->vp.tdvpr_page = page;
2915a50f673fSIsaku Yamahata
2916a50f673fSIsaku Yamahata tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2917a50f673fSIsaku Yamahata GFP_KERNEL);
2918a50f673fSIsaku Yamahata if (!tdx->vp.tdcx_pages) {
2919a50f673fSIsaku Yamahata ret = -ENOMEM;
2920a50f673fSIsaku Yamahata goto free_tdvpr;
2921a50f673fSIsaku Yamahata }
2922a50f673fSIsaku Yamahata
2923a50f673fSIsaku Yamahata for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2924a50f673fSIsaku Yamahata page = alloc_page(GFP_KERNEL);
2925a50f673fSIsaku Yamahata if (!page) {
2926a50f673fSIsaku Yamahata ret = -ENOMEM;
2927a50f673fSIsaku Yamahata goto free_tdcx;
2928a50f673fSIsaku Yamahata }
2929a50f673fSIsaku Yamahata tdx->vp.tdcx_pages[i] = page;
2930a50f673fSIsaku Yamahata }
2931a50f673fSIsaku Yamahata
2932a50f673fSIsaku Yamahata err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2933a50f673fSIsaku Yamahata if (KVM_BUG_ON(err, vcpu->kvm)) {
2934a50f673fSIsaku Yamahata ret = -EIO;
2935a50f673fSIsaku Yamahata pr_tdx_error(TDH_VP_CREATE, err);
2936a50f673fSIsaku Yamahata goto free_tdcx;
2937a50f673fSIsaku Yamahata }
2938a50f673fSIsaku Yamahata
2939a50f673fSIsaku Yamahata for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2940a50f673fSIsaku Yamahata err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2941a50f673fSIsaku Yamahata if (KVM_BUG_ON(err, vcpu->kvm)) {
2942a50f673fSIsaku Yamahata pr_tdx_error(TDH_VP_ADDCX, err);
2943a50f673fSIsaku Yamahata /*
2944a50f673fSIsaku Yamahata * Pages already added are reclaimed by the vcpu_free
2945a50f673fSIsaku Yamahata * method, but the rest are freed here.
2946a50f673fSIsaku Yamahata */
2947a50f673fSIsaku Yamahata for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2948a50f673fSIsaku Yamahata __free_page(tdx->vp.tdcx_pages[i]);
2949a50f673fSIsaku Yamahata tdx->vp.tdcx_pages[i] = NULL;
2950a50f673fSIsaku Yamahata }
2951a50f673fSIsaku Yamahata return -EIO;
2952a50f673fSIsaku Yamahata }
2953a50f673fSIsaku Yamahata }
2954a50f673fSIsaku Yamahata
2955a50f673fSIsaku Yamahata err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2956a50f673fSIsaku Yamahata if (KVM_BUG_ON(err, vcpu->kvm)) {
2957a50f673fSIsaku Yamahata pr_tdx_error(TDH_VP_INIT, err);
2958a50f673fSIsaku Yamahata return -EIO;
2959a50f673fSIsaku Yamahata }
2960a50f673fSIsaku Yamahata
2961a50f673fSIsaku Yamahata vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2962a50f673fSIsaku Yamahata
2963a50f673fSIsaku Yamahata return 0;
2964a50f673fSIsaku Yamahata
2965a50f673fSIsaku Yamahata free_tdcx:
2966a50f673fSIsaku Yamahata for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2967a50f673fSIsaku Yamahata if (tdx->vp.tdcx_pages[i])
2968a50f673fSIsaku Yamahata __free_page(tdx->vp.tdcx_pages[i]);
2969a50f673fSIsaku Yamahata tdx->vp.tdcx_pages[i] = NULL;
2970a50f673fSIsaku Yamahata }
2971a50f673fSIsaku Yamahata kfree(tdx->vp.tdcx_pages);
2972a50f673fSIsaku Yamahata tdx->vp.tdcx_pages = NULL;
2973a50f673fSIsaku Yamahata
2974a50f673fSIsaku Yamahata free_tdvpr:
2975a50f673fSIsaku Yamahata if (tdx->vp.tdvpr_page)
2976a50f673fSIsaku Yamahata __free_page(tdx->vp.tdvpr_page);
2977a50f673fSIsaku Yamahata	tdx->vp.tdvpr_page = NULL;
2978a50f673fSIsaku Yamahata
2979a50f673fSIsaku Yamahata return ret;
2980a50f673fSIsaku Yamahata }
2981a50f673fSIsaku Yamahata
2982488808e6SXiaoyao Li /* Sometimes reads multiple subleafs. Returns how many entries were written. */
2983488808e6SXiaoyao Li static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2984488808e6SXiaoyao Li struct kvm_cpuid_entry2 *output_e)
2985488808e6SXiaoyao Li {
2986488808e6SXiaoyao Li int sub_leaf = 0;
2987488808e6SXiaoyao Li int ret;
2988488808e6SXiaoyao Li
2989488808e6SXiaoyao Li /* First try without a subleaf */
2990488808e6SXiaoyao Li ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2991488808e6SXiaoyao Li
2992488808e6SXiaoyao Li	/* On success, or for an invalid leaf, just give up */
2993488808e6SXiaoyao Li if (ret != -EIO)
2994488808e6SXiaoyao Li return ret;
2995488808e6SXiaoyao Li
2996488808e6SXiaoyao Li /*
2997488808e6SXiaoyao Li * If the try without a subleaf failed, try reading subleafs until
2998488808e6SXiaoyao Li	 * failure. The TDX module only supports 7 bits of subleaf index.
2999488808e6SXiaoyao Li */
3000488808e6SXiaoyao Li while (1) {
3001488808e6SXiaoyao Li /* Keep reading subleafs until there is a failure. */
3002488808e6SXiaoyao Li if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3003488808e6SXiaoyao Li return !sub_leaf;
3004488808e6SXiaoyao Li
3005488808e6SXiaoyao Li sub_leaf++;
3006488808e6SXiaoyao Li output_e++;
3007488808e6SXiaoyao Li }
3008488808e6SXiaoyao Li
3009488808e6SXiaoyao Li return 0;
3010488808e6SXiaoyao Li }
3011488808e6SXiaoyao Li
3012488808e6SXiaoyao Li static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3013488808e6SXiaoyao Li {
3014488808e6SXiaoyao Li struct kvm_cpuid2 __user *output, *td_cpuid;
3015488808e6SXiaoyao Li int r = 0, i = 0, leaf;
3016488808e6SXiaoyao Li u32 level;
3017488808e6SXiaoyao Li
3018488808e6SXiaoyao Li output = u64_to_user_ptr(cmd->data);
3019488808e6SXiaoyao Li td_cpuid = kzalloc(sizeof(*td_cpuid) +
3020488808e6SXiaoyao Li sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3021488808e6SXiaoyao Li GFP_KERNEL);
3022488808e6SXiaoyao Li if (!td_cpuid)
3023488808e6SXiaoyao Li return -ENOMEM;
3024488808e6SXiaoyao Li
3025488808e6SXiaoyao Li if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3026488808e6SXiaoyao Li r = -EFAULT;
3027488808e6SXiaoyao Li goto out;
3028488808e6SXiaoyao Li }
3029488808e6SXiaoyao Li
3030488808e6SXiaoyao Li /* Read max CPUID for normal range */
3031488808e6SXiaoyao Li if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3032488808e6SXiaoyao Li r = -EIO;
3033488808e6SXiaoyao Li goto out;
3034488808e6SXiaoyao Li }
3035488808e6SXiaoyao Li level = td_cpuid->entries[0].eax;
3036488808e6SXiaoyao Li
3037488808e6SXiaoyao Li for (leaf = 1; leaf <= level; leaf++)
3038488808e6SXiaoyao Li tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3039488808e6SXiaoyao Li
3040488808e6SXiaoyao Li /* Read max CPUID for extended range */
3041488808e6SXiaoyao Li if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3042488808e6SXiaoyao Li r = -EIO;
3043488808e6SXiaoyao Li goto out;
3044488808e6SXiaoyao Li }
3045488808e6SXiaoyao Li level = td_cpuid->entries[i - 1].eax;
3046488808e6SXiaoyao Li
3047488808e6SXiaoyao Li for (leaf = 0x80000001; leaf <= level; leaf++)
3048488808e6SXiaoyao Li tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3049488808e6SXiaoyao Li
3050488808e6SXiaoyao Li if (td_cpuid->nent < i)
3051488808e6SXiaoyao Li r = -E2BIG;
3052488808e6SXiaoyao Li td_cpuid->nent = i;
3053488808e6SXiaoyao Li
3054488808e6SXiaoyao Li if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3055488808e6SXiaoyao Li r = -EFAULT;
3056488808e6SXiaoyao Li goto out;
3057488808e6SXiaoyao Li }
3058488808e6SXiaoyao Li
3059488808e6SXiaoyao Li if (r == -E2BIG)
3060488808e6SXiaoyao Li goto out;
3061488808e6SXiaoyao Li
3062488808e6SXiaoyao Li if (copy_to_user(output->entries, td_cpuid->entries,
3063488808e6SXiaoyao Li td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3064488808e6SXiaoyao Li r = -EFAULT;
3065488808e6SXiaoyao Li
3066488808e6SXiaoyao Li out:
3067488808e6SXiaoyao Li kfree(td_cpuid);
3068488808e6SXiaoyao Li
3069488808e6SXiaoyao Li return r;
3070488808e6SXiaoyao Li }
3071488808e6SXiaoyao Li
3072a50f673fSIsaku Yamahata static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3073a50f673fSIsaku Yamahata {
3074a50f673fSIsaku Yamahata u64 apic_base;
3075a50f673fSIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
3076a50f673fSIsaku Yamahata int ret;
3077a50f673fSIsaku Yamahata
3078a50f673fSIsaku Yamahata if (cmd->flags)
3079a50f673fSIsaku Yamahata return -EINVAL;
3080a50f673fSIsaku Yamahata
3081a50f673fSIsaku Yamahata if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3082a50f673fSIsaku Yamahata return -EINVAL;
3083a50f673fSIsaku Yamahata
3084a50f673fSIsaku Yamahata /*
3085a50f673fSIsaku Yamahata	 * TDX requires X2APIC; userspace is responsible for configuring guest
3086a50f673fSIsaku Yamahata * CPUID accordingly.
3087a50f673fSIsaku Yamahata */
3088a50f673fSIsaku Yamahata apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3089a50f673fSIsaku Yamahata (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3090a50f673fSIsaku Yamahata if (kvm_apic_set_base(vcpu, apic_base, true))
3091a50f673fSIsaku Yamahata return -EINVAL;
3092a50f673fSIsaku Yamahata
3093a50f673fSIsaku Yamahata ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3094a50f673fSIsaku Yamahata if (ret)
3095a50f673fSIsaku Yamahata return ret;
3096a50f673fSIsaku Yamahata
309724c12911SIsaku Yamahata td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
309824c12911SIsaku Yamahata td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
309924c12911SIsaku Yamahata td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
310024c12911SIsaku Yamahata
3101a50f673fSIsaku Yamahata tdx->state = VCPU_TD_STATE_INITIALIZED;
3102a50f673fSIsaku Yamahata
3103a50f673fSIsaku Yamahata return 0;
3104a50f673fSIsaku Yamahata }
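/*
 * Illustrative userspace sketch (assumed flow, not kernel code): per the
 * comment above tdx_td_vcpu_init(), cmd->data is forwarded to TDH.VP.INIT
 * and lands in the guest's initial RCX, e.g. a GPA of boot parameters for
 * the TD virtual BIOS ("hob_gpa" below is a hypothetical name):
 *
 *	struct kvm_tdx_cmd cmd = {
 *		.id = KVM_TDX_INIT_VCPU,
 *		.data = hob_gpa,
 *	};
 *	ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 */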
3105a50f673fSIsaku Yamahata
31064cdf243eSIsaku Yamahata void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
31074cdf243eSIsaku Yamahata {
31084cdf243eSIsaku Yamahata /*
31094cdf243eSIsaku Yamahata * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
31104cdf243eSIsaku Yamahata * INIT events.
31114cdf243eSIsaku Yamahata *
31124cdf243eSIsaku Yamahata * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
31134cdf243eSIsaku Yamahata * userspace needs to define the vCPU model before KVM can initialize
31144cdf243eSIsaku Yamahata * vCPU state, e.g. to enable x2APIC.
31154cdf243eSIsaku Yamahata */
31164cdf243eSIsaku Yamahata WARN_ON_ONCE(init_event);
31174cdf243eSIsaku Yamahata }
31184cdf243eSIsaku Yamahata
3119c846b451SIsaku Yamahata struct tdx_gmem_post_populate_arg {
3120c846b451SIsaku Yamahata struct kvm_vcpu *vcpu;
3121c846b451SIsaku Yamahata __u32 flags;
3122c846b451SIsaku Yamahata };
3123c846b451SIsaku Yamahata
3124c846b451SIsaku Yamahata static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3125c846b451SIsaku Yamahata void __user *src, int order, void *_arg)
3126c846b451SIsaku Yamahata {
3127c846b451SIsaku Yamahata u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3128c846b451SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3129c846b451SIsaku Yamahata struct tdx_gmem_post_populate_arg *arg = _arg;
3130c846b451SIsaku Yamahata struct kvm_vcpu *vcpu = arg->vcpu;
3131c846b451SIsaku Yamahata gpa_t gpa = gfn_to_gpa(gfn);
3132c846b451SIsaku Yamahata u8 level = PG_LEVEL_4K;
3133c846b451SIsaku Yamahata struct page *src_page;
3134c846b451SIsaku Yamahata int ret, i;
3135c846b451SIsaku Yamahata u64 err, entry, level_state;
3136c846b451SIsaku Yamahata
3137c846b451SIsaku Yamahata /*
3138c846b451SIsaku Yamahata * Get the source page if it has been faulted in. Return failure if the
3139c846b451SIsaku Yamahata * source page has been swapped out or unmapped in primary memory.
3140c846b451SIsaku Yamahata */
3141c846b451SIsaku Yamahata ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3142c846b451SIsaku Yamahata if (ret < 0)
3143c846b451SIsaku Yamahata return ret;
3144c846b451SIsaku Yamahata if (ret != 1)
3145c846b451SIsaku Yamahata return -ENOMEM;
3146c846b451SIsaku Yamahata
3147c846b451SIsaku Yamahata ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3148c846b451SIsaku Yamahata if (ret < 0)
3149c846b451SIsaku Yamahata goto out;
3150c846b451SIsaku Yamahata
3151c846b451SIsaku Yamahata /*
3152c846b451SIsaku Yamahata 	 * The private memory cannot be zapped after kvm_tdp_map_page()
3153c846b451SIsaku Yamahata 	 * because all paths are covered by slots_lock and the filemap
3154c846b451SIsaku Yamahata 	 * invalidate lock.  Check that they are indeed sufficient.
3155c846b451SIsaku Yamahata */
3156c846b451SIsaku Yamahata if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3157c846b451SIsaku Yamahata scoped_guard(read_lock, &kvm->mmu_lock) {
3158c846b451SIsaku Yamahata if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3159c846b451SIsaku Yamahata ret = -EIO;
3160c846b451SIsaku Yamahata goto out;
3161c846b451SIsaku Yamahata }
3162c846b451SIsaku Yamahata }
3163c846b451SIsaku Yamahata }
3164c846b451SIsaku Yamahata
3165c846b451SIsaku Yamahata ret = 0;
3166c846b451SIsaku Yamahata err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3167c846b451SIsaku Yamahata src_page, &entry, &level_state);
3168c846b451SIsaku Yamahata if (err) {
3169c846b451SIsaku Yamahata ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3170c846b451SIsaku Yamahata goto out;
3171c846b451SIsaku Yamahata }
3172c846b451SIsaku Yamahata
3173012426d6SIsaku Yamahata if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3174012426d6SIsaku Yamahata atomic64_dec(&kvm_tdx->nr_premapped);
3175012426d6SIsaku Yamahata
3176c846b451SIsaku Yamahata if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
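		/*
		 * Extend the TD measurement with the just-added page.
		 * TDH.MR.EXTEND hashes the page contents into the TD's
		 * measurement register in TDX_EXTENDMR_CHUNKSIZE-sized
		 * chunks.
		 */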
3177c846b451SIsaku Yamahata for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3178c846b451SIsaku Yamahata err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3179c846b451SIsaku Yamahata &level_state);
3180c846b451SIsaku Yamahata if (err) {
3181c846b451SIsaku Yamahata ret = -EIO;
3182c846b451SIsaku Yamahata break;
3183c846b451SIsaku Yamahata }
3184c846b451SIsaku Yamahata }
3185c846b451SIsaku Yamahata }
3186c846b451SIsaku Yamahata
3187c846b451SIsaku Yamahata out:
3188c846b451SIsaku Yamahata put_page(src_page);
3189c846b451SIsaku Yamahata return ret;
3190c846b451SIsaku Yamahata }
3191c846b451SIsaku Yamahata
3192c846b451SIsaku Yamahata static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3193c846b451SIsaku Yamahata {
3194c846b451SIsaku Yamahata struct vcpu_tdx *tdx = to_tdx(vcpu);
3195c846b451SIsaku Yamahata struct kvm *kvm = vcpu->kvm;
3196c846b451SIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3197c846b451SIsaku Yamahata struct kvm_tdx_init_mem_region region;
3198c846b451SIsaku Yamahata struct tdx_gmem_post_populate_arg arg;
3199c846b451SIsaku Yamahata long gmem_ret;
3200c846b451SIsaku Yamahata int ret;
3201c846b451SIsaku Yamahata
3202c846b451SIsaku Yamahata if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3203c846b451SIsaku Yamahata return -EINVAL;
3204c846b451SIsaku Yamahata
3205c846b451SIsaku Yamahata guard(mutex)(&kvm->slots_lock);
3206c846b451SIsaku Yamahata
3207c846b451SIsaku Yamahata 	/* Once the TD is finalized, the initial guest memory is fixed. */
3208c846b451SIsaku Yamahata if (kvm_tdx->state == TD_STATE_RUNNABLE)
3209c846b451SIsaku Yamahata return -EINVAL;
3210c846b451SIsaku Yamahata
3211c846b451SIsaku Yamahata if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3212c846b451SIsaku Yamahata return -EINVAL;
3213c846b451SIsaku Yamahata
3214c846b451SIsaku Yamahata 	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3215c846b451SIsaku Yamahata return -EFAULT;
3216c846b451SIsaku Yamahata
3217c846b451SIsaku Yamahata if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3218c846b451SIsaku Yamahata !region.nr_pages ||
3219c846b451SIsaku Yamahata region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3220c846b451SIsaku Yamahata !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3221c846b451SIsaku Yamahata !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3222c846b451SIsaku Yamahata return -EINVAL;
3223c846b451SIsaku Yamahata
3224c846b451SIsaku Yamahata kvm_mmu_reload(vcpu);
3225c846b451SIsaku Yamahata ret = 0;
3226c846b451SIsaku Yamahata while (region.nr_pages) {
3227c846b451SIsaku Yamahata if (signal_pending(current)) {
3228c846b451SIsaku Yamahata ret = -EINTR;
3229c846b451SIsaku Yamahata break;
3230c846b451SIsaku Yamahata }
3231c846b451SIsaku Yamahata
3232c846b451SIsaku Yamahata arg = (struct tdx_gmem_post_populate_arg) {
3233c846b451SIsaku Yamahata .vcpu = vcpu,
3234c846b451SIsaku Yamahata .flags = cmd->flags,
3235c846b451SIsaku Yamahata };
3236c846b451SIsaku Yamahata gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3237c846b451SIsaku Yamahata u64_to_user_ptr(region.source_addr),
3238c846b451SIsaku Yamahata 1, tdx_gmem_post_populate, &arg);
3239c846b451SIsaku Yamahata if (gmem_ret < 0) {
3240c846b451SIsaku Yamahata ret = gmem_ret;
3241c846b451SIsaku Yamahata break;
3242c846b451SIsaku Yamahata }
3243c846b451SIsaku Yamahata
3244c846b451SIsaku Yamahata if (gmem_ret != 1) {
3245c846b451SIsaku Yamahata ret = -EIO;
3246c846b451SIsaku Yamahata break;
3247c846b451SIsaku Yamahata }
3248c846b451SIsaku Yamahata
3249c846b451SIsaku Yamahata region.source_addr += PAGE_SIZE;
3250c846b451SIsaku Yamahata region.gpa += PAGE_SIZE;
3251c846b451SIsaku Yamahata region.nr_pages--;
3252c846b451SIsaku Yamahata
3253c846b451SIsaku Yamahata cond_resched();
3254c846b451SIsaku Yamahata }
3255c846b451SIsaku Yamahata
3256c846b451SIsaku Yamahata 	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3257c846b451SIsaku Yamahata ret = -EFAULT;
3258c846b451SIsaku Yamahata return ret;
3259c846b451SIsaku Yamahata }
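/*
 * Illustrative userspace sketch (not part of KVM): populating initial TD
 * memory with KVM_TDX_INIT_MEM_REGION.  Because the handler above copies
 * the updated region back to userspace, a caller interrupted by a signal
 * (-EINTR) can reissue the ioctl to resume where it left off; hva, gpa and
 * npages are assumptions for brevity.
 *
 *	struct kvm_tdx_init_mem_region region = {
 *		.source_addr = (__u64)(unsigned long)hva,	// page-aligned
 *		.gpa	     = gpa,				// private GPA
 *		.nr_pages    = npages,
 *	};
 *	struct kvm_tdx_cmd cmd = {
 *		.id    = KVM_TDX_INIT_MEM_REGION,
 *		.flags = KVM_TDX_MEASURE_MEMORY_REGION,
 *		.data  = (__u64)(unsigned long)&region,
 *	};
 *
 *	while (ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0) {
 *		if (errno != EINTR)
 *			err(1, "KVM_TDX_INIT_MEM_REGION");
 *	}
 */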
3260c846b451SIsaku Yamahata
3261a50f673fSIsaku Yamahata int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3262a50f673fSIsaku Yamahata {
3263a50f673fSIsaku Yamahata struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3264a50f673fSIsaku Yamahata struct kvm_tdx_cmd cmd;
3265a50f673fSIsaku Yamahata int ret;
3266a50f673fSIsaku Yamahata
3267a50f673fSIsaku Yamahata if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3268a50f673fSIsaku Yamahata return -EINVAL;
3269a50f673fSIsaku Yamahata
3270a50f673fSIsaku Yamahata if (copy_from_user(&cmd, argp, sizeof(cmd)))
3271a50f673fSIsaku Yamahata return -EFAULT;
3272a50f673fSIsaku Yamahata
3273a50f673fSIsaku Yamahata if (cmd.hw_error)
3274a50f673fSIsaku Yamahata return -EINVAL;
3275a50f673fSIsaku Yamahata
3276a50f673fSIsaku Yamahata switch (cmd.id) {
3277a50f673fSIsaku Yamahata case KVM_TDX_INIT_VCPU:
3278a50f673fSIsaku Yamahata ret = tdx_vcpu_init(vcpu, &cmd);
3279a50f673fSIsaku Yamahata break;
3280c846b451SIsaku Yamahata case KVM_TDX_INIT_MEM_REGION:
3281c846b451SIsaku Yamahata ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3282c846b451SIsaku Yamahata break;
3283488808e6SXiaoyao Li case KVM_TDX_GET_CPUID:
3284488808e6SXiaoyao Li ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3285488808e6SXiaoyao Li break;
3286a50f673fSIsaku Yamahata default:
3287a50f673fSIsaku Yamahata ret = -EINVAL;
3288a50f673fSIsaku Yamahata break;
3289a50f673fSIsaku Yamahata }
3290a50f673fSIsaku Yamahata
3291a50f673fSIsaku Yamahata return ret;
3292a50f673fSIsaku Yamahata }
3293a50f673fSIsaku Yamahata
32940036b87aSIsaku Yamahata int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
32950036b87aSIsaku Yamahata {
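	/* TDX private mappings are currently limited to 4K pages. */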
32960036b87aSIsaku Yamahata return PG_LEVEL_4K;
32970036b87aSIsaku Yamahata }
32980036b87aSIsaku Yamahata
3299fcdbdf63SKai Huang static int tdx_online_cpu(unsigned int cpu)
3300fcdbdf63SKai Huang {
3301fcdbdf63SKai Huang unsigned long flags;
3302fcdbdf63SKai Huang int r;
3303fcdbdf63SKai Huang
3304fcdbdf63SKai Huang 	/* Sanity check that the CPU is already in post-VMXON */
3305fcdbdf63SKai Huang WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3306fcdbdf63SKai Huang
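	/* tdx_cpu_enable() must be called with IRQs disabled. */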
3307fcdbdf63SKai Huang local_irq_save(flags);
3308fcdbdf63SKai Huang r = tdx_cpu_enable();
3309fcdbdf63SKai Huang local_irq_restore(flags);
3310fcdbdf63SKai Huang
3311fcdbdf63SKai Huang return r;
3312fcdbdf63SKai Huang }
3313fcdbdf63SKai Huang
33149934d7e5SIsaku Yamahata static int tdx_offline_cpu(unsigned int cpu)
33159934d7e5SIsaku Yamahata {
33169934d7e5SIsaku Yamahata int i;
33179934d7e5SIsaku Yamahata
33189934d7e5SIsaku Yamahata 	/* No TD is running.  Allow any CPU to be offlined. */
33199934d7e5SIsaku Yamahata if (!atomic_read(&nr_configured_hkid))
33209934d7e5SIsaku Yamahata return 0;
33219934d7e5SIsaku Yamahata
33229934d7e5SIsaku Yamahata /*
33239934d7e5SIsaku Yamahata 	 * Reclaiming a TDX HKID (i.e. when deleting a guest TD) requires
33249934d7e5SIsaku Yamahata 	 * calling TDH.PHYMEM.PAGE.WBINVD on all packages to program all
33259934d7e5SIsaku Yamahata 	 * memory controllers with PCONFIG.  If there are active TDX HKIDs,
33269934d7e5SIsaku Yamahata 	 * refuse to offline the last online CPU of a package.
33279934d7e5SIsaku Yamahata */
33289934d7e5SIsaku Yamahata for_each_online_cpu(i) {
33299934d7e5SIsaku Yamahata /*
33309934d7e5SIsaku Yamahata 		 * Found another online CPU on the same package.
33319934d7e5SIsaku Yamahata 		 * Allow the offline.
33329934d7e5SIsaku Yamahata */
33339934d7e5SIsaku Yamahata if (i != cpu && topology_physical_package_id(i) ==
33349934d7e5SIsaku Yamahata topology_physical_package_id(cpu))
33359934d7e5SIsaku Yamahata return 0;
33369934d7e5SIsaku Yamahata }
33379934d7e5SIsaku Yamahata
33389934d7e5SIsaku Yamahata /*
33399934d7e5SIsaku Yamahata 	 * This is the last online CPU of this package.  Don't offline it.
33409934d7e5SIsaku Yamahata 	 *
33419934d7e5SIsaku Yamahata 	 * Warn, as it's hard for a human operator to understand why
33429934d7e5SIsaku Yamahata 	 * offlining the CPU was refused.
33439934d7e5SIsaku Yamahata */
33449934d7e5SIsaku Yamahata #define MSG_ALLPKG_ONLINE \
33459934d7e5SIsaku Yamahata "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
33469934d7e5SIsaku Yamahata pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
33479934d7e5SIsaku Yamahata return -EBUSY;
33489934d7e5SIsaku Yamahata }
33499934d7e5SIsaku Yamahata
3350fcdbdf63SKai Huang static void __do_tdx_cleanup(void)
3351fcdbdf63SKai Huang {
3352fcdbdf63SKai Huang /*
3353fcdbdf63SKai Huang 	 * Once the TDX module is initialized, it cannot be disabled or
3354fcdbdf63SKai Huang 	 * re-initialized without a runtime update (which isn't supported
3355fcdbdf63SKai Huang 	 * by the kernel).  Only the cpuhp state needs to be removed here.
3356fcdbdf63SKai Huang 	 * The TDX host core code tracks TDX status and can handle the
3357fcdbdf63SKai Huang 	 * 'multiple enabling' scenario.
3358fcdbdf63SKai Huang */
3359fcdbdf63SKai Huang WARN_ON_ONCE(!tdx_cpuhp_state);
3360fcdbdf63SKai Huang cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3361fcdbdf63SKai Huang tdx_cpuhp_state = 0;
3362fcdbdf63SKai Huang }
3363fcdbdf63SKai Huang
3364fcdbdf63SKai Huang static void __tdx_cleanup(void)
3365fcdbdf63SKai Huang {
3366fcdbdf63SKai Huang cpus_read_lock();
3367fcdbdf63SKai Huang __do_tdx_cleanup();
3368fcdbdf63SKai Huang cpus_read_unlock();
3369fcdbdf63SKai Huang }
3370fcdbdf63SKai Huang
3371fcdbdf63SKai Huang static int __init __do_tdx_bringup(void)
3372fcdbdf63SKai Huang {
3373fcdbdf63SKai Huang int r;
3374fcdbdf63SKai Huang
3375fcdbdf63SKai Huang /*
3376fcdbdf63SKai Huang * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3377fcdbdf63SKai Huang * online CPUs before calling tdx_enable(), and on any new
3378fcdbdf63SKai Huang 	 * going-online CPU to make sure it is ready to run TDX guests.
3379fcdbdf63SKai Huang */
3380fcdbdf63SKai Huang r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3381fcdbdf63SKai Huang "kvm/cpu/tdx:online",
33829934d7e5SIsaku Yamahata tdx_online_cpu, tdx_offline_cpu);
3383fcdbdf63SKai Huang if (r < 0)
3384fcdbdf63SKai Huang return r;
3385fcdbdf63SKai Huang
3386fcdbdf63SKai Huang tdx_cpuhp_state = r;
3387fcdbdf63SKai Huang
3388fcdbdf63SKai Huang r = tdx_enable();
3389fcdbdf63SKai Huang if (r)
3390fcdbdf63SKai Huang __do_tdx_cleanup();
3391fcdbdf63SKai Huang
3392fcdbdf63SKai Huang return r;
3393fcdbdf63SKai Huang }
3394fcdbdf63SKai Huang
3395fcdbdf63SKai Huang static int __init __tdx_bringup(void)
3396fcdbdf63SKai Huang {
3397f94f4a97SIsaku Yamahata const struct tdx_sys_info_td_conf *td_conf;
3398e0b4f31aSIsaku Yamahata int r, i;
3399e0b4f31aSIsaku Yamahata
3400e0b4f31aSIsaku Yamahata for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3401e0b4f31aSIsaku Yamahata /*
3402e0b4f31aSIsaku Yamahata * Check if MSRs (tdx_uret_msrs) can be saved/restored
3403e0b4f31aSIsaku Yamahata * before returning to user space.
3404e0b4f31aSIsaku Yamahata *
3405e0b4f31aSIsaku Yamahata * this_cpu_ptr(user_return_msrs)->registered isn't checked
3406e0b4f31aSIsaku Yamahata * because the registration is done at vcpu runtime by
3407e0b4f31aSIsaku Yamahata * tdx_user_return_msr_update_cache().
3408e0b4f31aSIsaku Yamahata */
3409e0b4f31aSIsaku Yamahata tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3410e0b4f31aSIsaku Yamahata if (tdx_uret_msrs[i].slot == -1) {
3411e0b4f31aSIsaku Yamahata /* If any MSR isn't supported, it is a KVM bug */
3412e0b4f31aSIsaku Yamahata pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3413e0b4f31aSIsaku Yamahata tdx_uret_msrs[i].msr);
3414e0b4f31aSIsaku Yamahata return -EIO;
3415e0b4f31aSIsaku Yamahata }
3416e0b4f31aSIsaku Yamahata }
3417fcdbdf63SKai Huang
3418fcdbdf63SKai Huang /*
3419fcdbdf63SKai Huang * Enabling TDX requires enabling hardware virtualization first,
3420fcdbdf63SKai Huang 	 * as making SEAMCALLs requires the CPU to be in post-VMXON state.
3421fcdbdf63SKai Huang */
3422fcdbdf63SKai Huang r = kvm_enable_virtualization();
3423fcdbdf63SKai Huang if (r)
3424fcdbdf63SKai Huang return r;
3425fcdbdf63SKai Huang
3426fcdbdf63SKai Huang cpus_read_lock();
3427fcdbdf63SKai Huang r = __do_tdx_bringup();
3428fcdbdf63SKai Huang cpus_read_unlock();
3429fcdbdf63SKai Huang
3430fcdbdf63SKai Huang if (r)
3431fcdbdf63SKai Huang goto tdx_bringup_err;
3432fcdbdf63SKai Huang
343345154fb0SKai Huang /* Get TDX global information for later use */
343445154fb0SKai Huang tdx_sysinfo = tdx_get_sysinfo();
343545154fb0SKai Huang if (WARN_ON_ONCE(!tdx_sysinfo)) {
343645154fb0SKai Huang r = -EINVAL;
343745154fb0SKai Huang goto get_sysinfo_err;
343845154fb0SKai Huang }
343945154fb0SKai Huang
344061bb2827SIsaku Yamahata /* Check TDX module and KVM capabilities */
344161bb2827SIsaku Yamahata if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
344261bb2827SIsaku Yamahata !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
344361bb2827SIsaku Yamahata goto get_sysinfo_err;
344461bb2827SIsaku Yamahata
344561bb2827SIsaku Yamahata if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
344661bb2827SIsaku Yamahata goto get_sysinfo_err;
344761bb2827SIsaku Yamahata
3448fcdbdf63SKai Huang /*
3449f94f4a97SIsaku Yamahata 	 * TDX has its own limit of maximum vCPUs it can support for all
3450f94f4a97SIsaku Yamahata 	 * TDX guests in addition to KVM_MAX_VCPUS.  Userspace needs to
3451f94f4a97SIsaku Yamahata 	 * query a TDX guest's maximum vCPUs by checking the KVM_CAP_MAX_VCPUS
3452f94f4a97SIsaku Yamahata 	 * extension on a per-VM basis.
3453f94f4a97SIsaku Yamahata 	 *
3454f94f4a97SIsaku Yamahata 	 * The TDX module reports this limit via the MAX_VCPU_PER_TD global
3455f94f4a97SIsaku Yamahata 	 * metadata.  Different modules may report different values.  Some
3456f94f4a97SIsaku Yamahata 	 * older modules may not support this metadata at all (in which
3457f94f4a97SIsaku Yamahata 	 * case the limit is U16_MAX).
3458f94f4a97SIsaku Yamahata 	 *
3459f94f4a97SIsaku Yamahata 	 * In practice, the reported value reflects the maximum number of
3460f94f4a97SIsaku Yamahata 	 * logical CPUs that ALL the platforms supported by the module can
3461f94f4a97SIsaku Yamahata 	 * possibly have.
3462f94f4a97SIsaku Yamahata 	 *
3463f94f4a97SIsaku Yamahata 	 * Simply forwarding MAX_VCPU_PER_TD to userspace could result in
3464f94f4a97SIsaku Yamahata 	 * an unpredictable ABI.  KVM instead always advertises the number
3465f94f4a97SIsaku Yamahata 	 * of logical CPUs the platform has as the maximum number of vCPUs
3466f94f4a97SIsaku Yamahata 	 * for TDX guests.
3467f94f4a97SIsaku Yamahata 	 *
3468f94f4a97SIsaku Yamahata 	 * Make sure MAX_VCPU_PER_TD as reported by the TDX module is not
3469f94f4a97SIsaku Yamahata 	 * smaller than the number of logical CPUs, otherwise KVM would
3470f94f4a97SIsaku Yamahata 	 * report an unsupported value to userspace.
3471f94f4a97SIsaku Yamahata 	 *
3472f94f4a97SIsaku Yamahata 	 * Note, a platform with TDX enabled in the BIOS cannot support
3473f94f4a97SIsaku Yamahata 	 * physical CPU hotplug, and TDX requires the BIOS to have marked
3474f94f4a97SIsaku Yamahata 	 * all logical CPUs in the MADT table as enabled.  Just use
3475f94f4a97SIsaku Yamahata 	 * num_present_cpus() for the number of logical CPUs.
3476f94f4a97SIsaku Yamahata */
3477f94f4a97SIsaku Yamahata td_conf = &tdx_sysinfo->td_conf;
3478f94f4a97SIsaku Yamahata if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3479f94f4a97SIsaku Yamahata pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3480f94f4a97SIsaku Yamahata td_conf->max_vcpus_per_td, num_present_cpus());
3481f94f4a97SIsaku Yamahata r = -EINVAL;
3482f94f4a97SIsaku Yamahata goto get_sysinfo_err;
3483f94f4a97SIsaku Yamahata }
3484f94f4a97SIsaku Yamahata
34857c035beaSZhiming Hu if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
34867c035beaSZhiming Hu r = -EINVAL;
34877c035beaSZhiming Hu goto get_sysinfo_err;
34887c035beaSZhiming Hu }
34897c035beaSZhiming Hu
3490f94f4a97SIsaku Yamahata /*
3491fcdbdf63SKai Huang * Leave hardware virtualization enabled after TDX is enabled
3492fcdbdf63SKai Huang * successfully. TDX CPU hotplug depends on this.
3493fcdbdf63SKai Huang */
3494fcdbdf63SKai Huang return 0;
349561bb2827SIsaku Yamahata
349645154fb0SKai Huang get_sysinfo_err:
349745154fb0SKai Huang __tdx_cleanup();
3498fcdbdf63SKai Huang tdx_bringup_err:
3499fcdbdf63SKai Huang kvm_disable_virtualization();
3500fcdbdf63SKai Huang return r;
3501fcdbdf63SKai Huang }
3502fcdbdf63SKai Huang
3503fcdbdf63SKai Huang void tdx_cleanup(void)
3504fcdbdf63SKai Huang {
3505fcdbdf63SKai Huang if (enable_tdx) {
35067c035beaSZhiming Hu misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3507fcdbdf63SKai Huang __tdx_cleanup();
3508fcdbdf63SKai Huang kvm_disable_virtualization();
3509fcdbdf63SKai Huang }
3510fcdbdf63SKai Huang }
3511fcdbdf63SKai Huang
3512fcdbdf63SKai Huang int __init tdx_bringup(void)
3513fcdbdf63SKai Huang {
3514d789fa6eSIsaku Yamahata int r, i;
3515d789fa6eSIsaku Yamahata
3516d789fa6eSIsaku Yamahata /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3517d789fa6eSIsaku Yamahata for_each_possible_cpu(i)
3518d789fa6eSIsaku Yamahata INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3519fcdbdf63SKai Huang
3520fcdbdf63SKai Huang if (!enable_tdx)
3521fcdbdf63SKai Huang return 0;
3522fcdbdf63SKai Huang
3523427a6486SIsaku Yamahata if (!enable_ept) {
3524427a6486SIsaku Yamahata pr_err("EPT is required for TDX\n");
3525427a6486SIsaku Yamahata goto success_disable_tdx;
3526427a6486SIsaku Yamahata }
3527427a6486SIsaku Yamahata
3528427a6486SIsaku Yamahata if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3529427a6486SIsaku Yamahata 		pr_err("TDP MMU, MMIO caching and EPT A/D bits are required for TDX\n");
3530427a6486SIsaku Yamahata goto success_disable_tdx;
3531427a6486SIsaku Yamahata }
3532427a6486SIsaku Yamahata
3533f65916aeSIsaku Yamahata if (!enable_apicv) {
3534f65916aeSIsaku Yamahata pr_err("APICv is required for TDX\n");
3535f65916aeSIsaku Yamahata goto success_disable_tdx;
3536f65916aeSIsaku Yamahata }
3537f65916aeSIsaku Yamahata
35386bfa6d85SIsaku Yamahata if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
35396bfa6d85SIsaku Yamahata pr_err("tdx: OSXSAVE is required for TDX\n");
35406bfa6d85SIsaku Yamahata goto success_disable_tdx;
35416bfa6d85SIsaku Yamahata }
35426bfa6d85SIsaku Yamahata
35438d032b68SIsaku Yamahata if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
35448d032b68SIsaku Yamahata pr_err("tdx: MOVDIR64B is required for TDX\n");
35458d032b68SIsaku Yamahata goto success_disable_tdx;
35468d032b68SIsaku Yamahata }
35478d032b68SIsaku Yamahata
354890fe64a9SYan Zhao if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
354990fe64a9SYan Zhao pr_err("Self-snoop is required for TDX\n");
355090fe64a9SYan Zhao goto success_disable_tdx;
355190fe64a9SYan Zhao }
355290fe64a9SYan Zhao
3553fcdbdf63SKai Huang if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3554fcdbdf63SKai Huang pr_err("tdx: no TDX private KeyIDs available\n");
3555fcdbdf63SKai Huang goto success_disable_tdx;
3556fcdbdf63SKai Huang }
3557fcdbdf63SKai Huang
3558fcdbdf63SKai Huang if (!enable_virt_at_load) {
3559fcdbdf63SKai Huang 		pr_err("tdx: TDX requires kvm.enable_virt_at_load=1\n");
3560fcdbdf63SKai Huang goto success_disable_tdx;
3561fcdbdf63SKai Huang }
3562fcdbdf63SKai Huang
3563fcdbdf63SKai Huang /*
3564fcdbdf63SKai Huang 	 * Ideally KVM should probe whether the TDX module has been loaded
3565fcdbdf63SKai Huang 	 * first and then try to bring it up.  But TDX needs to use a SEAMCALL
3566fcdbdf63SKai Huang 	 * to probe whether the module is loaded (there is no CPUID or MSR
3567fcdbdf63SKai Huang 	 * for that), and making a SEAMCALL requires enabling virtualization
3568fcdbdf63SKai Huang 	 * first, just like the remaining steps of bringing up the TDX module.
3569fcdbdf63SKai Huang *
3570fcdbdf63SKai Huang * So, for simplicity do everything in __tdx_bringup(); the first
3571fcdbdf63SKai Huang * SEAMCALL will return -ENODEV when the module is not loaded. The
3572fcdbdf63SKai Huang * only complication is having to make sure that initialization
3573fcdbdf63SKai Huang * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3574fcdbdf63SKai Huang * cases.
3575fcdbdf63SKai Huang */
3576fcdbdf63SKai Huang r = __tdx_bringup();
3577fcdbdf63SKai Huang if (r) {
3578fcdbdf63SKai Huang /*
3579fcdbdf63SKai Huang 		 * If the TDX module could not be loaded, disable TDX but
3580fcdbdf63SKai Huang 		 * don't fail to load KVM.  No need to print a message
3581fcdbdf63SKai Huang 		 * saying "module is not loaded" because one was already
3582fcdbdf63SKai Huang 		 * printed when the first SEAMCALL failed.
3583fcdbdf63SKai Huang */
3584fcdbdf63SKai Huang if (r == -ENODEV)
3585fcdbdf63SKai Huang goto success_disable_tdx;
3586fcdbdf63SKai Huang
3587fcdbdf63SKai Huang enable_tdx = 0;
3588fcdbdf63SKai Huang }
3589fcdbdf63SKai Huang
3590fcdbdf63SKai Huang return r;
3591fcdbdf63SKai Huang
3592fcdbdf63SKai Huang success_disable_tdx:
3593fcdbdf63SKai Huang enable_tdx = 0;
3594fcdbdf63SKai Huang return 0;
3595fcdbdf63SKai Huang }
3596