xref: /linux/arch/x86/kvm/vmx/tdx.c (revision e669e322c52c49c161e46492963e64319fbb53a8)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21 
22 #pragma GCC poison to_vmx
23 
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26 
27 #define pr_tdx_error(__fn, __err)	\
28 	pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29 
30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...)		\
31 	pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt,  __err,  __VA_ARGS__)
32 
33 #define pr_tdx_error_1(__fn, __err, __rcx)		\
34 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35 
36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx)	\
37 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38 
39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8)	\
40 	__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
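/*
 * Usage sketch (hypothetical call site, not taken from this file): a failed
 * SEAMCALL would typically be reported as
 *	pr_tdx_error(TDH_MNG_CREATE, err);
 * which prints "SEAMCALL TDH_MNG_CREATE failed: 0x<err>", rate-limited.
 */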
41 
42 bool enable_tdx __ro_after_init;
43 module_param_named(tdx, enable_tdx, bool, 0444);
44 
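/*
 * GFN of the TDX "shared" GPA bit: GPA bit 51 when the TD uses 5-level EPT
 * (PWL 5) and GPA bit 47 when it uses 4-level EPT (PWL 4).
 */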
45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
47 
48 static enum cpuhp_state tdx_cpuhp_state;
49 
50 static const struct tdx_sys_info *tdx_sysinfo;
51 
52 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53 {
54 	KVM_BUG_ON(1, tdx->vcpu.kvm);
55 	pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56 }
57 
58 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
59 		      u64 val, u64 err)
60 {
61 	KVM_BUG_ON(1, tdx->vcpu.kvm);
62 	pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63 }
64 
65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66 
67 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68 {
69 	return container_of(kvm, struct kvm_tdx, kvm);
70 }
71 
72 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73 {
74 	return container_of(vcpu, struct vcpu_tdx, vcpu);
75 }
76 
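/*
 * Compute the TD attributes KVM can support, constrained by the TDX module's
 * fixed0/fixed1 masks: return 0 if any fixed-1 attribute is outside KVM's
 * supported set, and mask off anything not allowed by fixed0.
 */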
77 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78 {
79 	u64 val = KVM_SUPPORTED_TD_ATTRS;
80 
81 	if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82 		return 0;
83 
84 	val &= td_conf->attributes_fixed0;
85 
86 	return val;
87 }
88 
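/*
 * Compute the supportable XFAM from KVM's supported XCR0/XSS features,
 * applying the same fixed0/fixed1 filtering as for the TD attributes.
 */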
89 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90 {
91 	u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92 
93 	if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94 		return 0;
95 
96 	val &= td_conf->xfam_fixed0;
97 
98 	return val;
99 }
100 
101 static int tdx_get_guest_phys_addr_bits(const u32 eax)
102 {
103 	return (eax & GENMASK(23, 16)) >> 16;
104 }
105 
106 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107 {
108 	return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109 }
110 
111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112 
113 static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114 {
115 	return entry->function == 7 && entry->index == 0 &&
116 	       (entry->ebx & TDX_FEATURE_TSX);
117 }
118 
119 static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120 {
121 	entry->ebx &= ~TDX_FEATURE_TSX;
122 }
123 
124 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125 {
126 	return entry->function == 7 && entry->index == 0 &&
127 	       (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128 }
129 
130 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131 {
132 	entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133 }
134 
135 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136 {
137 	if (has_tsx(entry))
138 		clear_tsx(entry);
139 
140 	if (has_waitpkg(entry))
141 		clear_waitpkg(entry);
142 }
143 
144 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145 {
146 	return has_tsx(entry) || has_waitpkg(entry);
147 }
148 
149 #define KVM_TDX_CPUID_NO_SUBLEAF	((__u32)-1)
150 
151 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152 {
153 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154 
155 	entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156 	entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157 	entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158 	entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159 	entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160 	entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161 
162 	if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163 		entry->index = 0;
164 
165 	/*
166 	 * The TDX module doesn't allow configuring the guest phys addr bits
167 	 * (EAX[23:16]).  However, KVM uses it as an interface for userspace
168 	 * to configure the GPAW.  Report these bits as configurable.
169 	 */
170 	if (entry->function == 0x80000008)
171 		entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172 
173 	tdx_clear_unsupported_cpuid(entry);
174 }
175 
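/*
 * Fill the kvm_tdx_capabilities reported to userspace from the TDX module's
 * TD-configuration metadata.  The caller is assumed to have sized
 * caps->cpuid.entries for at least td_conf->num_cpuid_config entries.
 */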
176 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
177 			     struct kvm_tdx_capabilities *caps)
178 {
179 	int i;
180 
181 	caps->supported_attrs = tdx_get_supported_attrs(td_conf);
182 	if (!caps->supported_attrs)
183 		return -EIO;
184 
185 	caps->supported_xfam = tdx_get_supported_xfam(td_conf);
186 	if (!caps->supported_xfam)
187 		return -EIO;
188 
189 	caps->cpuid.nent = td_conf->num_cpuid_config;
190 
191 	for (i = 0; i < td_conf->num_cpuid_config; i++)
192 		td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
193 
194 	return 0;
195 }
196 
197 /*
198  * Some SEAMCALLs acquire the TDX module globally, and can fail with
199  * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
200  */
201 static DEFINE_MUTEX(tdx_lock);
202 
203 static atomic_t nr_configured_hkid;
204 
205 static bool tdx_operand_busy(u64 err)
206 {
207 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
208 }
209 
210 
211 /*
212  * A per-CPU list of TD vCPUs associated with a given CPU.
213  * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
214  * list.
215  * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
216  *   the old CPU during the IPI callback running on the old CPU, and then added
217  *   to the per-CPU list of the new CPU.
218  * - When a TD is tearing down, all vCPUs are disassociated from their current
219  *   running CPUs and removed from the per-CPU list during the IPI callback
220  *   running on those CPUs.
221  * - When a CPU is brought down, traverse the per-CPU list to disassociate all
222  *   associated TD vCPUs and remove them from the per-CPU list.
223  */
224 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
225 
226 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
227 {
228 	return to_tdx(vcpu)->vp_enter_args.r10;
229 }
230 
231 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
232 {
233 	return to_tdx(vcpu)->vp_enter_args.r11;
234 }
235 
236 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
237 						     long val)
238 {
239 	to_tdx(vcpu)->vp_enter_args.r10 = val;
240 }
241 
242 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
243 						    unsigned long val)
244 {
245 	to_tdx(vcpu)->vp_enter_args.r11 = val;
246 }
247 
248 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
249 {
250 	tdx_guest_keyid_free(kvm_tdx->hkid);
251 	kvm_tdx->hkid = -1;
252 	atomic_dec(&nr_configured_hkid);
253 	misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
254 	put_misc_cg(kvm_tdx->misc_cg);
255 	kvm_tdx->misc_cg = NULL;
256 }
257 
258 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
259 {
260 	return kvm_tdx->hkid > 0;
261 }
262 
263 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
264 {
265 	lockdep_assert_irqs_disabled();
266 
267 	list_del(&to_tdx(vcpu)->cpu_list);
268 
269 	/*
270 	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
271 	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
272 	 * to its list before it's deleted from this CPU's list.
273 	 */
274 	smp_wmb();
275 
276 	vcpu->cpu = -1;
277 }
278 
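/*
 * Zero a page with MOVDIR64B so that a page that was previously encrypted
 * with a private HKID (and possibly marked poisoned) can be reused safely.
 */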
279 static void tdx_clear_page(struct page *page)
280 {
281 	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
282 	void *dest = page_to_virt(page);
283 	unsigned long i;
284 
285 	/*
286 	 * The page could have been poisoned.  MOVDIR64B also clears
287 	 * the poison bit so the kernel can safely use the page again.
288 	 */
289 	for (i = 0; i < PAGE_SIZE; i += 64)
290 		movdir64b(dest + i, zero_page);
291 	/*
292 	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
293 	 * from seeing potentially poisoned cache.
294 	 */
295 	__mb();
296 }
297 
298 static void tdx_no_vcpus_enter_start(struct kvm *kvm)
299 {
300 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
301 
302 	lockdep_assert_held_write(&kvm->mmu_lock);
303 
304 	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
305 
306 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
307 }
308 
309 static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
310 {
311 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
312 
313 	lockdep_assert_held_write(&kvm->mmu_lock);
314 
315 	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
316 }
317 
318 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
319 static int __tdx_reclaim_page(struct page *page)
320 {
321 	u64 err, rcx, rdx, r8;
322 
323 	err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
324 
325 	/*
326 	 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
327 	 * before the HKID is released and control pages have also been
328 	 * released at this point, so there is no possibility of contention.
329 	 */
330 	if (WARN_ON_ONCE(err)) {
331 		pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
332 		return -EIO;
333 	}
334 	return 0;
335 }
336 
337 static int tdx_reclaim_page(struct page *page)
338 {
339 	int r;
340 
341 	r = __tdx_reclaim_page(page);
342 	if (!r)
343 		tdx_clear_page(page);
344 	return r;
345 }
346 
347 
348 /*
349  * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
350  * private KeyID.  Assume the cache associated with the TDX private KeyID has
351  * been flushed.
352  */
353 static void tdx_reclaim_control_page(struct page *ctrl_page)
354 {
355 	/*
356 	 * Leak the page if the kernel failed to reclaim the page.
357 	 * The kernel cannot use it safely anymore.
358 	 */
359 	if (tdx_reclaim_page(ctrl_page))
360 		return;
361 
362 	__free_page(ctrl_page);
363 }
364 
365 struct tdx_flush_vp_arg {
366 	struct kvm_vcpu *vcpu;
367 	u64 err;
368 };
369 
370 static void tdx_flush_vp(void *_arg)
371 {
372 	struct tdx_flush_vp_arg *arg = _arg;
373 	struct kvm_vcpu *vcpu = arg->vcpu;
374 	u64 err;
375 
376 	arg->err = 0;
377 	lockdep_assert_irqs_disabled();
378 
379 	/* Task migration can race with CPU offlining. */
380 	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
381 		return;
382 
383 	/*
384 	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
385 	 * list tracking still needs to be updated so that it's correct if/when
386 	 * the vCPU does get initialized.
387 	 */
388 	if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
389 		/*
390 		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are:
391 		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
392 		 * vp flush function is called when destroying a vCPU/TD or during
393 		 * vCPU migration.  No other thread uses TDVPR in those cases.
394 		 */
395 		err = tdh_vp_flush(&to_tdx(vcpu)->vp);
396 		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
397 			/*
398 			 * This function is called in IPI context. Do not use
399 			 * printk to avoid console semaphore.
400 			 * The caller prints out the error message, instead.
401 			 */
402 			if (err)
403 				arg->err = err;
404 		}
405 	}
406 
407 	tdx_disassociate_vp(vcpu);
408 }
409 
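/*
 * Flush a vCPU that may be associated with another CPU by running
 * tdx_flush_vp() on that CPU via IPI.  A vCPU that was never loaded
 * (vcpu->cpu == -1) needs no flush.
 */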
410 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
411 {
412 	struct tdx_flush_vp_arg arg = {
413 		.vcpu = vcpu,
414 	};
415 	int cpu = vcpu->cpu;
416 
417 	if (unlikely(cpu == -1))
418 		return;
419 
420 	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
421 	if (KVM_BUG_ON(arg.err, vcpu->kvm))
422 		pr_tdx_error(TDH_VP_FLUSH, arg.err);
423 }
424 
425 void tdx_disable_virtualization_cpu(void)
426 {
427 	int cpu = raw_smp_processor_id();
428 	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
429 	struct tdx_flush_vp_arg arg;
430 	struct vcpu_tdx *tdx, *tmp;
431 	unsigned long flags;
432 
433 	local_irq_save(flags);
434 	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
435 	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
436 		arg.vcpu = &tdx->vcpu;
437 		tdx_flush_vp(&arg);
438 	}
439 	local_irq_restore(flags);
440 }
441 
442 #define TDX_SEAMCALL_RETRIES 10000
443 
444 static void smp_func_do_phymem_cache_wb(void *unused)
445 {
446 	u64 err = 0;
447 	bool resume;
448 	int i;
449 
450 	/*
451 	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
452 	 * KeyID on the package or core.  The TDX module may not finish the
453 	 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead.  The
454 	 * kernel should retry it until it returns success w/o rescheduling.
455 	 */
456 	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
457 		resume = !!err;
458 		err = tdh_phymem_cache_wb(resume);
459 		switch (err) {
460 		case TDX_INTERRUPTED_RESUMABLE:
461 			continue;
462 		case TDX_NO_HKID_READY_TO_WBCACHE:
463 			err = TDX_SUCCESS; /* Already done by other thread */
464 			fallthrough;
465 		default:
466 			goto out;
467 		}
468 	}
469 
470 out:
471 	if (WARN_ON_ONCE(err))
472 		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
473 }
474 
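/*
 * Release the guest's private HKID: flush all vCPUs, issue
 * TDH.MNG.VPFLUSHDONE, write back caches via TDH.PHYMEM.CACHE.WB on one CPU
 * per package (or on every CPU if cpumask allocation failed), then free the
 * keyid with TDH.MNG.KEY.FREEID.  On failure the HKID is intentionally leaked.
 */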
475 void tdx_mmu_release_hkid(struct kvm *kvm)
476 {
477 	bool packages_allocated, targets_allocated;
478 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
479 	cpumask_var_t packages, targets;
480 	struct kvm_vcpu *vcpu;
481 	unsigned long j;
482 	int i;
483 	u64 err;
484 
485 	if (!is_hkid_assigned(kvm_tdx))
486 		return;
487 
488 	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
489 	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
490 	cpus_read_lock();
491 
492 	kvm_for_each_vcpu(j, vcpu, kvm)
493 		tdx_flush_vp_on_cpu(vcpu);
494 
495 	/*
496 	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
497 	 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
498 	 * Multiple TDX guests can be destroyed simultaneously.  Take the
499 	 * mutex so that concurrent teardown doesn't trip that error.
500 	 */
501 	mutex_lock(&tdx_lock);
502 
503 	/*
504 	 * Releasing HKID is in vm_destroy().
505 	 * After the vCPU flushes above, there should be no more vCPU
506 	 * associations, as all vCPU fds have been released at this stage.
507 	 */
508 	err = tdh_mng_vpflushdone(&kvm_tdx->td);
509 	if (err == TDX_FLUSHVP_NOT_DONE)
510 		goto out;
511 	if (KVM_BUG_ON(err, kvm)) {
512 		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
513 		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
514 		       kvm_tdx->hkid);
515 		goto out;
516 	}
517 
518 	for_each_online_cpu(i) {
519 		if (packages_allocated &&
520 		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
521 					     packages))
522 			continue;
523 		if (targets_allocated)
524 			cpumask_set_cpu(i, targets);
525 	}
526 	if (targets_allocated)
527 		on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
528 	else
529 		on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
530 	/*
531 	 * In the case of error in smp_func_do_phymem_cache_wb(), the following
532 	 * tdh_mng_key_freeid() will fail.
533 	 */
534 	err = tdh_mng_key_freeid(&kvm_tdx->td);
535 	if (KVM_BUG_ON(err, kvm)) {
536 		pr_tdx_error(TDH_MNG_KEY_FREEID, err);
537 		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
538 		       kvm_tdx->hkid);
539 	} else {
540 		tdx_hkid_free(kvm_tdx);
541 	}
542 
543 out:
544 	mutex_unlock(&tdx_lock);
545 	cpus_read_unlock();
546 	free_cpumask_var(targets);
547 	free_cpumask_var(packages);
548 }
549 
550 static void tdx_reclaim_td_control_pages(struct kvm *kvm)
551 {
552 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
553 	u64 err;
554 	int i;
555 
556 	/*
557 	 * tdx_mmu_release_hkid() failed to reclaim the HKID.  Something went
558 	 * seriously wrong with the TDX module.  Give up freeing the TD pages.  As
559 	 * the function already warned, don't warn again.
560 	 */
561 	if (is_hkid_assigned(kvm_tdx))
562 		return;
563 
564 	if (kvm_tdx->td.tdcs_pages) {
565 		for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
566 			if (!kvm_tdx->td.tdcs_pages[i])
567 				continue;
568 
569 			tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
570 		}
571 		kfree(kvm_tdx->td.tdcs_pages);
572 		kvm_tdx->td.tdcs_pages = NULL;
573 	}
574 
575 	if (!kvm_tdx->td.tdr_page)
576 		return;
577 
578 	if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
579 		return;
580 
581 	/*
582 	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
583 	 * KeyID.  The TDX module may access the TDR while operating on the TD
584 	 * (especially when it is reclaiming the TDCS).
585 	 */
586 	err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
587 	if (KVM_BUG_ON(err, kvm)) {
588 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
589 		return;
590 	}
591 	tdx_clear_page(kvm_tdx->td.tdr_page);
592 
593 	__free_page(kvm_tdx->td.tdr_page);
594 	kvm_tdx->td.tdr_page = NULL;
595 }
596 
597 void tdx_vm_destroy(struct kvm *kvm)
598 {
599 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
600 
601 	tdx_reclaim_td_control_pages(kvm);
602 
603 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
604 }
605 
606 static int tdx_do_tdh_mng_key_config(void *param)
607 {
608 	struct kvm_tdx *kvm_tdx = param;
609 	u64 err;
610 
611 	/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
612 	err = tdh_mng_key_config(&kvm_tdx->td);
613 
614 	if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
615 		pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
616 		return -EIO;
617 	}
618 
619 	return 0;
620 }
621 
622 int tdx_vm_init(struct kvm *kvm)
623 {
624 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
625 
626 	kvm->arch.has_protected_state = true;
627 	kvm->arch.has_private_mem = true;
628 	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
629 
630 	/*
631 	 * Because the guest TD is protected, the VMM can't parse instructions
632 	 * executed in the TD.  Instead, the guest uses the MMIO hypercall.  For
633 	 * unmodified device drivers, a #VE needs to be injected for MMIO and the
634 	 * #VE handler in the TD converts the MMIO instruction into an MMIO hypercall.
635 	 *
636 	 * The SPTE value for MMIO needs to be set up so that a #VE is injected
637 	 * into the TD instead of triggering EPT MISCONFIG.
638 	 * - RWX=0 so that EPT violation is triggered.
639 	 * - suppress #VE bit is cleared to inject #VE.
640 	 */
641 	kvm_mmu_set_mmio_spte_value(kvm, 0);
642 
643 	/*
644 	 * TDX has its own limit on the maximum number of vCPUs it can support
645 	 * for all TDX guests, in addition to KVM_MAX_VCPUS.  The TDX module
646 	 * reports this limit via the MAX_VCPU_PER_TD global metadata.  In
647 	 * practice, it reflects the number of logical CPUs that ALL
648 	 * platforms that the TDX module supports can possibly have.
649 	 *
650 	 * Limit TDX guest's maximum vCPUs to the number of logical CPUs
651 	 * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
652 	 * userspace would result in an unpredictable ABI.
653 	 */
654 	kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
655 
656 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
657 
658 	return 0;
659 }
660 
661 int tdx_vcpu_create(struct kvm_vcpu *vcpu)
662 {
663 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
664 	struct vcpu_tdx *tdx = to_tdx(vcpu);
665 
666 	if (kvm_tdx->state != TD_STATE_INITIALIZED)
667 		return -EIO;
668 
669 	/*
670 	 * TDX module mandates APICv, which requires an in-kernel local APIC.
671 	 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
672 	 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
673 	 */
674 	if (!irqchip_split(vcpu->kvm))
675 		return -EINVAL;
676 
677 	fpstate_set_confidential(&vcpu->arch.guest_fpu);
678 	vcpu->arch.apic->guest_apic_protected = true;
679 	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
680 
681 	vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
682 
683 	vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
684 	vcpu->arch.cr0_guest_owned_bits = -1ul;
685 	vcpu->arch.cr4_guest_owned_bits = -1ul;
686 
687 	/* KVM can't change TSC offset/multiplier as TDX module manages them. */
688 	vcpu->arch.guest_tsc_protected = true;
689 	vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
690 	vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
691 	vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
692 	vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
693 
694 	vcpu->arch.guest_state_protected =
695 		!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
696 
697 	if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
698 		vcpu->arch.xfd_no_write_intercept = true;
699 
700 	tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
701 	__pi_set_sn(&tdx->vt.pi_desc);
702 
703 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
704 
705 	return 0;
706 }
707 
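/*
 * Associate the vCPU with its new CPU: flush it from the CPU it last ran on
 * (if any) and add it to the new CPU's associated_tdvcpus list.
 */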
708 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
709 {
710 	struct vcpu_tdx *tdx = to_tdx(vcpu);
711 
712 	vmx_vcpu_pi_load(vcpu, cpu);
713 	if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
714 		return;
715 
716 	tdx_flush_vp_on_cpu(vcpu);
717 
718 	KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
719 	local_irq_disable();
720 	/*
721 	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
722 	 * vcpu->cpu is read before tdx->cpu_list.
723 	 */
724 	smp_rmb();
725 
726 	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
727 	local_irq_enable();
728 }
729 
730 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
731 {
732 	/*
733 	 * KVM can't get the interrupt status of a TDX guest, so it assumes
734 	 * interrupts are always allowed unless the TDX guest calls TDVMCALL with
735 	 * HLT, which passes the interrupt-blocked flag.
736 	 */
737 	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
738 	       !to_tdx(vcpu)->vp_enter_args.r12;
739 }
740 
741 bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
742 {
743 	u64 vcpu_state_details;
744 
745 	if (pi_has_pending_interrupt(vcpu))
746 		return true;
747 
748 	/*
749 	 * Only check for pending RVI in the HLT case with interrupts enabled.
750 	 * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
751 	 * interrupt was pending before TD exit, then it _must_ be blocked,
752 	 * otherwise the interrupt would have been serviced at the instruction
753 	 * boundary.
754 	 */
755 	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
756 	    to_tdx(vcpu)->vp_enter_args.r12)
757 		return false;
758 
759 	vcpu_state_details =
760 		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
761 
762 	return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
763 }
764 
765 /*
766  * Compared to vmx_prepare_switch_to_guest(), there is not much to do
767  * as SEAMCALL/SEAMRET calls take care of most of save and restore.
768  */
769 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
770 {
771 	struct vcpu_vt *vt = to_vt(vcpu);
772 
773 	if (vt->guest_state_loaded)
774 		return;
775 
776 	if (likely(is_64bit_mm(current->mm)))
777 		vt->msr_host_kernel_gs_base = current->thread.gsbase;
778 	else
779 		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
780 
781 	vt->host_debugctlmsr = get_debugctlmsr();
782 
783 	vt->guest_state_loaded = true;
784 }
785 
786 struct tdx_uret_msr {
787 	u32 msr;
788 	unsigned int slot;
789 	u64 defval;
790 };
791 
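/*
 * User-return MSRs whose values are presumed clobbered by TDH.VP.ENTER.
 * Rather than restoring them eagerly, KVM only refreshes the user-return MSR
 * cache with the default values below once a TD has been entered.
 */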
792 static struct tdx_uret_msr tdx_uret_msrs[] = {
793 	{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
794 	{.msr = MSR_STAR,},
795 	{.msr = MSR_LSTAR,},
796 	{.msr = MSR_TSC_AUX,},
797 };
798 
799 static void tdx_user_return_msr_update_cache(void)
800 {
801 	int i;
802 
803 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
804 		kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
805 						 tdx_uret_msrs[i].defval);
806 }
807 
808 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
809 {
810 	struct vcpu_vt *vt = to_vt(vcpu);
811 	struct vcpu_tdx *tdx = to_tdx(vcpu);
812 
813 	if (!vt->guest_state_loaded)
814 		return;
815 
816 	++vcpu->stat.host_state_reload;
817 	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
818 
819 	if (tdx->guest_entered) {
820 		tdx_user_return_msr_update_cache();
821 		tdx->guest_entered = false;
822 	}
823 
824 	vt->guest_state_loaded = false;
825 }
826 
827 void tdx_vcpu_put(struct kvm_vcpu *vcpu)
828 {
829 	vmx_vcpu_pi_put(vcpu);
830 	tdx_prepare_switch_to_host(vcpu);
831 }
832 
833 void tdx_vcpu_free(struct kvm_vcpu *vcpu)
834 {
835 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
836 	struct vcpu_tdx *tdx = to_tdx(vcpu);
837 	int i;
838 
839 	/*
840 	 * It is not possible to reclaim pages while hkid is assigned. It might
841 	 * be assigned if:
842 	 * 1. the TD VM is being destroyed but freeing hkid failed, in which
843 	 * case the pages are leaked
844 	 * 2. TD vCPU creation failed and this is the error path, in which case
845 	 * there is nothing to do anyway
846 	 */
847 	if (is_hkid_assigned(kvm_tdx))
848 		return;
849 
850 	if (tdx->vp.tdcx_pages) {
851 		for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
852 			if (tdx->vp.tdcx_pages[i])
853 				tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
854 		}
855 		kfree(tdx->vp.tdcx_pages);
856 		tdx->vp.tdcx_pages = NULL;
857 	}
858 	if (tdx->vp.tdvpr_page) {
859 		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
860 		tdx->vp.tdvpr_page = 0;
861 	}
862 
863 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
864 }
865 
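/* Disallow KVM_RUN unless the vCPU has been initialized and the TD is runnable. */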
866 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
867 {
868 	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
869 		     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
870 		return -EINVAL;
871 
872 	return 1;
873 }
874 
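/*
 * Map TDVMCALL leaves that mirror architectural exits onto synthetic VMX exit
 * reasons so the common VMX exit handlers can be reused.  The EPT_VIOLATION
 * leaf (used for MMIO requests) is reported as EPT_MISCONFIG; anything
 * unrecognized stays EXIT_REASON_TDCALL.
 */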
875 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
876 {
877 	switch (tdvmcall_leaf(vcpu)) {
878 	case EXIT_REASON_CPUID:
879 	case EXIT_REASON_HLT:
880 	case EXIT_REASON_IO_INSTRUCTION:
881 	case EXIT_REASON_MSR_READ:
882 	case EXIT_REASON_MSR_WRITE:
883 		return tdvmcall_leaf(vcpu);
884 	case EXIT_REASON_EPT_VIOLATION:
885 		return EXIT_REASON_EPT_MISCONFIG;
886 	default:
887 		break;
888 	}
889 
890 	return EXIT_REASON_TDCALL;
891 }
892 
893 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
894 {
895 	struct vcpu_tdx *tdx = to_tdx(vcpu);
896 	u32 exit_reason;
897 
898 	switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
899 	case TDX_SUCCESS:
900 	case TDX_NON_RECOVERABLE_VCPU:
901 	case TDX_NON_RECOVERABLE_TD:
902 	case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
903 	case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
904 		break;
905 	default:
906 		return -1u;
907 	}
908 
909 	exit_reason = tdx->vp_enter_ret;
910 
911 	switch (exit_reason) {
912 	case EXIT_REASON_TDCALL:
913 		if (tdvmcall_exit_type(vcpu))
914 			return EXIT_REASON_VMCALL;
915 
916 		return tdcall_to_vmx_exit_reason(vcpu);
917 	case EXIT_REASON_EPT_MISCONFIG:
918 		/*
919 		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
920 		 * non-instrumentable code with interrupts disabled.
921 		 */
922 		return -1u;
923 	default:
924 		break;
925 	}
926 
927 	return exit_reason;
928 }
929 
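/*
 * Enter the TD via TDH.VP.ENTER with IRQs off and convert the SEAMCALL result
 * into VMX-style exit information cached in the vcpu_vt/vcpu_tdx structures.
 */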
930 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
931 {
932 	struct vcpu_tdx *tdx = to_tdx(vcpu);
933 	struct vcpu_vt *vt = to_vt(vcpu);
934 
935 	guest_state_enter_irqoff();
936 
937 	tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
938 
939 	vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
940 
941 	vt->exit_qualification = tdx->vp_enter_args.rcx;
942 	tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
943 	tdx->exit_gpa = tdx->vp_enter_args.r8;
944 	vt->exit_intr_info = tdx->vp_enter_args.r9;
945 
946 	vmx_handle_nmi(vcpu);
947 
948 	guest_state_exit_irqoff();
949 }
950 
951 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
952 {
953 	return vmx_get_exit_reason(vcpu).failed_vmentry &&
954 	       vmx_get_exit_reason(vcpu).full != -1u;
955 }
956 
957 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
958 {
959 	u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
960 
961 	/*
962 	 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
963 	 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
964 	 *
965 	 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
966 	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
967 	 * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
968 	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
969 	 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
970 	 * requester may be blocked endlessly.
971 	 */
972 	if (unlikely(tdx_operand_busy(vp_enter_ret)))
973 		return EXIT_FASTPATH_EXIT_HANDLED;
974 
975 	return EXIT_FASTPATH_NONE;
976 }
977 
978 #define TDX_REGS_AVAIL_SET	(BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
979 				 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
980 				 BIT_ULL(VCPU_REGS_RAX) | \
981 				 BIT_ULL(VCPU_REGS_RBX) | \
982 				 BIT_ULL(VCPU_REGS_RCX) | \
983 				 BIT_ULL(VCPU_REGS_RDX) | \
984 				 BIT_ULL(VCPU_REGS_RBP) | \
985 				 BIT_ULL(VCPU_REGS_RSI) | \
986 				 BIT_ULL(VCPU_REGS_RDI) | \
987 				 BIT_ULL(VCPU_REGS_R8) | \
988 				 BIT_ULL(VCPU_REGS_R9) | \
989 				 BIT_ULL(VCPU_REGS_R10) | \
990 				 BIT_ULL(VCPU_REGS_R11) | \
991 				 BIT_ULL(VCPU_REGS_R12) | \
992 				 BIT_ULL(VCPU_REGS_R13) | \
993 				 BIT_ULL(VCPU_REGS_R14) | \
994 				 BIT_ULL(VCPU_REGS_R15))
995 
996 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
997 {
998 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
999 
1000 	/*
1001 	 * All TDX hosts support PKRU; but even if they didn't,
1002 	 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1003 	 * skipped.
1004 	 */
1005 	if (vcpu->arch.host_pkru != 0)
1006 		wrpkru(vcpu->arch.host_pkru);
1007 
1008 	if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1009 		xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1010 
1011 	/*
1012 	 * Likewise, even if a TDX host didn't support XSS, both arms of
1013 	 * the comparison would be 0 and the wrmsrl would be skipped.
1014 	 */
1015 	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1016 		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1017 }
1018 
1019 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1020 				DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1021 				DEBUGCTLMSR_FREEZE_IN_SMM)
1022 
1023 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
1024 {
1025 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1026 	struct vcpu_vt *vt = to_vt(vcpu);
1027 
1028 	/*
1029 	 * force_immediate_exit requires entering the vCPU for event injection,
1030 	 * followed by an immediate exit.  But the TDX module doesn't guarantee
1031 	 * entry; it's already possible for KVM to _think_ it completely entered
1032 	 * the guest without actually having done so.
1033 	 * Since KVM never needs to force an immediate exit for TDX, and can't
1034 	 * do direct injection, just warn on force_immediate_exit.
1035 	 */
1036 	WARN_ON_ONCE(force_immediate_exit);
1037 
1038 	/*
1039 	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
1040 	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
1041 	 * TDCALLs.
1042 	 */
1043 	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1044 		return EXIT_FASTPATH_EXIT_HANDLED;
1045 
1046 	trace_kvm_entry(vcpu, force_immediate_exit);
1047 
1048 	if (pi_test_on(&vt->pi_desc)) {
1049 		apic->send_IPI_self(POSTED_INTR_VECTOR);
1050 
1051 		if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1052 			       APIC_VECTOR_MASK, &vt->pi_desc))
1053 			kvm_wait_lapic_expire(vcpu);
1054 	}
1055 
1056 	tdx_vcpu_enter_exit(vcpu);
1057 
1058 	if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
1059 		update_debugctlmsr(vt->host_debugctlmsr);
1060 
1061 	tdx_load_host_xsave_state(vcpu);
1062 	tdx->guest_entered = true;
1063 
1064 	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1065 
1066 	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1067 		return EXIT_FASTPATH_NONE;
1068 
1069 	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1070 		return EXIT_FASTPATH_NONE;
1071 
1072 	if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1073 		kvm_machine_check();
1074 
1075 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
1076 
1077 	if (unlikely(tdx_failed_vmentry(vcpu)))
1078 		return EXIT_FASTPATH_NONE;
1079 
1080 	return tdx_exit_handlers_fastpath(vcpu);
1081 }
1082 
1083 void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1084 {
1085 	++vcpu->stat.nmi_injections;
1086 	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1087 	/*
1088 	 * From KVM's perspective, NMI injection is completed right after
1089 	 * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
1090 	 * the TDX module or not.
1091 	 */
1092 	vcpu->arch.nmi_injected = false;
1093 	/*
1094 	 * TDX doesn't allow KVM to request an NMI-window exit.  If there is
1095 	 * still a pending vNMI, KVM is not able to inject it along with the
1096 	 * one pending in TDX module in a back-to-back way.  Since the previous
1097 	 * vNMI is still pending in TDX module, i.e. it has not been delivered
1098 	 * to TDX guest yet, it's OK to collapse the pending vNMI into the
1099 	 * previous one.  The guest is expected to handle all the NMI sources
1100 	 * when handling the first vNMI.
1101 	 */
1102 	vcpu->arch.nmi_pending = 0;
1103 }
1104 
1105 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1106 {
1107 	u32 intr_info = vmx_get_intr_info(vcpu);
1108 
1109 	/*
1110 	 * Machine checks are handled by handle_exception_irqoff(), or by
1111 	 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1112 	 * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
1113 	 */
1114 	if (is_nmi(intr_info) || is_machine_check(intr_info))
1115 		return 1;
1116 
1117 	vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1118 	vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1119 	vcpu->run->ex.error_code = 0;
1120 
1121 	return 0;
1122 }
1123 
1124 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1125 {
1126 	tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1127 	return 1;
1128 }
1129 
1130 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1131 {
1132 	kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1133 	kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1134 	kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1135 	kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1136 	kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1137 
1138 	return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1139 }
1140 
1141 /*
1142  * Split the request into chunks and check for pending interrupts between
1143  * chunks.  This allows timely injection of interrupts and prevents issues
1144  * with guest lockup detection.
1145  */
1146 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1147 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1148 
1149 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1150 {
1151 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1152 
1153 	if (vcpu->run->hypercall.ret) {
1154 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1155 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1156 		return 1;
1157 	}
1158 
1159 	tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1160 	if (tdx->map_gpa_next >= tdx->map_gpa_end)
1161 		return 1;
1162 
1163 	/*
1164 	 * Stop processing the remaining part if there is a pending interrupt,
1165 	 * which could be qualified to deliver.  Skip checking pending RVI for
1166 	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1167 	 */
1168 	if (kvm_vcpu_has_events(vcpu)) {
1169 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1170 		tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1171 		return 1;
1172 	}
1173 
1174 	__tdx_map_gpa(tdx);
1175 	return 0;
1176 }
1177 
1178 static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1179 {
1180 	u64 gpa = tdx->map_gpa_next;
1181 	u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1182 
1183 	if (size > TDX_MAP_GPA_MAX_LEN)
1184 		size = TDX_MAP_GPA_MAX_LEN;
1185 
1186 	tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
1187 	tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
1188 	/*
1189 	 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1190 	 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1191 	 * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
1192 	 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
1193 	 */
1194 	tdx->vcpu.run->hypercall.ret = 0;
1195 	tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1196 	tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1197 	tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1198 					   KVM_MAP_GPA_RANGE_ENCRYPTED :
1199 					   KVM_MAP_GPA_RANGE_DECRYPTED;
1200 	tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;
1201 
1202 	tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1203 }
1204 
1205 static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1206 {
1207 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1208 	u64 gpa = tdx->vp_enter_args.r12;
1209 	u64 size = tdx->vp_enter_args.r13;
1210 	u64 ret;
1211 
1212 	/*
1213 	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1214 	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1215 	 * bit set.  This is a base call so it should always be supported, but
1216 	 * KVM has no way to ensure that userspace implements the GHCI correctly.
1217 	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1218 	 * to the guest.
1219 	 */
1220 	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1221 		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1222 		goto error;
1223 	}
1224 
1225 	if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1226 	    !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1227 	    (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1228 	     vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1229 		ret = TDVMCALL_STATUS_INVALID_OPERAND;
1230 		goto error;
1231 	}
1232 
1233 	if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1234 		ret = TDVMCALL_STATUS_ALIGN_ERROR;
1235 		goto error;
1236 	}
1237 
1238 	tdx->map_gpa_end = gpa + size;
1239 	tdx->map_gpa_next = gpa;
1240 
1241 	__tdx_map_gpa(tdx);
1242 	return 0;
1243 
1244 error:
1245 	tdvmcall_set_return_code(vcpu, ret);
1246 	tdx->vp_enter_args.r11 = gpa;
1247 	return 1;
1248 }
1249 
1250 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1251 {
1252 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1253 	u64 *regs = vcpu->run->system_event.data;
1254 	u64 *module_regs = &tdx->vp_enter_args.r8;
1255 	int index = VCPU_REGS_RAX;
1256 
1257 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1258 	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1259 	vcpu->run->system_event.ndata = 16;
1260 
1261 	/* Dump 16 general-purpose registers to userspace in ascending order. */
1262 	regs[index++] = tdx->vp_enter_ret;
1263 	regs[index++] = tdx->vp_enter_args.rcx;
1264 	regs[index++] = tdx->vp_enter_args.rdx;
1265 	regs[index++] = tdx->vp_enter_args.rbx;
1266 	regs[index++] = 0;
1267 	regs[index++] = 0;
1268 	regs[index++] = tdx->vp_enter_args.rsi;
1269 	regs[index] = tdx->vp_enter_args.rdi;
1270 	for (index = 0; index < 8; index++)
1271 		regs[VCPU_REGS_R8 + index] = module_regs[index];
1272 
1273 	return 0;
1274 }
1275 
1276 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1277 {
1278 	u32 eax, ebx, ecx, edx;
1279 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1280 
1281 	/* EAX and ECX for CPUID are stored in R12 and R13. */
1282 	eax = tdx->vp_enter_args.r12;
1283 	ecx = tdx->vp_enter_args.r13;
1284 
1285 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1286 
1287 	tdx->vp_enter_args.r12 = eax;
1288 	tdx->vp_enter_args.r13 = ebx;
1289 	tdx->vp_enter_args.r14 = ecx;
1290 	tdx->vp_enter_args.r15 = edx;
1291 
1292 	return 1;
1293 }
1294 
1295 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1296 {
1297 	vcpu->arch.pio.count = 0;
1298 	return 1;
1299 }
1300 
1301 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1302 {
1303 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1304 	unsigned long val = 0;
1305 	int ret;
1306 
1307 	ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1308 					 vcpu->arch.pio.port, &val, 1);
1309 
1310 	WARN_ON_ONCE(!ret);
1311 
1312 	tdvmcall_set_return_val(vcpu, val);
1313 
1314 	return 1;
1315 }
1316 
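/*
 * Handle TDVMCALL<Instruction.IO>: R12 holds the access size, R13 the
 * direction (1 = write), R14 the port and R15 the value for writes.  Reuse
 * the emulator's PIO callbacks and fall back to userspace when needed.
 */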
1317 static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1318 {
1319 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1320 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1321 	unsigned long val = 0;
1322 	unsigned int port;
1323 	u64 size, write;
1324 	int ret;
1325 
1326 	++vcpu->stat.io_exits;
1327 
1328 	size = tdx->vp_enter_args.r12;
1329 	write = tdx->vp_enter_args.r13;
1330 	port = tdx->vp_enter_args.r14;
1331 
1332 	if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1333 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1334 		return 1;
1335 	}
1336 
1337 	if (write) {
1338 		val = tdx->vp_enter_args.r15;
1339 		ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1340 	} else {
1341 		ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1342 	}
1343 
1344 	if (!ret)
1345 		vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1346 							   tdx_complete_pio_in;
1347 	else if (!write)
1348 		tdvmcall_set_return_val(vcpu, val);
1349 
1350 	return ret;
1351 }
1352 
1353 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1354 {
1355 	unsigned long val = 0;
1356 	gpa_t gpa;
1357 	int size;
1358 
1359 	gpa = vcpu->mmio_fragments[0].gpa;
1360 	size = vcpu->mmio_fragments[0].len;
1361 
1362 	memcpy(&val, vcpu->run->mmio.data, size);
1363 	tdvmcall_set_return_val(vcpu, val);
1364 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1365 	return 1;
1366 }
1367 
1368 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1369 				 unsigned long val)
1370 {
1371 	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1372 		trace_kvm_fast_mmio(gpa);
1373 		return 0;
1374 	}
1375 
1376 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1377 	if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1378 		return -EOPNOTSUPP;
1379 
1380 	return 0;
1381 }
1382 
1383 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1384 {
1385 	unsigned long val;
1386 
1387 	if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1388 		return -EOPNOTSUPP;
1389 
1390 	tdvmcall_set_return_val(vcpu, val);
1391 	trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1392 	return 0;
1393 }
1394 
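/*
 * Handle the MMIO TDVMCALL: R12 holds the access size, R13 the direction
 * (1 = write), R14 the shared GPA and R15 the value for writes.  Try
 * in-kernel device emulation first, otherwise exit to userspace with
 * KVM_EXIT_MMIO.
 */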
1395 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1396 {
1397 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1398 	int size, write, r;
1399 	unsigned long val;
1400 	gpa_t gpa;
1401 
1402 	size = tdx->vp_enter_args.r12;
1403 	write = tdx->vp_enter_args.r13;
1404 	gpa = tdx->vp_enter_args.r14;
1405 	val = write ? tdx->vp_enter_args.r15 : 0;
1406 
1407 	if (size != 1 && size != 2 && size != 4 && size != 8)
1408 		goto error;
1409 	if (write != 0 && write != 1)
1410 		goto error;
1411 
1412 	/*
1413 	 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
1414 	 * do MMIO emulation for a private GPA.
1415 	 */
1416 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1417 	    vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1418 		goto error;
1419 
1420 	gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1421 
1422 	if (write)
1423 		r = tdx_mmio_write(vcpu, gpa, size, val);
1424 	else
1425 		r = tdx_mmio_read(vcpu, gpa, size);
1426 	if (!r)
1427 		/* Kernel completed device emulation. */
1428 		return 1;
1429 
1430 	/* Request the device emulation to userspace device model. */
1431 	vcpu->mmio_is_write = write;
1432 	if (!write)
1433 		vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1434 
1435 	vcpu->run->mmio.phys_addr = gpa;
1436 	vcpu->run->mmio.len = size;
1437 	vcpu->run->mmio.is_write = write;
1438 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
1439 
1440 	if (write) {
1441 		memcpy(vcpu->run->mmio.data, &val, size);
1442 	} else {
1443 		vcpu->mmio_fragments[0].gpa = gpa;
1444 		vcpu->mmio_fragments[0].len = size;
1445 		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1446 	}
1447 	return 0;
1448 
1449 error:
1450 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1451 	return 1;
1452 }
1453 
1454 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1455 {
1456 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1457 
1458 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1459 
1460 	/*
1461 	 * For now, no TDVMCALL beyond the GHCI base API is supported by KVM
1462 	 * directly without support from userspace, so just set the values
1463 	 * returned from userspace.
1464 	 */
1465 	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1466 	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1467 	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1468 	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1469 
1470 	return 1;
1471 }
1472 
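/*
 * Handle TDVMCALL<GetTdVmCallInfo>.  Leaf 0 is answered by KVM directly (no
 * extended TDVMCALLs are supported in-kernel); leaf 1 is forwarded to
 * userspace via KVM_EXIT_TDX so that it can report what it supports.
 */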
1473 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1474 {
1475 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1476 
1477 	switch (tdx->vp_enter_args.r12) {
1478 	case 0:
1479 		tdx->vp_enter_args.r11 = 0;
1480 		tdx->vp_enter_args.r12 = 0;
1481 		tdx->vp_enter_args.r13 = 0;
1482 		tdx->vp_enter_args.r14 = 0;
1483 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1484 		return 1;
1485 	case 1:
1486 		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1487 		vcpu->run->exit_reason = KVM_EXIT_TDX;
1488 		vcpu->run->tdx.flags = 0;
1489 		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1490 		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1491 		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1492 		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1493 		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1494 		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1495 		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1496 		return 0;
1497 	default:
1498 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1499 		return 1;
1500 	}
1501 }
1502 
1503 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1504 {
1505 	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1506 	return 1;
1507 }
1508 
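/*
 * Handle TDVMCALL<GetQuote> by forwarding the request to userspace via
 * KVM_EXIT_TDX.  R12 holds the shared GPA of the buffer and R13 its size.
 */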
1509 static int tdx_get_quote(struct kvm_vcpu *vcpu)
1510 {
1511 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1512 	u64 gpa = tdx->vp_enter_args.r12;
1513 	u64 size = tdx->vp_enter_args.r13;
1514 
1515 	/* The GPA of the buffer must have the shared bit set. */
1516 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1517 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1518 		return 1;
1519 	}
1520 
1521 	vcpu->run->exit_reason = KVM_EXIT_TDX;
1522 	vcpu->run->tdx.flags = 0;
1523 	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1524 	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1525 	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1526 	vcpu->run->tdx.get_quote.size = size;
1527 
1528 	vcpu->arch.complete_userspace_io = tdx_complete_simple;
1529 
1530 	return 0;
1531 }
1532 
1533 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1534 {
1535 	switch (tdvmcall_leaf(vcpu)) {
1536 	case TDVMCALL_MAP_GPA:
1537 		return tdx_map_gpa(vcpu);
1538 	case TDVMCALL_REPORT_FATAL_ERROR:
1539 		return tdx_report_fatal_error(vcpu);
1540 	case TDVMCALL_GET_TD_VM_CALL_INFO:
1541 		return tdx_get_td_vm_call_info(vcpu);
1542 	case TDVMCALL_GET_QUOTE:
1543 		return tdx_get_quote(vcpu);
1544 	default:
1545 		break;
1546 	}
1547 
1548 	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1549 	return 1;
1550 }
1551 
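/*
 * Program the shared-EPT root for this vCPU.  Only the shared EPT pointer is
 * written here; sanity-check that the shared bit implied by the paging level
 * matches the direct-mapping bits set up for this VM.
 */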
1552 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1553 {
1554 	u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1555 			  TDX_SHARED_BIT_PWL_4;
1556 
1557 	if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1558 		return;
1559 
1560 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1561 }
1562 
1563 static void tdx_unpin(struct kvm *kvm, struct page *page)
1564 {
1565 	put_page(page);
1566 }
1567 
1568 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1569 			    enum pg_level level, struct page *page)
1570 {
1571 	int tdx_level = pg_level_to_tdx_sept_level(level);
1572 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1573 	gpa_t gpa = gfn_to_gpa(gfn);
1574 	u64 entry, level_state;
1575 	u64 err;
1576 
1577 	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1578 	if (unlikely(tdx_operand_busy(err))) {
1579 		tdx_unpin(kvm, page);
1580 		return -EBUSY;
1581 	}
1582 
1583 	if (KVM_BUG_ON(err, kvm)) {
1584 		pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1585 		tdx_unpin(kvm, page);
1586 		return -EIO;
1587 	}
1588 
1589 	return 0;
1590 }
1591 
1592 /*
1593  * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
1594  * callback tdx_gmem_post_populate() then maps pages into private memory
1595  * through the SEAMCALL TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
1596  * private EPT structures for the page to have been built before, which is
1597  * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1598  * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1599  * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1600  * are no half-initialized shared EPT pages.
1601  */
1602 static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
1603 					  enum pg_level level, kvm_pfn_t pfn)
1604 {
1605 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1606 
1607 	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1608 		return -EINVAL;
1609 
1610 	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1611 	atomic64_inc(&kvm_tdx->nr_premapped);
1612 	return 0;
1613 }
1614 
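/*
 * Install a private 4KB page: use TDH.MEM.PAGE.AUG once the TD is runnable,
 * otherwise only account a premapped page for the KVM_TDX_INIT_MEM_REGION
 * flow, where tdx_gmem_post_populate() later adds the page with
 * TDH.MEM.PAGE.ADD().
 */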
1615 int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1616 			      enum pg_level level, kvm_pfn_t pfn)
1617 {
1618 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1619 	struct page *page = pfn_to_page(pfn);
1620 
1621 	/* TODO: handle large pages. */
1622 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1623 		return -EINVAL;
1624 
1625 	/*
1626 	 * Because guest_memfd doesn't support page migration with
1627 	 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1628 	 * migration.  Until guest_memfd supports page migration, prevent page
1629 	 * migration.
1630 	 * TODO: Once guest_memfd introduces callback on page migration,
1631 	 * implement it and remove get_page/put_page().
1632 	 */
1633 	get_page(page);
1634 
1635 	/*
1636 	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1637 	 * barrier in tdx_td_finalize().
1638 	 */
1639 	smp_rmb();
1640 	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1641 		return tdx_mem_page_aug(kvm, gfn, level, page);
1642 
1643 	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1644 }
1645 
1646 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
1647 				      enum pg_level level, struct page *page)
1648 {
1649 	int tdx_level = pg_level_to_tdx_sept_level(level);
1650 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1651 	gpa_t gpa = gfn_to_gpa(gfn);
1652 	u64 err, entry, level_state;
1653 
1654 	/* TODO: handle large pages. */
1655 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1656 		return -EINVAL;
1657 
1658 	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1659 		return -EINVAL;
1660 
1661 	/*
1662 	 * When zapping a private page, the mmu_lock is held for write, so there
1663 	 * is no race with other vCPU SEPT operations.  Races with TDH.VP.ENTER
1664 	 * (due to 0-step mitigation) and guest TDCALLs are still possible.
1665 	 */
1666 	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1667 				  &level_state);
1668 
1669 	if (unlikely(tdx_operand_busy(err))) {
1670 		/*
1671 		 * The second attempt is expected to succeed after kicking off all
1672 		 * other vCPUs and preventing them from invoking TDH.VP.ENTER.
1673 		 */
1674 		tdx_no_vcpus_enter_start(kvm);
1675 		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1676 					  &level_state);
1677 		tdx_no_vcpus_enter_stop(kvm);
1678 	}
1679 
1680 	if (KVM_BUG_ON(err, kvm)) {
1681 		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1682 		return -EIO;
1683 	}
1684 
1685 	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1686 
1687 	if (KVM_BUG_ON(err, kvm)) {
1688 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1689 		return -EIO;
1690 	}
1691 	tdx_clear_page(page);
1692 	tdx_unpin(kvm, page);
1693 	return 0;
1694 }
1695 
1696 int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1697 			      enum pg_level level, void *private_spt)
1698 {
1699 	int tdx_level = pg_level_to_tdx_sept_level(level);
1700 	gpa_t gpa = gfn_to_gpa(gfn);
1701 	struct page *page = virt_to_page(private_spt);
1702 	u64 err, entry, level_state;
1703 
1704 	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1705 			       &level_state);
1706 	if (unlikely(tdx_operand_busy(err)))
1707 		return -EBUSY;
1708 
1709 	if (KVM_BUG_ON(err, kvm)) {
1710 		pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1711 		return -EIO;
1712 	}
1713 
1714 	return 0;
1715 }
1716 
1717 /*
1718  * Check if the error returned from a SEPT zap SEAMCALL is due to a page that
1719  * was mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() having been
1720  * called successfully.
1721  *
1722  * Since tdh_mem_sept_add() must have been invoked successfully before a
1723  * non-leaf entry can be present in the mirrored page table, the SEPT-zap
1724  * related SEAMCALLs should not encounter TDX_EPT_WALK_FAILED. They should instead
1725  * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
1726  * SEPT.
1727  *
1728  * Further check whether the entry returned by the SEPT walk has RWX
1729  * permissions, in order to filter out anything unexpected.
1730  *
1731  * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1732  * level_state returned from a SEAMCALL error is the same as that passed into
1733  * the SEAMCALL.
1734  */
1735 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1736 					     u64 entry, int level)
1737 {
1738 	if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1739 		return false;
1740 
1741 	if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1742 		return false;
1743 
1744 	if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1745 		return false;
1746 
1747 	return true;
1748 }
1749 
1750 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1751 				     enum pg_level level, struct page *page)
1752 {
1753 	int tdx_level = pg_level_to_tdx_sept_level(level);
1754 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1755 	gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1756 	u64 err, entry, level_state;
1757 
1758 	/* For now large page isn't supported yet. */
1759 	WARN_ON_ONCE(level != PG_LEVEL_4K);
1760 
1761 	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1762 
1763 	if (unlikely(tdx_operand_busy(err))) {
1764 		/* After no vCPUs enter, the second retry is expected to succeed */
1765 		tdx_no_vcpus_enter_start(kvm);
1766 		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1767 		tdx_no_vcpus_enter_stop(kvm);
1768 	}
1769 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1770 	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1771 		atomic64_dec(&kvm_tdx->nr_premapped);
1772 		tdx_unpin(kvm, page);
1773 		return 0;
1774 	}
1775 
1776 	if (KVM_BUG_ON(err, kvm)) {
1777 		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1778 		return -EIO;
1779 	}
1780 	return 1;
1781 }
1782 
1783 /*
1784  * Ensure that shared and private EPTs are flushed on all vCPUs.
1785  * tdh_mem_track() is the only caller that increases TD epoch. An increase in
1786  * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1787  * running in guest mode with the value "N - 1".
1788  *
1789  * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1790  * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1791  * is increased to "N + 1".
1792  *
1793  * Kicking off all vCPUs after that further ensures that no vCPU can run in guest
1794  * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
1795  * to increase TD epoch to "N + 2").
1796  *
1797  * The TDX module will flush the EPT on the next TD enter and make vCPUs run in
1798  * guest mode with TD epoch value "N + 1".
1799  *
1800  * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1801  * waiting for the empty IPI handler ack_kick().
1802  *
1803  * No action is required for the vCPUs being kicked off, since the kick-off
1804  * certainly occurs after the TD epoch increment and before the next
1805  * tdh_mem_track().
1806  */
1807 static void tdx_track(struct kvm *kvm)
1808 {
1809 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1810 	u64 err;
1811 
1812 	/* If the TD isn't finalized, no vCPU has run yet. */
1813 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1814 		return;
1815 
1816 	lockdep_assert_held_write(&kvm->mmu_lock);
1817 
1818 	err = tdh_mem_track(&kvm_tdx->td);
1819 	if (unlikely(tdx_operand_busy(err))) {
1820 		/* After no vCPUs enter, the second retry is expected to succeed */
1821 		tdx_no_vcpus_enter_start(kvm);
1822 		err = tdh_mem_track(&kvm_tdx->td);
1823 		tdx_no_vcpus_enter_stop(kvm);
1824 	}
1825 
1826 	if (KVM_BUG_ON(err, kvm))
1827 		pr_tdx_error(TDH_MEM_TRACK, err);
1828 
1829 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1830 }
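
/*
 * Illustrative sketch (not part of the original file): the "retry once with
 * all vCPUs kicked out" idiom shared by tdx_sept_drop_private_spte(),
 * tdx_sept_zap_private_spte() and tdx_track().  The helper and its
 * seamcall_fn callback are hypothetical.
 *
 *	static u64 example_seamcall_retry(struct kvm *kvm,
 *					  u64 (*seamcall_fn)(struct kvm *kvm))
 *	{
 *		u64 err = seamcall_fn(kvm);
 *
 *		if (unlikely(tdx_operand_busy(err))) {
 *			// BUSY comes from contention with TDH.VP.ENTER
 *			// (0-step mitigation) or guest TDCALLs.  With no vCPU
 *			// allowed to enter the guest, the second attempt is
 *			// expected to succeed.
 *			tdx_no_vcpus_enter_start(kvm);
 *			err = seamcall_fn(kvm);
 *			tdx_no_vcpus_enter_stop(kvm);
 *		}
 *
 *		return err;
 *	}
 */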
1831 
1832 int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1833 			      enum pg_level level, void *private_spt)
1834 {
1835 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1836 
1837 	/*
1838 	 * free_external_spt() is only called after the hkid has been freed, when
1839 	 * the TD is being torn down.
1840 	 * KVM doesn't (yet) zap page table pages in the mirror page table while
1841 	 * the TD is active, though guest pages mapped in the mirror page table
1842 	 * could be zapped while the TD is active, e.g. for shared <-> private
1843 	 * conversion and slot move/deletion.
1844 	 */
1845 	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1846 		return -EINVAL;
1847 
1848 	/*
1849 	 * The HKID assigned to this TD was already freed and cache was
1850 	 * already flushed. We don't have to flush again.
1851 	 */
1852 	return tdx_reclaim_page(virt_to_page(private_spt));
1853 }
1854 
1855 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1856 				 enum pg_level level, kvm_pfn_t pfn)
1857 {
1858 	struct page *page = pfn_to_page(pfn);
1859 	int ret;
1860 
1861 	/*
1862 	 * HKID is released after all private pages have been removed, and set
1863 	 * before any might be populated. Warn if zapping is attempted when
1864 	 * there can't be anything populated in the private EPT.
1865 	 */
1866 	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1867 		return -EINVAL;
1868 
1869 	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1870 	if (ret <= 0)
1871 		return ret;
1872 
1873 	/*
1874 	 * TDX requires TLB tracking before dropping private page.  Do
1875 	 * it here, although it is also done later.
1876 	 */
1877 	tdx_track(kvm);
1878 
1879 	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
1880 }
1881 
1882 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1883 			   int trig_mode, int vector)
1884 {
1885 	struct kvm_vcpu *vcpu = apic->vcpu;
1886 	struct vcpu_tdx *tdx = to_tdx(vcpu);
1887 
1888 	/* TDX supports only posted interrupt.  No lapic emulation. */
1889 	__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1890 
1891 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1892 }
1893 
1894 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1895 {
1896 	u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1897 	u64 eq = vmx_get_exit_qual(vcpu);
1898 
1899 	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1900 		return false;
1901 
1902 	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1903 }
1904 
1905 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1906 {
1907 	unsigned long exit_qual;
1908 	gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1909 	bool local_retry = false;
1910 	int ret;
1911 
1912 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1913 		if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1914 			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1915 				gpa, vcpu->vcpu_id);
1916 			kvm_vm_dead(vcpu->kvm);
1917 			return -EIO;
1918 		}
1919 		/*
1920 		 * Always treat SEPT violations as write faults.  Ignore the
1921 		 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1922 		 * TD private pages are always RWX in the SEPT tables,
1923 		 * i.e. they're always mapped writable.  Just as importantly,
1924 		 * treating SEPT violations as write faults is necessary to
1925 		 * avoid COW allocations, which will cause TDAUGPAGE failures
1926 		 * due to aliasing a single HPA to multiple GPAs.
1927 		 */
1928 		exit_qual = EPT_VIOLATION_ACC_WRITE;
1929 
1930 		/* Only private GPA triggers zero-step mitigation */
1931 		local_retry = true;
1932 	} else {
1933 		exit_qual = vmx_get_exit_qual(vcpu);
1934 		/*
1935 		 * EPT violation due to instruction fetch should never be
1936 		 * triggered from shared memory in TDX guest.  If such EPT
1937 		 * violation occurs, treat it as broken hardware.
1938 		 */
1939 		if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1940 			return -EIO;
1941 	}
1942 
1943 	trace_kvm_page_fault(vcpu, gpa, exit_qual);
1944 
1945 	/*
1946 	 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1947 	 * mapping in TDX.
1948 	 *
1949 	 * KVM may return RET_PF_RETRY for private GPA due to
1950 	 * - contentions when atomically updating SPTEs of the mirror page table
1951 	 * - in-progress GFN invalidation or memslot removal.
1952 	 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1953 	 *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1954 	 *   or certain TDCALLs.
1955 	 *
1956 	 * If TDH.VP.ENTER is invoked more times than the threshold set by the
1957 	 * TDX module before KVM resolves the private GPA mapping, the TDX
1958 	 * module will activate zero-step mitigation during TDH.VP.ENTER. This
1959 	 * process acquires an SEPT tree lock in the TDX module, leading to
1960 	 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1961 	 * operations on other vCPUs.
1962 	 *
1963 	 * Breaking out of local retries for kvm_vcpu_has_events() is for
1964 	 * interrupt injection. kvm_vcpu_has_events() should not see pending
1965 	 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1966 	 * blocked by the TD, false positives are inevitable, i.e., KVM may re-enter
1967 	 * the guest even if the IRQ/NMI can't be delivered.
1968 	 *
1969 	 * Note: even without breaking out of local retries, zero-step
1970 	 * mitigation may still occur due to
1971 	 * - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1972 	 * - a single RIP causing EPT violations for more GFNs than the
1973 	 *   threshold count.
1974 	 * This is safe, as triggering zero-step mitigation only introduces
1975 	 * contentions to page installation SEAMCALLs on other vCPUs, which will
1976 	 * handle retries locally in their EPT violation handlers.
1977 	 */
1978 	while (1) {
1979 		ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
1980 
1981 		if (ret != RET_PF_RETRY || !local_retry)
1982 			break;
1983 
1984 		if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
1985 			break;
1986 
1987 		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
1988 			ret = -EIO;
1989 			break;
1990 		}
1991 
1992 		cond_resched();
1993 	}
1994 	return ret;
1995 }
1996 
1997 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1998 {
1999 	if (err) {
2000 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2001 		return 1;
2002 	}
2003 
2004 	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2005 		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2006 
2007 	return 1;
2008 }
2009 
2010 
2011 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2012 {
2013 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2014 	u64 vp_enter_ret = tdx->vp_enter_ret;
2015 	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2016 
2017 	if (fastpath != EXIT_FASTPATH_NONE)
2018 		return 1;
2019 
2020 	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2021 		KVM_BUG_ON(1, vcpu->kvm);
2022 		return -EIO;
2023 	}
2024 
2025 	/*
2026 	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2027 	 * TDX_SEAMCALL_VMFAILINVALID.
2028 	 */
2029 	if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2030 		KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2031 		goto unhandled_exit;
2032 	}
2033 
2034 	if (unlikely(tdx_failed_vmentry(vcpu))) {
2035 		/*
2036 		 * If the guest state is protected, off-TD debug is not enabled,
2037 		 * so TDX_NON_RECOVERABLE must be set.
2038 		 */
2039 		WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2040 				!(vp_enter_ret & TDX_NON_RECOVERABLE));
2041 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2042 		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2043 		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2044 		return 0;
2045 	}
2046 
2047 	if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2048 		exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2049 		kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2050 		goto unhandled_exit;
2051 	}
2052 
2053 	WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2054 		     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2055 
2056 	switch (exit_reason.basic) {
2057 	case EXIT_REASON_TRIPLE_FAULT:
2058 		vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2059 		vcpu->mmio_needed = 0;
2060 		return 0;
2061 	case EXIT_REASON_EXCEPTION_NMI:
2062 		return tdx_handle_exception_nmi(vcpu);
2063 	case EXIT_REASON_EXTERNAL_INTERRUPT:
2064 		++vcpu->stat.irq_exits;
2065 		return 1;
2066 	case EXIT_REASON_CPUID:
2067 		return tdx_emulate_cpuid(vcpu);
2068 	case EXIT_REASON_HLT:
2069 		return kvm_emulate_halt_noskip(vcpu);
2070 	case EXIT_REASON_TDCALL:
2071 		return handle_tdvmcall(vcpu);
2072 	case EXIT_REASON_VMCALL:
2073 		return tdx_emulate_vmcall(vcpu);
2074 	case EXIT_REASON_IO_INSTRUCTION:
2075 		return tdx_emulate_io(vcpu);
2076 	case EXIT_REASON_MSR_READ:
2077 		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2078 		return kvm_emulate_rdmsr(vcpu);
2079 	case EXIT_REASON_MSR_WRITE:
2080 		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2081 		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2082 		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2083 		return kvm_emulate_wrmsr(vcpu);
2084 	case EXIT_REASON_EPT_MISCONFIG:
2085 		return tdx_emulate_mmio(vcpu);
2086 	case EXIT_REASON_EPT_VIOLATION:
2087 		return tdx_handle_ept_violation(vcpu);
2088 	case EXIT_REASON_OTHER_SMI:
2089 		/*
2090 		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
2091 		 * TD guest vCPU is running) will cause VM exit to TDX module,
2092 		 * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
2093 		 * and handled by kernel handler right away.
2094 		 *
2095 		 * The Other SMI exit can also be caused by the SEAM non-root
2096 		 * machine check delivered via Machine Check System Management
2097 		 * Interrupt (MSMI), but it has already been handled by the
2098 		 * kernel machine check handler, i.e., the memory page has been
2099 		 * marked as poisoned and it won't be freed to the free list
2100 		 * when the TDX guest is terminated (the TDX module marks the
2101 		 * guest as dead and prevents it from running further when a
2102 		 * machine check happens in SEAM non-root).
2103 		 *
2104 		 * - An MSMI will not reach here; it's handled as the non-recoverable
2105 		 *   case above.
2106 		 * - If it's not an MSMI, no need to do anything here.
2107 		 */
2108 		return 1;
2109 	default:
2110 		break;
2111 	}
2112 
2113 unhandled_exit:
2114 	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2115 	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2116 	vcpu->run->internal.ndata = 2;
2117 	vcpu->run->internal.data[0] = vp_enter_ret;
2118 	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2119 	return 0;
2120 }
2121 
2122 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2123 		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2124 {
2125 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2126 
2127 	*reason = tdx->vt.exit_reason.full;
2128 	if (*reason != -1u) {
2129 		*info1 = vmx_get_exit_qual(vcpu);
2130 		*info2 = tdx->ext_exit_qualification;
2131 		*intr_info = vmx_get_intr_info(vcpu);
2132 	} else {
2133 		*info1 = 0;
2134 		*info2 = 0;
2135 		*intr_info = 0;
2136 	}
2137 
2138 	*error_code = 0;
2139 }
2140 
2141 bool tdx_has_emulated_msr(u32 index)
2142 {
2143 	switch (index) {
2144 	case MSR_IA32_UCODE_REV:
2145 	case MSR_IA32_ARCH_CAPABILITIES:
2146 	case MSR_IA32_POWER_CTL:
2147 	case MSR_IA32_CR_PAT:
2148 	case MSR_MTRRcap:
2149 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2150 	case MSR_MTRRdefType:
2151 	case MSR_IA32_TSC_DEADLINE:
2152 	case MSR_IA32_MISC_ENABLE:
2153 	case MSR_PLATFORM_INFO:
2154 	case MSR_MISC_FEATURES_ENABLES:
2155 	case MSR_IA32_APICBASE:
2156 	case MSR_EFER:
2157 	case MSR_IA32_FEAT_CTL:
2158 	case MSR_IA32_MCG_CAP:
2159 	case MSR_IA32_MCG_STATUS:
2160 	case MSR_IA32_MCG_CTL:
2161 	case MSR_IA32_MCG_EXT_CTL:
2162 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2163 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2164 		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2165 	case MSR_KVM_POLL_CONTROL:
2166 		return true;
2167 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2168 		/*
2169 		 * x2APIC registers that are virtualized by the CPU can't be
2170 		 * emulated, as KVM doesn't have access to the virtual APIC page.
2171 		 */
2172 		switch (index) {
2173 		case X2APIC_MSR(APIC_TASKPRI):
2174 		case X2APIC_MSR(APIC_PROCPRI):
2175 		case X2APIC_MSR(APIC_EOI):
2176 		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2177 		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2178 		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2179 			return false;
2180 		default:
2181 			return true;
2182 		}
2183 	default:
2184 		return false;
2185 	}
2186 }
2187 
2188 static bool tdx_is_read_only_msr(u32 index)
2189 {
2190 	return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
2191 		index == MSR_IA32_FEAT_CTL;
2192 }
2193 
2194 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2195 {
2196 	switch (msr->index) {
2197 	case MSR_IA32_FEAT_CTL:
2198 		/*
2199 		 * MCE and MCA are advertised via CPUID. The guest kernel can
2200 		 * check whether LMCE is enabled.
2201 		 */
2202 		msr->data = FEAT_CTL_LOCKED;
2203 		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2204 			msr->data |= FEAT_CTL_LMCE_ENABLED;
2205 		return 0;
2206 	case MSR_IA32_MCG_EXT_CTL:
2207 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2208 			return 1;
2209 		msr->data = vcpu->arch.mcg_ext_ctl;
2210 		return 0;
2211 	default:
2212 		if (!tdx_has_emulated_msr(msr->index))
2213 			return 1;
2214 
2215 		return kvm_get_msr_common(vcpu, msr);
2216 	}
2217 }
2218 
2219 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2220 {
2221 	switch (msr->index) {
2222 	case MSR_IA32_MCG_EXT_CTL:
2223 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2224 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2225 			return 1;
2226 		vcpu->arch.mcg_ext_ctl = msr->data;
2227 		return 0;
2228 	default:
2229 		if (tdx_is_read_only_msr(msr->index))
2230 			return 1;
2231 
2232 		if (!tdx_has_emulated_msr(msr->index))
2233 			return 1;
2234 
2235 		return kvm_set_msr_common(vcpu, msr);
2236 	}
2237 }
2238 
2239 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2240 {
2241 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2242 	struct kvm_tdx_capabilities __user *user_caps;
2243 	struct kvm_tdx_capabilities *caps = NULL;
2244 	int ret = 0;
2245 
2246 	/* flags is reserved for future use */
2247 	if (cmd->flags)
2248 		return -EINVAL;
2249 
2250 	caps = kmalloc(sizeof(*caps) +
2251 		       sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2252 		       GFP_KERNEL);
2253 	if (!caps)
2254 		return -ENOMEM;
2255 
2256 	user_caps = u64_to_user_ptr(cmd->data);
2257 	if (copy_from_user(caps, user_caps, sizeof(*caps))) {
2258 		ret = -EFAULT;
2259 		goto out;
2260 	}
2261 
2262 	if (caps->cpuid.nent < td_conf->num_cpuid_config) {
2263 		ret = -E2BIG;
2264 		goto out;
2265 	}
2266 
2267 	ret = init_kvm_tdx_caps(td_conf, caps);
2268 	if (ret)
2269 		goto out;
2270 
2271 	if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2272 		ret = -EFAULT;
2273 		goto out;
2274 	}
2275 
2276 	if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2277 			 caps->cpuid.nent *
2278 			 sizeof(caps->cpuid.entries[0])))
2279 		ret = -EFAULT;
2280 
2281 out:
2282 	/* kfree() accepts NULL. */
2283 	kfree(caps);
2284 	return ret;
2285 }
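
/*
 * Illustrative sketch (not part of the original file): how userspace is
 * expected to drive the nent handshake above.  It assumes the command
 * reaches tdx_vm_ioctl() via the VM-scoped KVM_MEMORY_ENCRYPT_OP ioctl;
 * vm_fd and nr_entries are placeholders and error handling is elided.
 *
 *	struct kvm_tdx_capabilities *caps;
 *	struct kvm_tdx_cmd cmd = { .id = KVM_TDX_CAPABILITIES };
 *
 *	caps = calloc(1, sizeof(*caps) +
 *			 nr_entries * sizeof(struct kvm_cpuid_entry2));
 *	caps->cpuid.nent = nr_entries;	// must cover num_cpuid_config
 *	cmd.data = (__u64)(unsigned long)caps;
 *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 *	// -E2BIG means nent was too small for the module's CPUID configs.
 */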
2286 
2287 /*
2288  * KVM reports guest physical address bits in CPUID.0x80000008.EAX[23:16], which is
2289  * similar to TDX's GPAW. Use this field as the interface for userspace to
2290  * configure the GPAW and EPT level for TDs.
2291  *
2292  * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2293  * 5; value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2294  * supported. Value 52 is only supported when the platform supports 5 level
2295  * EPT.
2296  */
2297 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2298 					struct td_params *td_params)
2299 {
2300 	const struct kvm_cpuid_entry2 *entry;
2301 	int guest_pa;
2302 
2303 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2304 	if (!entry)
2305 		return -EINVAL;
2306 
2307 	guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2308 
2309 	if (guest_pa != 48 && guest_pa != 52)
2310 		return -EINVAL;
2311 
2312 	if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2313 		return -EINVAL;
2314 
2315 	td_params->eptp_controls = VMX_EPTP_MT_WB;
2316 	if (guest_pa == 52) {
2317 		td_params->eptp_controls |= VMX_EPTP_PWL_5;
2318 		td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2319 	} else {
2320 		td_params->eptp_controls |= VMX_EPTP_PWL_4;
2321 	}
2322 
2323 	return 0;
2324 }
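
/*
 * Illustrative sketch (not part of the original file): the CPUID entry a
 * VMM could place in the KVM_TDX_INIT_VM cpuid list to request GPAW-52 and
 * 5-level EPT.  Bits 23:16 of EAX carry the guest physical address width,
 * matching tdx_get_guest_phys_addr_bits(); the other EAX bits are elided.
 *
 *	struct kvm_cpuid_entry2 gpaw_entry = {
 *		.function = 0x80000008,
 *		.eax = 52 << 16,	// guest MAXPHYADDR = 52
 *	};
 */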
2325 
2326 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2327 				 struct td_params *td_params)
2328 {
2329 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2330 	const struct kvm_cpuid_entry2 *entry;
2331 	struct tdx_cpuid_value *value;
2332 	int i, copy_cnt = 0;
2333 
2334 	/*
2335 	 * td_params.cpuid_values: The number and the order of cpuid_value entries
2336 	 * must match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2337 	 * It's assumed that td_params has been zeroed.
2338 	 */
2339 	for (i = 0; i < td_conf->num_cpuid_config; i++) {
2340 		struct kvm_cpuid_entry2 tmp;
2341 
2342 		td_init_cpuid_entry2(&tmp, i);
2343 
2344 		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2345 					      tmp.function, tmp.index);
2346 		if (!entry)
2347 			continue;
2348 
2349 		if (tdx_unsupported_cpuid(entry))
2350 			return -EINVAL;
2351 
2352 		copy_cnt++;
2353 
2354 		value = &td_params->cpuid_values[i];
2355 		value->eax = entry->eax;
2356 		value->ebx = entry->ebx;
2357 		value->ecx = entry->ecx;
2358 		value->edx = entry->edx;
2359 
2360 		/*
2361 		 * TDX module does not accept nonzero bits 16..23 for the
2362 		 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2363 		 */
2364 		if (tmp.function == 0x80000008)
2365 			value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2366 	}
2367 
2368 	/*
2369 	 * Rely on the TDX module to reject invalid configuration, but it can't
2370 	 * check leaves that don't have a proper slot in td_params->cpuid_values
2371 	 * to stick them in. So fail if there were entries that didn't get copied
2372 	 * to td_params.
2373 	 */
2374 	if (copy_cnt != cpuid->nent)
2375 		return -EINVAL;
2376 
2377 	return 0;
2378 }
2379 
2380 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2381 			struct kvm_tdx_init_vm *init_vm)
2382 {
2383 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2384 	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2385 	int ret;
2386 
2387 	if (kvm->created_vcpus)
2388 		return -EBUSY;
2389 
2390 	if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2391 		return -EINVAL;
2392 
2393 	if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2394 		return -EINVAL;
2395 
2396 	td_params->max_vcpus = kvm->max_vcpus;
2397 	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2398 	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2399 
2400 	td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2401 	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2402 
2403 	ret = setup_tdparams_eptp_controls(cpuid, td_params);
2404 	if (ret)
2405 		return ret;
2406 
2407 	ret = setup_tdparams_cpuids(cpuid, td_params);
2408 	if (ret)
2409 		return ret;
2410 
2411 #define MEMCPY_SAME_SIZE(dst, src)				\
2412 	do {							\
2413 		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
2414 		memcpy((dst), (src), sizeof(dst));		\
2415 	} while (0)
2416 
2417 	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2418 	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2419 	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2420 
2421 	return 0;
2422 }
2423 
2424 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2425 			 u64 *seamcall_err)
2426 {
2427 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2428 	cpumask_var_t packages;
2429 	struct page **tdcs_pages = NULL;
2430 	struct page *tdr_page;
2431 	int ret, i;
2432 	u64 err, rcx;
2433 
2434 	*seamcall_err = 0;
2435 	ret = tdx_guest_keyid_alloc();
2436 	if (ret < 0)
2437 		return ret;
2438 	kvm_tdx->hkid = ret;
2439 	kvm_tdx->misc_cg = get_current_misc_cg();
2440 	ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2441 	if (ret)
2442 		goto free_hkid;
2443 
2444 	ret = -ENOMEM;
2445 
2446 	atomic_inc(&nr_configured_hkid);
2447 
2448 	tdr_page = alloc_page(GFP_KERNEL);
2449 	if (!tdr_page)
2450 		goto free_hkid;
2451 
2452 	kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2453 	/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2454 	kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2455 	tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2456 			     GFP_KERNEL | __GFP_ZERO);
2457 	if (!tdcs_pages)
2458 		goto free_tdr;
2459 
2460 	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2461 		tdcs_pages[i] = alloc_page(GFP_KERNEL);
2462 		if (!tdcs_pages[i])
2463 			goto free_tdcs;
2464 	}
2465 
2466 	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2467 		goto free_tdcs;
2468 
2469 	cpus_read_lock();
2470 
2471 	/*
2472 	 * At least one CPU of each package needs to be online in order to
2473 	 * program the host key id on all packages.  Check it.
2474 	 */
2475 	for_each_present_cpu(i)
2476 		cpumask_set_cpu(topology_physical_package_id(i), packages);
2477 	for_each_online_cpu(i)
2478 		cpumask_clear_cpu(topology_physical_package_id(i), packages);
2479 	if (!cpumask_empty(packages)) {
2480 		ret = -EIO;
2481 		/*
2482 		 * Because it's hard for a human operator to figure out the
2483 		 * reason, print a warning.
2484 		 */
2485 #define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
2486 		pr_warn_ratelimited(MSG_ALLPKG);
2487 		goto free_packages;
2488 	}
2489 
2490 	/*
2491 	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
2492 	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
2493 	 * lock to prevent it from failure.
2494 	 */
2495 	mutex_lock(&tdx_lock);
2496 	kvm_tdx->td.tdr_page = tdr_page;
2497 	err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2498 	mutex_unlock(&tdx_lock);
2499 
2500 	if (err == TDX_RND_NO_ENTROPY) {
2501 		ret = -EAGAIN;
2502 		goto free_packages;
2503 	}
2504 
2505 	if (WARN_ON_ONCE(err)) {
2506 		pr_tdx_error(TDH_MNG_CREATE, err);
2507 		ret = -EIO;
2508 		goto free_packages;
2509 	}
2510 
2511 	for_each_online_cpu(i) {
2512 		int pkg = topology_physical_package_id(i);
2513 
2514 		if (cpumask_test_and_set_cpu(pkg, packages))
2515 			continue;
2516 
2517 		/*
2518 		 * Program the memory controller in the package with an
2519 		 * encryption key associated with the TDX private host key id
2520 		 * assigned to this TDR.  Concurrent operations on the same memory
2521 		 * controller result in TDX_OPERAND_BUSY. No locking is needed
2522 		 * beyond the cpus_read_lock() above as it serializes against
2523 		 * hotplug and the first online CPU of the package is always
2524 		 * used. We never have two CPUs in the same socket trying to
2525 		 * program the key.
2526 		 */
2527 		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2528 				      kvm_tdx, true);
2529 		if (ret)
2530 			break;
2531 	}
2532 	cpus_read_unlock();
2533 	free_cpumask_var(packages);
2534 	if (ret) {
2535 		i = 0;
2536 		goto teardown;
2537 	}
2538 
2539 	kvm_tdx->td.tdcs_pages = tdcs_pages;
2540 	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2541 		err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2542 		if (err == TDX_RND_NO_ENTROPY) {
2543 			/* Here it's hard to allow userspace to retry. */
2544 			ret = -EAGAIN;
2545 			goto teardown;
2546 		}
2547 		if (WARN_ON_ONCE(err)) {
2548 			pr_tdx_error(TDH_MNG_ADDCX, err);
2549 			ret = -EIO;
2550 			goto teardown;
2551 		}
2552 	}
2553 
2554 	err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2555 	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2556 		/*
2557 		 * Because a user gives operands, don't warn.
2558 		 * Return a hint to the user because it's sometimes hard for the
2559 		 * user to figure out which operand is invalid.  SEAMCALL status
2560 		 * code includes which operand caused invalid operand error.
2561 		 */
2562 		*seamcall_err = err;
2563 		ret = -EINVAL;
2564 		goto teardown;
2565 	} else if (WARN_ON_ONCE(err)) {
2566 		pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2567 		ret = -EIO;
2568 		goto teardown;
2569 	}
2570 
2571 	return 0;
2572 
2573 	/*
2574 	 * The sequence for freeing resources from a partially initialized TD
2575 	 * varies based on where in the initialization flow failure occurred.
2576 	 * Simply use the full teardown and destroy path, which naturally plays nice
2577 	 * with partial initialization.
2578 	 */
2579 teardown:
2580 	/* Only free pages not yet added, so start at 'i' */
2581 	for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2582 		if (tdcs_pages[i]) {
2583 			__free_page(tdcs_pages[i]);
2584 			tdcs_pages[i] = NULL;
2585 		}
2586 	}
2587 	if (!kvm_tdx->td.tdcs_pages)
2588 		kfree(tdcs_pages);
2589 
2590 	tdx_mmu_release_hkid(kvm);
2591 	tdx_reclaim_td_control_pages(kvm);
2592 
2593 	return ret;
2594 
2595 free_packages:
2596 	cpus_read_unlock();
2597 	free_cpumask_var(packages);
2598 
2599 free_tdcs:
2600 	for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2601 		if (tdcs_pages[i])
2602 			__free_page(tdcs_pages[i]);
2603 	}
2604 	kfree(tdcs_pages);
2605 	kvm_tdx->td.tdcs_pages = NULL;
2606 
2607 free_tdr:
2608 	if (tdr_page)
2609 		__free_page(tdr_page);
2610 	kvm_tdx->td.tdr_page = 0;
2611 
2612 free_hkid:
2613 	tdx_hkid_free(kvm_tdx);
2614 
2615 	return ret;
2616 }
2617 
2618 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2619 				      u64 *data)
2620 {
2621 	u64 err;
2622 
2623 	err = tdh_mng_rd(&tdx->td, field_id, data);
2624 
2625 	return err;
2626 }
2627 
2628 #define TDX_MD_UNREADABLE_LEAF_MASK	GENMASK(30, 7)
2629 #define TDX_MD_UNREADABLE_SUBLEAF_MASK	GENMASK(31, 7)
2630 
2631 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2632 			  bool sub_leaf_set, int *entry_index,
2633 			  struct kvm_cpuid_entry2 *out)
2634 {
2635 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2636 	u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2637 	u64 ebx_eax, edx_ecx;
2638 	u64 err = 0;
2639 
2640 	if (sub_leaf > 0b1111111)
2641 		return -EINVAL;
2642 
2643 	if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2644 		return -EINVAL;
2645 
2646 	if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2647 	    sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2648 		return -EINVAL;
2649 
2650 	/*
2651 	 * bit 23:17, RESERVED: reserved, must be 0;
2652 	 * bit 16,    LEAF_31: leaf number bit 31;
2653 	 * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2654 	 *                      implicitly 0;
2655 	 * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
2656 	 * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2657 	 *                         SUBLEAF_6_0 must be all-1s.
2658 	 *                         sub-leaf bits 31:7 are implicitly 0;
2659 	 * bit 0,     ELEMENT_I: Element index within field;
2660 	 */
2661 	field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2662 	field_id |= (leaf & 0x7f) << 9;
2663 	if (sub_leaf_set)
2664 		field_id |= (sub_leaf & 0x7f) << 1;
2665 	else
2666 		field_id |= 0x1fe;
2667 
2668 	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2669 	if (err) //TODO check for specific errors
2670 		goto err_out;
2671 
2672 	out->eax = (u32) ebx_eax;
2673 	out->ebx = (u32) (ebx_eax >> 32);
2674 
2675 	field_id++;
2676 	err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2677 	/*
2678 	 * It's weird that reading edx_ecx fails while reading ebx_eax
2679 	 * succeeded.
2680 	 */
2681 	if (WARN_ON_ONCE(err))
2682 		goto err_out;
2683 
2684 	out->ecx = (u32) edx_ecx;
2685 	out->edx = (u32) (edx_ecx >> 32);
2686 
2687 	out->function = leaf;
2688 	out->index = sub_leaf;
2689 	out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2690 
2691 	/*
2692 	 * Work around missing support on old TDX modules, fetch
2693 	 * guest maxpa from gfn_direct_bits.
2694 	 */
2695 	if (leaf == 0x80000008) {
2696 		gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2697 		unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2698 
2699 		out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2700 	}
2701 
2702 	(*entry_index)++;
2703 
2704 	return 0;
2705 
2706 err_out:
2707 	out->eax = 0;
2708 	out->ebx = 0;
2709 	out->ecx = 0;
2710 	out->edx = 0;
2711 
2712 	return -EIO;
2713 }
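
/*
 * Worked example (not part of the original file) of the field_id encoding
 * implemented in tdx_read_cpuid() above, for leaf 0x80000008 read without a
 * subleaf:
 *
 *	LEAF_31      (bit 16)    = 1       because bit 31 of the leaf is set
 *	LEAF_6_0     (bits 15:9) = 0x08    the low 7 bits of the leaf
 *	SUBLEAF_NA + SUBLEAF_6_0 (bits 8:1) = all-1s, i.e. field_id |= 0x1fe
 *	ELEMENT_I    (bit 0)     = 0       selects the EBX:EAX element
 *
 * so the low bits of field_id are (1 << 16) | (0x08 << 9) | 0x1fe = 0x111fe,
 * and field_id + 1 (ELEMENT_I = 1) then selects the EDX:ECX element.
 */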
2714 
2715 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2716 {
2717 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2718 	struct kvm_tdx_init_vm *init_vm;
2719 	struct td_params *td_params = NULL;
2720 	int ret;
2721 
2722 	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2723 	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2724 
2725 	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2726 		return -EINVAL;
2727 
2728 	if (cmd->flags)
2729 		return -EINVAL;
2730 
2731 	init_vm = kmalloc(sizeof(*init_vm) +
2732 			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2733 			  GFP_KERNEL);
2734 	if (!init_vm)
2735 		return -ENOMEM;
2736 
2737 	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2738 		ret = -EFAULT;
2739 		goto out;
2740 	}
2741 
2742 	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2743 		ret = -E2BIG;
2744 		goto out;
2745 	}
2746 
2747 	if (copy_from_user(init_vm->cpuid.entries,
2748 			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2749 			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2750 		ret = -EFAULT;
2751 		goto out;
2752 	}
2753 
2754 	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2755 		ret = -EINVAL;
2756 		goto out;
2757 	}
2758 
2759 	if (init_vm->cpuid.padding) {
2760 		ret = -EINVAL;
2761 		goto out;
2762 	}
2763 
2764 	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2765 	if (!td_params) {
2766 		ret = -ENOMEM;
2767 		goto out;
2768 	}
2769 
2770 	ret = setup_tdparams(kvm, td_params, init_vm);
2771 	if (ret)
2772 		goto out;
2773 
2774 	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2775 	if (ret)
2776 		goto out;
2777 
2778 	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2779 	kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2780 	kvm_tdx->attributes = td_params->attributes;
2781 	kvm_tdx->xfam = td_params->xfam;
2782 
2783 	if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2784 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2785 	else
2786 		kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2787 
2788 	kvm_tdx->state = TD_STATE_INITIALIZED;
2789 out:
2790 	/* kfree() accepts NULL. */
2791 	kfree(init_vm);
2792 	kfree(td_params);
2793 
2794 	return ret;
2795 }
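
/*
 * Illustrative sketch (not part of the original file): the userspace buffer
 * that the two copy_from_user() calls in tdx_td_init() expect at cmd->data,
 * i.e. a struct kvm_tdx_init_vm immediately followed by its CPUID entries.
 * It assumes the command is issued via the VM-scoped KVM_MEMORY_ENCRYPT_OP
 * ioctl; vm_fd, nent and entries are placeholders, and the exact uapi
 * layout is inferred from the kernel-side accesses above.
 *
 *	size_t sz = sizeof(struct kvm_tdx_init_vm) +
 *		    nent * sizeof(struct kvm_cpuid_entry2);
 *	struct kvm_tdx_init_vm *init_vm = calloc(1, sz);
 *	struct kvm_tdx_cmd cmd = { .id = KVM_TDX_INIT_VM };
 *
 *	init_vm->cpuid.nent = nent;
 *	memcpy(init_vm->cpuid.entries, entries,
 *	       nent * sizeof(struct kvm_cpuid_entry2));
 *	cmd.data = (__u64)(unsigned long)init_vm;
 *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 *	// On -EINVAL, cmd.hw_error may hold the SEAMCALL status from
 *	// TDH.MNG.INIT hinting at which operand was rejected.
 */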
2796 
2797 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2798 {
2799 	/*
2800 	 * flush_tlb_current() is invoked the first time the vCPU runs or when
2801 	 * the root of the shared EPT is invalidated.
2802 	 * KVM only needs to flush the shared EPT because the TDX module handles
2803 	 * TLB invalidation for the private EPT in tdh_vp_enter().
2804 	 *
2805 	 * A single context invalidation for shared EPT can be performed here.
2806 	 * However, this single context invalidation requires the private EPTP
2807 	 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
2808 	 * private EPTP as its ASID for TLB invalidation.
2809 	 *
2810 	 * To avoid reading back private EPTP, perform a global invalidation for
2811 	 * shared EPT instead to keep this function simple.
2812 	 */
2813 	ept_sync_global();
2814 }
2815 
2816 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2817 {
2818 	/*
2819 	 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2820 	 * ensure that private EPT will be flushed on the next TD enter. No need
2821 	 * to call tdx_track() here again even when this callback is a result of
2822 	 * zapping private EPT.
2823 	 *
2824 	 * Due to the lack of the context to determine which EPT has been
2825 	 * affected by zapping, invoke invept() directly here for both shared
2826 	 * EPT and private EPT for simplicity, though it's not necessary for
2827 	 * private EPT.
2828 	 */
2829 	ept_sync_global();
2830 }
2831 
2832 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2833 {
2834 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2835 
2836 	guard(mutex)(&kvm->slots_lock);
2837 
2838 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2839 		return -EINVAL;
2840 	/*
2841 	 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2842 	 * TDH.MEM.PAGE.ADD().
2843 	 */
2844 	if (atomic64_read(&kvm_tdx->nr_premapped))
2845 		return -EINVAL;
2846 
2847 	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2848 	if (tdx_operand_busy(cmd->hw_error))
2849 		return -EBUSY;
2850 	if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2851 		pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2852 		return -EIO;
2853 	}
2854 
2855 	kvm_tdx->state = TD_STATE_RUNNABLE;
2856 	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2857 	smp_wmb();
2858 	kvm->arch.pre_fault_allowed = true;
2859 	return 0;
2860 }
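
/*
 * Illustrative sketch (not part of the original file): the store/load
 * ordering that pairs tdx_td_finalize() above with the smp_rmb() in
 * tdx_sept_set_private_spte().
 *
 *	writer (tdx_td_finalize)             reader (fault path)
 *	----------------------------------   ---------------------------------
 *	state = TD_STATE_RUNNABLE;           r1 = kvm->arch.pre_fault_allowed;
 *	smp_wmb();                           smp_rmb();
 *	kvm->arch.pre_fault_allowed = true;  r2 = kvm_tdx->state;
 *
 * If the reader observes r1 == true, it is guaranteed to also observe
 * r2 == TD_STATE_RUNNABLE, so a pre-fault can never be accounted as a
 * pre-mapped page after the TD has been finalized.
 */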
2861 
2862 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2863 {
2864 	struct kvm_tdx_cmd tdx_cmd;
2865 	int r;
2866 
2867 	if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2868 		return -EFAULT;
2869 
2870 	/*
2871 	 * Userspace should never set hw_error. It is used to fill
2872 	 * hardware-defined error by the kernel.
2873 	 */
2874 	if (tdx_cmd.hw_error)
2875 		return -EINVAL;
2876 
2877 	mutex_lock(&kvm->lock);
2878 
2879 	switch (tdx_cmd.id) {
2880 	case KVM_TDX_CAPABILITIES:
2881 		r = tdx_get_capabilities(&tdx_cmd);
2882 		break;
2883 	case KVM_TDX_INIT_VM:
2884 		r = tdx_td_init(kvm, &tdx_cmd);
2885 		break;
2886 	case KVM_TDX_FINALIZE_VM:
2887 		r = tdx_td_finalize(kvm, &tdx_cmd);
2888 		break;
2889 	default:
2890 		r = -EINVAL;
2891 		goto out;
2892 	}
2893 
2894 	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2895 		r = -EFAULT;
2896 
2897 out:
2898 	mutex_unlock(&kvm->lock);
2899 	return r;
2900 }
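
/*
 * Illustrative sketch (not part of the original file): the order in which
 * userspace is expected to issue the commands handled above and in
 * tdx_vcpu_ioctl() below, assuming both are reached via the
 * KVM_MEMORY_ENCRYPT_OP ioctl on the VM and vCPU fds respectively.
 * tdx_ioctl() is a hypothetical wrapper that fills struct kvm_tdx_cmd and
 * calls ioctl(fd, KVM_MEMORY_ENCRYPT_OP, &cmd).
 *
 *	tdx_ioctl(vm_fd,   KVM_TDX_CAPABILITIES,    caps);
 *	tdx_ioctl(vm_fd,   KVM_TDX_INIT_VM,         init_vm);
 *	// KVM_CREATE_VCPU for each vCPU ...
 *	tdx_ioctl(vcpu_fd, KVM_TDX_INIT_VCPU,       initial_rcx);
 *	tdx_ioctl(vcpu_fd, KVM_TDX_INIT_MEM_REGION, &region);
 *	tdx_ioctl(vm_fd,   KVM_TDX_FINALIZE_VM,     0);
 */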
2901 
2902 /* The VMM can pass 64 bits of auxiliary data to the vCPU via RCX for the guest BIOS. */
2903 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2904 {
2905 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2906 	struct vcpu_tdx *tdx = to_tdx(vcpu);
2907 	struct page *page;
2908 	int ret, i;
2909 	u64 err;
2910 
2911 	page = alloc_page(GFP_KERNEL);
2912 	if (!page)
2913 		return -ENOMEM;
2914 	tdx->vp.tdvpr_page = page;
2915 
2916 	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2917 			       	     GFP_KERNEL);
2918 	if (!tdx->vp.tdcx_pages) {
2919 		ret = -ENOMEM;
2920 		goto free_tdvpr;
2921 	}
2922 
2923 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2924 		page = alloc_page(GFP_KERNEL);
2925 		if (!page) {
2926 			ret = -ENOMEM;
2927 			goto free_tdcx;
2928 		}
2929 		tdx->vp.tdcx_pages[i] = page;
2930 	}
2931 
2932 	err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2933 	if (KVM_BUG_ON(err, vcpu->kvm)) {
2934 		ret = -EIO;
2935 		pr_tdx_error(TDH_VP_CREATE, err);
2936 		goto free_tdcx;
2937 	}
2938 
2939 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2940 		err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2941 		if (KVM_BUG_ON(err, vcpu->kvm)) {
2942 			pr_tdx_error(TDH_VP_ADDCX, err);
2943 			/*
2944 			 * Pages already added are reclaimed by the vcpu_free
2945 			 * method, but the rest are freed here.
2946 			 */
2947 			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2948 				__free_page(tdx->vp.tdcx_pages[i]);
2949 				tdx->vp.tdcx_pages[i] = NULL;
2950 			}
2951 			return -EIO;
2952 		}
2953 	}
2954 
2955 	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2956 	if (KVM_BUG_ON(err, vcpu->kvm)) {
2957 		pr_tdx_error(TDH_VP_INIT, err);
2958 		return -EIO;
2959 	}
2960 
2961 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2962 
2963 	return 0;
2964 
2965 free_tdcx:
2966 	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2967 		if (tdx->vp.tdcx_pages[i])
2968 			__free_page(tdx->vp.tdcx_pages[i]);
2969 		tdx->vp.tdcx_pages[i] = NULL;
2970 	}
2971 	kfree(tdx->vp.tdcx_pages);
2972 	tdx->vp.tdcx_pages = NULL;
2973 
2974 free_tdvpr:
2975 	if (tdx->vp.tdvpr_page)
2976 		__free_page(tdx->vp.tdvpr_page);
2977 	tdx->vp.tdvpr_page = 0;
2978 
2979 	return ret;
2980 }
2981 
2982 /* Sometimes reads multiple subleafs. Return how many entries were written. */
2983 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2984 				   struct kvm_cpuid_entry2 *output_e)
2985 {
2986 	int sub_leaf = 0;
2987 	int ret;
2988 
2989 	/* First try without a subleaf */
2990 	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2991 
2992 	/* If success, or invalid leaf, just give up */
2993 	if (ret != -EIO)
2994 		return ret;
2995 
2996 	/*
2997 	 * If the try without a subleaf failed, try reading subleafs until
2998 	 * failure. The TDX module only supports 6 bits of subleaf index.
2999 	 */
3000 	while (1) {
3001 		/* Keep reading subleafs until there is a failure. */
3002 		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3003 			return !sub_leaf;
3004 
3005 		sub_leaf++;
3006 		output_e++;
3007 	}
3008 
3009 	return 0;
3010 }
3011 
3012 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3013 {
3014 	struct kvm_cpuid2 __user *output, *td_cpuid;
3015 	int r = 0, i = 0, leaf;
3016 	u32 level;
3017 
3018 	output = u64_to_user_ptr(cmd->data);
3019 	td_cpuid = kzalloc(sizeof(*td_cpuid) +
3020 			sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3021 			GFP_KERNEL);
3022 	if (!td_cpuid)
3023 		return -ENOMEM;
3024 
3025 	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3026 		r = -EFAULT;
3027 		goto out;
3028 	}
3029 
3030 	/* Read max CPUID for normal range */
3031 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3032 		r = -EIO;
3033 		goto out;
3034 	}
3035 	level = td_cpuid->entries[0].eax;
3036 
3037 	for (leaf = 1; leaf <= level; leaf++)
3038 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3039 
3040 	/* Read max CPUID for extended range */
3041 	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3042 		r = -EIO;
3043 		goto out;
3044 	}
3045 	level = td_cpuid->entries[i - 1].eax;
3046 
3047 	for (leaf = 0x80000001; leaf <= level; leaf++)
3048 		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3049 
3050 	if (td_cpuid->nent < i)
3051 		r = -E2BIG;
3052 	td_cpuid->nent = i;
3053 
3054 	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3055 		r = -EFAULT;
3056 		goto out;
3057 	}
3058 
3059 	if (r == -E2BIG)
3060 		goto out;
3061 
3062 	if (copy_to_user(output->entries, td_cpuid->entries,
3063 			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3064 		r = -EFAULT;
3065 
3066 out:
3067 	kfree(td_cpuid);
3068 
3069 	return r;
3070 }
3071 
3072 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3073 {
3074 	u64 apic_base;
3075 	struct vcpu_tdx *tdx = to_tdx(vcpu);
3076 	int ret;
3077 
3078 	if (cmd->flags)
3079 		return -EINVAL;
3080 
3081 	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3082 		return -EINVAL;
3083 
3084 	/*
3085 	 * TDX requires X2APIC, userspace is responsible for configuring guest
3086 	 * CPUID accordingly.
3087 	 */
3088 	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3089 		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3090 	if (kvm_apic_set_base(vcpu, apic_base, true))
3091 		return -EINVAL;
3092 
3093 	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3094 	if (ret)
3095 		return ret;
3096 
3097 	td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3098 	td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3099 	td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3100 
3101 	tdx->state = VCPU_TD_STATE_INITIALIZED;
3102 
3103 	return 0;
3104 }
3105 
3106 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3107 {
3108 	/*
3109 	 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3110 	 * INIT events.
3111 	 *
3112 	 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3113 	 * userspace needs to define the vCPU model before KVM can initialize
3114 	 * vCPU state, e.g. to enable x2APIC.
3115 	 */
3116 	WARN_ON_ONCE(init_event);
3117 }
3118 
3119 struct tdx_gmem_post_populate_arg {
3120 	struct kvm_vcpu *vcpu;
3121 	__u32 flags;
3122 };
3123 
3124 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3125 				  void __user *src, int order, void *_arg)
3126 {
3127 	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3128 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3129 	struct tdx_gmem_post_populate_arg *arg = _arg;
3130 	struct kvm_vcpu *vcpu = arg->vcpu;
3131 	gpa_t gpa = gfn_to_gpa(gfn);
3132 	u8 level = PG_LEVEL_4K;
3133 	struct page *src_page;
3134 	int ret, i;
3135 	u64 err, entry, level_state;
3136 
3137 	/*
3138 	 * Get the source page if it has been faulted in. Return failure if the
3139 	 * source page has been swapped out or unmapped in primary memory.
3140 	 */
3141 	ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3142 	if (ret < 0)
3143 		return ret;
3144 	if (ret != 1)
3145 		return -ENOMEM;
3146 
3147 	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3148 	if (ret < 0)
3149 		goto out;
3150 
3151 	/*
3152 	 * The private mem cannot be zapped after kvm_tdp_map_page()
3153 	 * because all paths are covered by slots_lock and the
3154 	 * filemap invalidate lock.  Check that they are indeed enough.
3155 	 */
3156 	if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3157 		scoped_guard(read_lock, &kvm->mmu_lock) {
3158 			if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3159 				ret = -EIO;
3160 				goto out;
3161 			}
3162 		}
3163 	}
3164 
3165 	ret = 0;
3166 	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3167 			       src_page, &entry, &level_state);
3168 	if (err) {
3169 		ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3170 		goto out;
3171 	}
3172 
3173 	if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3174 		atomic64_dec(&kvm_tdx->nr_premapped);
3175 
3176 	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
3177 		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3178 			err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3179 					    &level_state);
3180 			if (err) {
3181 				ret = -EIO;
3182 				break;
3183 			}
3184 		}
3185 	}
3186 
3187 out:
3188 	put_page(src_page);
3189 	return ret;
3190 }
3191 
3192 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3193 {
3194 	struct vcpu_tdx *tdx = to_tdx(vcpu);
3195 	struct kvm *kvm = vcpu->kvm;
3196 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3197 	struct kvm_tdx_init_mem_region region;
3198 	struct tdx_gmem_post_populate_arg arg;
3199 	long gmem_ret;
3200 	int ret;
3201 
3202 	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3203 		return -EINVAL;
3204 
3205 	guard(mutex)(&kvm->slots_lock);
3206 
3207 	/* Once TD is finalized, the initial guest memory is fixed. */
3208 	if (kvm_tdx->state == TD_STATE_RUNNABLE)
3209 		return -EINVAL;
3210 
3211 	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3212 		return -EINVAL;
3213 
3214 	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3215 		return -EFAULT;
3216 
3217 	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3218 	    !region.nr_pages ||
3219 	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3220 	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3221 	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3222 		return -EINVAL;
3223 
3224 	kvm_mmu_reload(vcpu);
3225 	ret = 0;
3226 	while (region.nr_pages) {
3227 		if (signal_pending(current)) {
3228 			ret = -EINTR;
3229 			break;
3230 		}
3231 
3232 		arg = (struct tdx_gmem_post_populate_arg) {
3233 			.vcpu = vcpu,
3234 			.flags = cmd->flags,
3235 		};
3236 		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3237 					     u64_to_user_ptr(region.source_addr),
3238 					     1, tdx_gmem_post_populate, &arg);
3239 		if (gmem_ret < 0) {
3240 			ret = gmem_ret;
3241 			break;
3242 		}
3243 
3244 		if (gmem_ret != 1) {
3245 			ret = -EIO;
3246 			break;
3247 		}
3248 
3249 		region.source_addr += PAGE_SIZE;
3250 		region.gpa += PAGE_SIZE;
3251 		region.nr_pages--;
3252 
3253 		cond_resched();
3254 	}
3255 
3256 	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3257 		ret = -EFAULT;
3258 	return ret;
3259 }
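
/*
 * Illustrative sketch (not part of the original file): filling the region
 * descriptor consumed by tdx_vcpu_init_mem_region() above.  Field names
 * follow the accesses in the code; the GPA and flags are example values and
 * the command is assumed to be issued via the vCPU-scoped
 * KVM_MEMORY_ENCRYPT_OP ioctl.
 *
 *	struct kvm_tdx_init_mem_region region = {
 *		.source_addr = (__u64)(unsigned long)payload, // page aligned
 *		.gpa         = 0x80000000,                    // private GPA
 *		.nr_pages    = payload_size / PAGE_SIZE,
 *	};
 *	struct kvm_tdx_cmd cmd = {
 *		.id    = KVM_TDX_INIT_MEM_REGION,
 *		.flags = KVM_TDX_MEASURE_MEMORY_REGION,	// extend MRTD as well
 *		.data  = (__u64)(unsigned long)&region,
 *	};
 *	ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 */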
3260 
3261 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3262 {
3263 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3264 	struct kvm_tdx_cmd cmd;
3265 	int ret;
3266 
3267 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3268 		return -EINVAL;
3269 
3270 	if (copy_from_user(&cmd, argp, sizeof(cmd)))
3271 		return -EFAULT;
3272 
3273 	if (cmd.hw_error)
3274 		return -EINVAL;
3275 
3276 	switch (cmd.id) {
3277 	case KVM_TDX_INIT_VCPU:
3278 		ret = tdx_vcpu_init(vcpu, &cmd);
3279 		break;
3280 	case KVM_TDX_INIT_MEM_REGION:
3281 		ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3282 		break;
3283 	case KVM_TDX_GET_CPUID:
3284 		ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3285 		break;
3286 	default:
3287 		ret = -EINVAL;
3288 		break;
3289 	}
3290 
3291 	return ret;
3292 }
3293 
3294 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
3295 {
3296 	return PG_LEVEL_4K;
3297 }
3298 
3299 static int tdx_online_cpu(unsigned int cpu)
3300 {
3301 	unsigned long flags;
3302 	int r;
3303 
3304 	/* Sanity check CPU is already in post-VMXON */
3305 	WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3306 
3307 	local_irq_save(flags);
3308 	r = tdx_cpu_enable();
3309 	local_irq_restore(flags);
3310 
3311 	return r;
3312 }
3313 
3314 static int tdx_offline_cpu(unsigned int cpu)
3315 {
3316 	int i;
3317 
3318 	/* No TD is running.  Allow any cpu to be offline. */
3319 	if (!atomic_read(&nr_configured_hkid))
3320 		return 0;
3321 
3322 	/*
3323 	 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD), KVM needs
3324 	 * to call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
3325 	 * controllers with PCONFIG.  If there is an active TDX HKID, refuse to
3326 	 * offline the last online cpu of a package.
3327 	 */
3328 	for_each_online_cpu(i) {
3329 		/*
3330 		 * Found another online cpu on the same package.
3331 		 * Allow to offline.
3332 		 */
3333 		if (i != cpu && topology_physical_package_id(i) ==
3334 				topology_physical_package_id(cpu))
3335 			return 0;
3336 	}
3337 
3338 	/*
3339 	 * This is the last cpu of this package.  Don't offline it.
3340 	 *
3341 	 * Because it's hard for a human operator to understand the
3342 	 * reason, print a warning.
3343 	 */
3344 #define MSG_ALLPKG_ONLINE \
3345 	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3346 	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3347 	return -EBUSY;
3348 }
3349 
3350 static void __do_tdx_cleanup(void)
3351 {
3352 	/*
3353 	 * Once the TDX module is initialized, it cannot be disabled and
3354 	 * re-initialized without a runtime update (which the kernel doesn't
3355 	 * support).  Only the cpuhp state needs to be removed here.  The
3356 	 * TDX host core code tracks the TDX status and can handle the
3357 	 * 'multiple enabling' scenario.
3358 	 */
3359 	WARN_ON_ONCE(!tdx_cpuhp_state);
3360 	cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3361 	tdx_cpuhp_state = 0;
3362 }
3363 
3364 static void __tdx_cleanup(void)
3365 {
3366 	cpus_read_lock();
3367 	__do_tdx_cleanup();
3368 	cpus_read_unlock();
3369 }
3370 
3371 static int __init __do_tdx_bringup(void)
3372 {
3373 	int r;
3374 
3375 	/*
3376 	 * Register a TDX-specific cpuhp callback to call tdx_cpu_enable()
3377 	 * on all online CPUs before calling tdx_enable(), and on any CPU
3378 	 * that comes online later, to make sure it is ready to run TDX guests.
3379 	 */
3380 	r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3381 					 "kvm/cpu/tdx:online",
3382 					 tdx_online_cpu, tdx_offline_cpu);
3383 	if (r < 0)
3384 		return r;
3385 
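	/*
	 * For CPUHP_AP_ONLINE_DYN, a positive return value is the
	 * dynamically allocated hotplug state.  Stash it so
	 * __do_tdx_cleanup() can remove it later.
	 */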
3386 	tdx_cpuhp_state = r;
3387 
3388 	r = tdx_enable();
3389 	if (r)
3390 		__do_tdx_cleanup();
3391 
3392 	return r;
3393 }
3394 
3395 static int __init __tdx_bringup(void)
3396 {
3397 	const struct tdx_sys_info_td_conf *td_conf;
3398 	int r, i;
3399 
3400 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3401 		/*
3402 		 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3403 		 * before returning to user space.
3404 		 *
3405 		 * this_cpu_ptr(user_return_msrs)->registered isn't checked
3406 		 * because the registration is done at vcpu runtime by
3407 		 * tdx_user_return_msr_update_cache().
3408 		 */
3409 		tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3410 		if (tdx_uret_msrs[i].slot == -1) {
3411 			/* If any MSR isn't supported, it is a KVM bug */
3412 			pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3413 				tdx_uret_msrs[i].msr);
3414 			return -EIO;
3415 		}
3416 	}
3417 
3418 	/*
3419 	 * Enabling TDX requires enabling hardware virtualization first,
3420 	 * as making SEAMCALLs requires the CPU to be in post-VMXON state.
3421 	 */
3422 	r = kvm_enable_virtualization();
3423 	if (r)
3424 		return r;
3425 
3426 	cpus_read_lock();
3427 	r = __do_tdx_bringup();
3428 	cpus_read_unlock();
3429 
3430 	if (r)
3431 		goto tdx_bringup_err;
3432 
3433 	/* Get TDX global information for later use */
3434 	tdx_sysinfo = tdx_get_sysinfo();
3435 	if (WARN_ON_ONCE(!tdx_sysinfo)) {
3436 		r = -EINVAL;
3437 		goto get_sysinfo_err;
3438 	}
3439 
3440 	/* Check TDX module and KVM capabilities */
3441 	if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3442 	    !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3443 		goto get_sysinfo_err;
3444 
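	/*
	 * Require a TDX module that advertises TOPOLOGY_ENUM in
	 * TDX_FEATURES0, i.e. one that supports enumerating CPU topology
	 * to the guest.
	 */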
3445 	if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3446 		goto get_sysinfo_err;
3447 
3448 	/*
3449 	 * TDX has its own limit on the maximum number of vCPUs that it
3450 	 * can support for all TDX guests, in addition to KVM_MAX_VCPUS.
3451 	 * Userspace needs to query a TDX guest's maximum vCPUs by checking
3452 	 * the KVM_CAP_MAX_VCPUS extension on a per-VM basis.
3453 	 *
3454 	 * The TDX module reports this limit via the MAX_VCPU_PER_TD global
3455 	 * metadata.  Different modules may report different values.  Some
3456 	 * older modules may also not support this metadata (in which case
3457 	 * the limit is U16_MAX).
3458 	 *
3459 	 * In practice, the reported value reflects the maximum number of
3460 	 * logical CPUs that ALL the platforms supported by the module can
3461 	 * possibly have.
3462 	 *
3463 	 * Simply forwarding MAX_VCPU_PER_TD to userspace could result in
3464 	 * an unpredictable ABI.  KVM instead always advertises the number
3465 	 * of logical CPUs on the platform as the maximum number of vCPUs
3466 	 * for TDX guests.
3467 	 *
3468 	 * Make sure the MAX_VCPU_PER_TD reported by the TDX module is not
3469 	 * smaller than the number of logical CPUs, otherwise KVM would
3470 	 * report an unsupported value to userspace.
3471 	 *
3472 	 * Note, a platform with TDX enabled in the BIOS cannot support
3473 	 * physical CPU hotplug, and TDX requires that the BIOS has marked
3474 	 * all logical CPUs in the MADT as enabled.  Just use
3475 	 * num_present_cpus() for the number of logical CPUs.
3476 	 */
3477 	td_conf = &tdx_sysinfo->td_conf;
3478 	if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3479 		pr_err("Disabling TDX: MAX_VCPU_PER_TD (%u) is smaller than the number of logical CPUs (%u).\n",
3480 				td_conf->max_vcpus_per_td, num_present_cpus());
3481 		r = -EINVAL;
3482 		goto get_sysinfo_err;
3483 	}
3484 
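	/*
	 * Advertise the number of TDX guest KeyIDs as the capacity of the
	 * TDX misc cgroup resource so that HKID usage can be accounted and
	 * limited per cgroup.
	 */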
3485 	if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
3486 		r = -EINVAL;
3487 		goto get_sysinfo_err;
3488 	}
3489 
3490 	/*
3491 	 * Leave hardware virtualization enabled after TDX is enabled
3492 	 * successfully.  TDX CPU hotplug depends on this.
3493 	 */
3494 	return 0;
3495 
3496 get_sysinfo_err:
3497 	__tdx_cleanup();
3498 tdx_bringup_err:
3499 	kvm_disable_virtualization();
3500 	return r;
3501 }
3502 
3503 void tdx_cleanup(void)
3504 {
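	/*
	 * Undo __tdx_bringup(): drop the misc cgroup capacity, remove the
	 * TDX cpuhp state, and disable hardware virtualization that bringup
	 * intentionally left enabled.
	 */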
3505 	if (enable_tdx) {
3506 		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3507 		__tdx_cleanup();
3508 		kvm_disable_virtualization();
3509 	}
3510 }
3511 
3512 int __init tdx_bringup(void)
3513 {
3514 	int r, i;
3515 
3516 	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3517 	for_each_possible_cpu(i)
3518 		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3519 
3520 	if (!enable_tdx)
3521 		return 0;
3522 
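	/*
	 * The prerequisite checks below disable TDX but still let KVM load
	 * successfully (hence "success_disable_tdx").
	 */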
3523 	if (!enable_ept) {
3524 		pr_err("EPT is required for TDX\n");
3525 		goto success_disable_tdx;
3526 	}
3527 
3528 	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3529 		pr_err("The TDP MMU, MMIO caching and EPT A/D bits are required for TDX\n");
3530 		goto success_disable_tdx;
3531 	}
3532 
3533 	if (!enable_apicv) {
3534 		pr_err("APICv is required for TDX\n");
3535 		goto success_disable_tdx;
3536 	}
3537 
3538 	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3539 		pr_err("tdx: OSXSAVE is required for TDX\n");
3540 		goto success_disable_tdx;
3541 	}
3542 
3543 	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
3544 		pr_err("tdx: MOVDIR64B is required for TDX\n");
3545 		goto success_disable_tdx;
3546 	}
3547 
3548 	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
3549 		pr_err("Self-snoop is required for TDX\n");
3550 		goto success_disable_tdx;
3551 	}
3552 
3553 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3554 		pr_err("tdx: no TDX private KeyIDs available\n");
3555 		goto success_disable_tdx;
3556 	}
3557 
3558 	if (!enable_virt_at_load) {
3559 		pr_err("tdx: TDX requires kvm.enable_virt_at_load=1\n");
3560 		goto success_disable_tdx;
3561 	}
3562 
3563 	/*
3564 	 * Ideally KVM should probe whether the TDX module has been loaded
3565 	 * first and then try to bring it up.  But TDX needs to use SEAMCALL
3566 	 * to probe whether the module is loaded (there is no CPUID or MSR
3567 	 * for that), and making a SEAMCALL requires enabling virtualization
3568 	 * first, just like the remaining steps of bringing up the TDX module.
3569 	 *
3570 	 * So, for simplicity do everything in __tdx_bringup(); the first
3571 	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
3572 	 * only complication is having to make sure that initialization
3573 	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3574 	 * cases.
3575 	 */
3576 	r = __tdx_bringup();
3577 	if (r) {
3578 		/*
3579 		 * If the TDX module could not be loaded, only disable TDX;
3580 		 * don't fail loading KVM.  No need to print a message saying
3581 		 * "module is not loaded" because one was already printed when
3582 		 * the first SEAMCALL failed.
3583 		 */
3584 		if (r == -ENODEV)
3585 			goto success_disable_tdx;
3586 
3587 		enable_tdx = 0;
3588 	}
3589 
3590 	return r;
3591 
3592 success_disable_tdx:
3593 	enable_tdx = 0;
3594 	return 0;
3595 }
3596