xref: /linux/arch/x86/virt/vmx/tdx/tdx.c (revision 8a7c601e14576a22c2bbf7f67455ccf3f3d2737f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright(c) 2023 Intel Corporation.
4  *
5  * Intel Trusted Domain Extensions (TDX) support
6  */
7 
8 #include "asm/page_types.h"
9 #define pr_fmt(fmt)	"virt/tdx: " fmt
10 
11 #include <linux/types.h>
12 #include <linux/cache.h>
13 #include <linux/init.h>
14 #include <linux/errno.h>
15 #include <linux/printk.h>
16 #include <linux/cpu.h>
17 #include <linux/spinlock.h>
18 #include <linux/percpu-defs.h>
19 #include <linux/mutex.h>
20 #include <linux/list.h>
21 #include <linux/memblock.h>
22 #include <linux/memory.h>
23 #include <linux/minmax.h>
24 #include <linux/sizes.h>
25 #include <linux/pfn.h>
26 #include <linux/align.h>
27 #include <linux/sort.h>
28 #include <linux/log2.h>
29 #include <linux/acpi.h>
30 #include <linux/suspend.h>
31 #include <linux/idr.h>
32 #include <asm/page.h>
33 #include <asm/special_insns.h>
34 #include <asm/msr-index.h>
35 #include <asm/msr.h>
36 #include <asm/cpufeature.h>
37 #include <asm/tdx.h>
38 #include <asm/cpu_device_id.h>
39 #include <asm/processor.h>
40 #include <asm/mce.h>
41 #include "tdx.h"
42 
43 static u32 tdx_global_keyid __ro_after_init;
44 static u32 tdx_guest_keyid_start __ro_after_init;
45 static u32 tdx_nr_guest_keyids __ro_after_init;
46 
47 static DEFINE_IDA(tdx_guest_keyid_pool);
48 
49 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
50 
51 static struct tdmr_info_list tdx_tdmr_list;
52 
53 static enum tdx_module_status_t tdx_module_status;
54 static DEFINE_MUTEX(tdx_module_lock);
55 
56 /* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
57 static LIST_HEAD(tdx_memlist);
58 
59 static struct tdx_sys_info tdx_sysinfo;
60 
61 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
62 
63 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
64 {
65 	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
66 }
67 
68 static inline void seamcall_err_ret(u64 fn, u64 err,
69 				    struct tdx_module_args *args)
70 {
71 	seamcall_err(fn, err, args);
72 	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
73 			args->rcx, args->rdx, args->r8);
74 	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
75 			args->r9, args->r10, args->r11);
76 }
77 
78 static __always_inline int sc_retry_prerr(sc_func_t func,
79 					  sc_err_func_t err_func,
80 					  u64 fn, struct tdx_module_args *args)
81 {
82 	u64 sret = sc_retry(func, fn, args);
83 
84 	if (sret == TDX_SUCCESS)
85 		return 0;
86 
87 	if (sret == TDX_SEAMCALL_VMFAILINVALID)
88 		return -ENODEV;
89 
90 	if (sret == TDX_SEAMCALL_GP)
91 		return -EOPNOTSUPP;
92 
93 	if (sret == TDX_SEAMCALL_UD)
94 		return -EACCES;
95 
96 	err_func(fn, sret, args);
97 	return -EIO;
98 }
99 
100 #define seamcall_prerr(__fn, __args)						\
101 	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
102 
103 #define seamcall_prerr_ret(__fn, __args)					\
104 	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
105 
106 /*
107  * Do the module global initialization once and return its result.
108  * It can be done on any cpu.  It's always called with interrupts
109  * disabled.
110  */
111 static int try_init_module_global(void)
112 {
113 	struct tdx_module_args args = {};
114 	static DEFINE_RAW_SPINLOCK(sysinit_lock);
115 	static bool sysinit_done;
116 	static int sysinit_ret;
117 
118 	lockdep_assert_irqs_disabled();
119 
120 	raw_spin_lock(&sysinit_lock);
121 
122 	if (sysinit_done)
123 		goto out;
124 
125 	/* RCX is module attributes and all bits are reserved */
126 	args.rcx = 0;
127 	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
128 
129 	/*
130 	 * The first SEAMCALL also detects the TDX module, thus
131 	 * it can fail due to the TDX module is not loaded.
132 	 * Dump message to let the user know.
133 	 */
134 	if (sysinit_ret == -ENODEV)
135 		pr_err("module not loaded\n");
136 
137 	sysinit_done = true;
138 out:
139 	raw_spin_unlock(&sysinit_lock);
140 	return sysinit_ret;
141 }
142 
143 /**
144  * tdx_cpu_enable - Enable TDX on local cpu
145  *
146  * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
147  * global initialization SEAMCALL if not done) on local cpu to make this
148  * cpu be ready to run any other SEAMCALLs.
149  *
150  * Always call this function via IPI function calls.
151  *
152  * Return 0 on success, otherwise errors.
153  */
154 int tdx_cpu_enable(void)
155 {
156 	struct tdx_module_args args = {};
157 	int ret;
158 
159 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
160 		return -ENODEV;
161 
162 	lockdep_assert_irqs_disabled();
163 
164 	if (__this_cpu_read(tdx_lp_initialized))
165 		return 0;
166 
167 	/*
168 	 * The TDX module global initialization is the very first step
169 	 * to enable TDX.  Need to do it first (if hasn't been done)
170 	 * before the per-cpu initialization.
171 	 */
172 	ret = try_init_module_global();
173 	if (ret)
174 		return ret;
175 
176 	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
177 	if (ret)
178 		return ret;
179 
180 	__this_cpu_write(tdx_lp_initialized, true);
181 
182 	return 0;
183 }
184 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
185 
186 /*
187  * Add a memory region as a TDX memory block.  The caller must make sure
188  * all memory regions are added in address ascending order and don't
189  * overlap.
190  */
191 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
192 			    unsigned long end_pfn, int nid)
193 {
194 	struct tdx_memblock *tmb;
195 
196 	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
197 	if (!tmb)
198 		return -ENOMEM;
199 
200 	INIT_LIST_HEAD(&tmb->list);
201 	tmb->start_pfn = start_pfn;
202 	tmb->end_pfn = end_pfn;
203 	tmb->nid = nid;
204 
205 	/* @tmb_list is protected by mem_hotplug_lock */
206 	list_add_tail(&tmb->list, tmb_list);
207 	return 0;
208 }
209 
210 static void free_tdx_memlist(struct list_head *tmb_list)
211 {
212 	/* @tmb_list is protected by mem_hotplug_lock */
213 	while (!list_empty(tmb_list)) {
214 		struct tdx_memblock *tmb = list_first_entry(tmb_list,
215 				struct tdx_memblock, list);
216 
217 		list_del(&tmb->list);
218 		kfree(tmb);
219 	}
220 }
221 
222 /*
223  * Ensure that all memblock memory regions are convertible to TDX
224  * memory.  Once this has been established, stash the memblock
225  * ranges off in a secondary structure because memblock is modified
226  * in memory hotplug while TDX memory regions are fixed.
227  */
228 static int build_tdx_memlist(struct list_head *tmb_list)
229 {
230 	unsigned long start_pfn, end_pfn;
231 	int i, nid, ret;
232 
233 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
234 		/*
235 		 * The first 1MB is not reported as TDX convertible memory.
236 		 * Although the first 1MB is always reserved and won't end up
237 		 * to the page allocator, it is still in memblock's memory
238 		 * regions.  Skip them manually to exclude them as TDX memory.
239 		 */
240 		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
241 		if (start_pfn >= end_pfn)
242 			continue;
243 
244 		/*
245 		 * Add the memory regions as TDX memory.  The regions in
246 		 * memblock has already guaranteed they are in address
247 		 * ascending order and don't overlap.
248 		 */
249 		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
250 		if (ret)
251 			goto err;
252 	}
253 
254 	return 0;
255 err:
256 	free_tdx_memlist(tmb_list);
257 	return ret;
258 }
259 
260 static int read_sys_metadata_field(u64 field_id, u64 *data)
261 {
262 	struct tdx_module_args args = {};
263 	int ret;
264 
265 	/*
266 	 * TDH.SYS.RD -- reads one global metadata field
267 	 *  - RDX (in): the field to read
268 	 *  - R8 (out): the field data
269 	 */
270 	args.rdx = field_id;
271 	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
272 	if (ret)
273 		return ret;
274 
275 	*data = args.r8;
276 
277 	return 0;
278 }
279 
280 #include "tdx_global_metadata.c"
281 
282 static int check_features(struct tdx_sys_info *sysinfo)
283 {
284 	u64 tdx_features0 = sysinfo->features.tdx_features0;
285 
286 	if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
287 		pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
288 		return -EINVAL;
289 	}
290 
291 	return 0;
292 }
293 
294 /* Calculate the actual TDMR size */
295 static int tdmr_size_single(u16 max_reserved_per_tdmr)
296 {
297 	int tdmr_sz;
298 
299 	/*
300 	 * The actual size of TDMR depends on the maximum
301 	 * number of reserved areas.
302 	 */
303 	tdmr_sz = sizeof(struct tdmr_info);
304 	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
305 
306 	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
307 }
308 
309 static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
310 			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
311 {
312 	size_t tdmr_sz, tdmr_array_sz;
313 	void *tdmr_array;
314 
315 	tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
316 	tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
317 
318 	/*
319 	 * To keep things simple, allocate all TDMRs together.
320 	 * The buffer needs to be physically contiguous to make
321 	 * sure each TDMR is physically contiguous.
322 	 */
323 	tdmr_array = alloc_pages_exact(tdmr_array_sz,
324 			GFP_KERNEL | __GFP_ZERO);
325 	if (!tdmr_array)
326 		return -ENOMEM;
327 
328 	tdmr_list->tdmrs = tdmr_array;
329 
330 	/*
331 	 * Keep the size of TDMR to find the target TDMR
332 	 * at a given index in the TDMR list.
333 	 */
334 	tdmr_list->tdmr_sz = tdmr_sz;
335 	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
336 	tdmr_list->nr_consumed_tdmrs = 0;
337 
338 	return 0;
339 }
340 
341 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
342 {
343 	free_pages_exact(tdmr_list->tdmrs,
344 			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
345 }
346 
347 /* Get the TDMR from the list at the given index. */
348 static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
349 				    int idx)
350 {
351 	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
352 
353 	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
354 }
355 
356 #define TDMR_ALIGNMENT		SZ_1G
357 #define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
358 #define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)
359 
360 static inline u64 tdmr_end(struct tdmr_info *tdmr)
361 {
362 	return tdmr->base + tdmr->size;
363 }
364 
365 /*
366  * Take the memory referenced in @tmb_list and populate the
367  * preallocated @tdmr_list, following all the special alignment
368  * and size rules for TDMR.
369  */
370 static int fill_out_tdmrs(struct list_head *tmb_list,
371 			  struct tdmr_info_list *tdmr_list)
372 {
373 	struct tdx_memblock *tmb;
374 	int tdmr_idx = 0;
375 
376 	/*
377 	 * Loop over TDX memory regions and fill out TDMRs to cover them.
378 	 * To keep it simple, always try to use one TDMR to cover one
379 	 * memory region.
380 	 *
381 	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
382 	 * typically only consumes less than 10 of those.  This code is
383 	 * dumb and simple and may use more TMDRs than is strictly
384 	 * required.
385 	 */
386 	list_for_each_entry(tmb, tmb_list, list) {
387 		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
388 		u64 start, end;
389 
390 		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
391 		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
392 
393 		/*
394 		 * A valid size indicates the current TDMR has already
395 		 * been filled out to cover the previous memory region(s).
396 		 */
397 		if (tdmr->size) {
398 			/*
399 			 * Loop to the next if the current memory region
400 			 * has already been fully covered.
401 			 */
402 			if (end <= tdmr_end(tdmr))
403 				continue;
404 
405 			/* Otherwise, skip the already covered part. */
406 			if (start < tdmr_end(tdmr))
407 				start = tdmr_end(tdmr);
408 
409 			/*
410 			 * Create a new TDMR to cover the current memory
411 			 * region, or the remaining part of it.
412 			 */
413 			tdmr_idx++;
414 			if (tdmr_idx >= tdmr_list->max_tdmrs) {
415 				pr_warn("initialization failed: TDMRs exhausted.\n");
416 				return -ENOSPC;
417 			}
418 
419 			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
420 		}
421 
422 		tdmr->base = start;
423 		tdmr->size = end - start;
424 	}
425 
426 	/* @tdmr_idx is always the index of the last valid TDMR. */
427 	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
428 
429 	/*
430 	 * Warn early that kernel is about to run out of TDMRs.
431 	 *
432 	 * This is an indication that TDMR allocation has to be
433 	 * reworked to be smarter to not run into an issue.
434 	 */
435 	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
436 		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
437 				tdmr_list->nr_consumed_tdmrs,
438 				tdmr_list->max_tdmrs);
439 
440 	return 0;
441 }
442 
443 /*
444  * Calculate PAMT size given a TDMR and a page size.  The returned
445  * PAMT size is always aligned up to 4K page boundary.
446  */
447 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
448 				      u16 pamt_entry_size)
449 {
450 	unsigned long pamt_sz, nr_pamt_entries;
451 
452 	switch (pgsz) {
453 	case TDX_PS_4K:
454 		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
455 		break;
456 	case TDX_PS_2M:
457 		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
458 		break;
459 	case TDX_PS_1G:
460 		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
461 		break;
462 	default:
463 		WARN_ON_ONCE(1);
464 		return 0;
465 	}
466 
467 	pamt_sz = nr_pamt_entries * pamt_entry_size;
468 	/* TDX requires PAMT size must be 4K aligned */
469 	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
470 
471 	return pamt_sz;
472 }
473 
474 /*
475  * Locate a NUMA node which should hold the allocation of the @tdmr
476  * PAMT.  This node will have some memory covered by the TDMR.  The
477  * relative amount of memory covered is not considered.
478  */
479 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
480 {
481 	struct tdx_memblock *tmb;
482 
483 	/*
484 	 * A TDMR must cover at least part of one TMB.  That TMB will end
485 	 * after the TDMR begins.  But, that TMB may have started before
486 	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
487 	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
488 	 */
489 	list_for_each_entry(tmb, tmb_list, list) {
490 		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
491 			return tmb->nid;
492 	}
493 
494 	/*
495 	 * Fall back to allocating the TDMR's metadata from node 0 when
496 	 * no TDX memory block can be found.  This should never happen
497 	 * since TDMRs originate from TDX memory blocks.
498 	 */
499 	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
500 			tdmr->base, tdmr_end(tdmr));
501 	return 0;
502 }
503 
504 /*
505  * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
506  * within @tdmr, and set up PAMTs for @tdmr.
507  */
508 static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
509 			    struct list_head *tmb_list,
510 			    u16 pamt_entry_size[])
511 {
512 	unsigned long pamt_base[TDX_PS_NR];
513 	unsigned long pamt_size[TDX_PS_NR];
514 	unsigned long tdmr_pamt_base;
515 	unsigned long tdmr_pamt_size;
516 	struct page *pamt;
517 	int pgsz, nid;
518 
519 	nid = tdmr_get_nid(tdmr, tmb_list);
520 
521 	/*
522 	 * Calculate the PAMT size for each TDX supported page size
523 	 * and the total PAMT size.
524 	 */
525 	tdmr_pamt_size = 0;
526 	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
527 		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
528 					pamt_entry_size[pgsz]);
529 		tdmr_pamt_size += pamt_size[pgsz];
530 	}
531 
532 	/*
533 	 * Allocate one chunk of physically contiguous memory for all
534 	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
535 	 * in overlapped TDMRs.
536 	 */
537 	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
538 			nid, &node_online_map);
539 	if (!pamt)
540 		return -ENOMEM;
541 
542 	/*
543 	 * Break the contiguous allocation back up into the
544 	 * individual PAMTs for each page size.
545 	 */
546 	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
547 	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
548 		pamt_base[pgsz] = tdmr_pamt_base;
549 		tdmr_pamt_base += pamt_size[pgsz];
550 	}
551 
552 	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
553 	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
554 	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
555 	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
556 	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
557 	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
558 
559 	return 0;
560 }
561 
562 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
563 			  unsigned long *pamt_size)
564 {
565 	unsigned long pamt_bs, pamt_sz;
566 
567 	/*
568 	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
569 	 * should always point to the beginning of that allocation.
570 	 */
571 	pamt_bs = tdmr->pamt_4k_base;
572 	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
573 
574 	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
575 
576 	*pamt_base = pamt_bs;
577 	*pamt_size = pamt_sz;
578 }
579 
580 static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
581 		void (*pamt_func)(unsigned long base, unsigned long size))
582 {
583 	unsigned long pamt_base, pamt_size;
584 
585 	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
586 
587 	/* Do nothing if PAMT hasn't been allocated for this TDMR */
588 	if (!pamt_size)
589 		return;
590 
591 	if (WARN_ON_ONCE(!pamt_base))
592 		return;
593 
594 	pamt_func(pamt_base, pamt_size);
595 }
596 
597 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
598 {
599 	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
600 }
601 
602 static void tdmr_free_pamt(struct tdmr_info *tdmr)
603 {
604 	tdmr_do_pamt_func(tdmr, free_pamt);
605 }
606 
607 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
608 {
609 	int i;
610 
611 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
612 		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
613 }
614 
615 /* Allocate and set up PAMTs for all TDMRs */
616 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
617 				 struct list_head *tmb_list,
618 				 u16 pamt_entry_size[])
619 {
620 	int i, ret = 0;
621 
622 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
623 		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
624 				pamt_entry_size);
625 		if (ret)
626 			goto err;
627 	}
628 
629 	return 0;
630 err:
631 	tdmrs_free_pamt_all(tdmr_list);
632 	return ret;
633 }
634 
635 /*
636  * Convert TDX private pages back to normal by using MOVDIR64B to clear these
637  * pages. Typically, any write to the page will convert it from TDX private back
638  * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
639  * do the conversion explicitly via MOVDIR64B.
640  */
641 static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
642 {
643 	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
644 	unsigned long phys, end;
645 
646 	if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
647 		return;
648 
649 	end = base + size;
650 	for (phys = base; phys < end; phys += 64)
651 		movdir64b(__va(phys), zero_page);
652 
653 	/*
654 	 * MOVDIR64B uses WC protocol.  Use memory barrier to
655 	 * make sure any later user of these pages sees the
656 	 * updated data.
657 	 */
658 	mb();
659 }
660 
661 void tdx_quirk_reset_page(struct page *page)
662 {
663 	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
664 }
665 EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
666 
667 static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
668 {
669 	tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
670 }
671 
672 static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
673 {
674 	int i;
675 
676 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
677 		tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
678 }
679 
680 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
681 {
682 	unsigned long pamt_size = 0;
683 	int i;
684 
685 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
686 		unsigned long base, size;
687 
688 		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
689 		pamt_size += size;
690 	}
691 
692 	return pamt_size / 1024;
693 }
694 
695 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
696 			      u64 size, u16 max_reserved_per_tdmr)
697 {
698 	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
699 	int idx = *p_idx;
700 
701 	/* Reserved area must be 4K aligned in offset and size */
702 	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
703 		return -EINVAL;
704 
705 	if (idx >= max_reserved_per_tdmr) {
706 		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
707 				tdmr->base, tdmr_end(tdmr));
708 		return -ENOSPC;
709 	}
710 
711 	/*
712 	 * Consume one reserved area per call.  Make no effort to
713 	 * optimize or reduce the number of reserved areas which are
714 	 * consumed by contiguous reserved areas, for instance.
715 	 */
716 	rsvd_areas[idx].offset = addr - tdmr->base;
717 	rsvd_areas[idx].size = size;
718 
719 	*p_idx = idx + 1;
720 
721 	return 0;
722 }
723 
724 /*
725  * Go through @tmb_list to find holes between memory areas.  If any of
726  * those holes fall within @tdmr, set up a TDMR reserved area to cover
727  * the hole.
728  */
729 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
730 				    struct tdmr_info *tdmr,
731 				    int *rsvd_idx,
732 				    u16 max_reserved_per_tdmr)
733 {
734 	struct tdx_memblock *tmb;
735 	u64 prev_end;
736 	int ret;
737 
738 	/*
739 	 * Start looking for reserved blocks at the
740 	 * beginning of the TDMR.
741 	 */
742 	prev_end = tdmr->base;
743 	list_for_each_entry(tmb, tmb_list, list) {
744 		u64 start, end;
745 
746 		start = PFN_PHYS(tmb->start_pfn);
747 		end   = PFN_PHYS(tmb->end_pfn);
748 
749 		/* Break if this region is after the TDMR */
750 		if (start >= tdmr_end(tdmr))
751 			break;
752 
753 		/* Exclude regions before this TDMR */
754 		if (end < tdmr->base)
755 			continue;
756 
757 		/*
758 		 * Skip over memory areas that
759 		 * have already been dealt with.
760 		 */
761 		if (start <= prev_end) {
762 			prev_end = end;
763 			continue;
764 		}
765 
766 		/* Add the hole before this region */
767 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
768 				start - prev_end,
769 				max_reserved_per_tdmr);
770 		if (ret)
771 			return ret;
772 
773 		prev_end = end;
774 	}
775 
776 	/* Add the hole after the last region if it exists. */
777 	if (prev_end < tdmr_end(tdmr)) {
778 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
779 				tdmr_end(tdmr) - prev_end,
780 				max_reserved_per_tdmr);
781 		if (ret)
782 			return ret;
783 	}
784 
785 	return 0;
786 }
787 
788 /*
789  * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
790  * overlaps with @tdmr, set up a TDMR reserved area to cover the
791  * overlapping part.
792  */
793 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
794 				    struct tdmr_info *tdmr,
795 				    int *rsvd_idx,
796 				    u16 max_reserved_per_tdmr)
797 {
798 	int i, ret;
799 
800 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
801 		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
802 		unsigned long pamt_base, pamt_size, pamt_end;
803 
804 		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
805 		/* Each TDMR must already have PAMT allocated */
806 		WARN_ON_ONCE(!pamt_size || !pamt_base);
807 
808 		pamt_end = pamt_base + pamt_size;
809 		/* Skip PAMTs outside of the given TDMR */
810 		if ((pamt_end <= tdmr->base) ||
811 				(pamt_base >= tdmr_end(tdmr)))
812 			continue;
813 
814 		/* Only mark the part within the TDMR as reserved */
815 		if (pamt_base < tdmr->base)
816 			pamt_base = tdmr->base;
817 		if (pamt_end > tdmr_end(tdmr))
818 			pamt_end = tdmr_end(tdmr);
819 
820 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
821 				pamt_end - pamt_base,
822 				max_reserved_per_tdmr);
823 		if (ret)
824 			return ret;
825 	}
826 
827 	return 0;
828 }
829 
830 /* Compare function called by sort() for TDMR reserved areas */
831 static int rsvd_area_cmp_func(const void *a, const void *b)
832 {
833 	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
834 	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
835 
836 	if (r1->offset + r1->size <= r2->offset)
837 		return -1;
838 	if (r1->offset >= r2->offset + r2->size)
839 		return 1;
840 
841 	/* Reserved areas cannot overlap.  The caller must guarantee. */
842 	WARN_ON_ONCE(1);
843 	return -1;
844 }
845 
846 /*
847  * Populate reserved areas for the given @tdmr, including memory holes
848  * (via @tmb_list) and PAMTs (via @tdmr_list).
849  */
850 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
851 				    struct list_head *tmb_list,
852 				    struct tdmr_info_list *tdmr_list,
853 				    u16 max_reserved_per_tdmr)
854 {
855 	int ret, rsvd_idx = 0;
856 
857 	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
858 			max_reserved_per_tdmr);
859 	if (ret)
860 		return ret;
861 
862 	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
863 			max_reserved_per_tdmr);
864 	if (ret)
865 		return ret;
866 
867 	/* TDX requires reserved areas listed in address ascending order */
868 	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
869 			rsvd_area_cmp_func, NULL);
870 
871 	return 0;
872 }
873 
874 /*
875  * Populate reserved areas for all TDMRs in @tdmr_list, including memory
876  * holes (via @tmb_list) and PAMTs.
877  */
878 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
879 					 struct list_head *tmb_list,
880 					 u16 max_reserved_per_tdmr)
881 {
882 	int i;
883 
884 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
885 		int ret;
886 
887 		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
888 				tmb_list, tdmr_list, max_reserved_per_tdmr);
889 		if (ret)
890 			return ret;
891 	}
892 
893 	return 0;
894 }
895 
896 /*
897  * Construct a list of TDMRs on the preallocated space in @tdmr_list
898  * to cover all TDX memory regions in @tmb_list based on the TDX module
899  * TDMR global information in @sysinfo_tdmr.
900  */
901 static int construct_tdmrs(struct list_head *tmb_list,
902 			   struct tdmr_info_list *tdmr_list,
903 			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
904 {
905 	u16 pamt_entry_size[TDX_PS_NR] = {
906 		sysinfo_tdmr->pamt_4k_entry_size,
907 		sysinfo_tdmr->pamt_2m_entry_size,
908 		sysinfo_tdmr->pamt_1g_entry_size,
909 	};
910 	int ret;
911 
912 	ret = fill_out_tdmrs(tmb_list, tdmr_list);
913 	if (ret)
914 		return ret;
915 
916 	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
917 	if (ret)
918 		return ret;
919 
920 	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
921 			sysinfo_tdmr->max_reserved_per_tdmr);
922 	if (ret)
923 		tdmrs_free_pamt_all(tdmr_list);
924 
925 	/*
926 	 * The tdmr_info_list is read-only from here on out.
927 	 * Ensure that these writes are seen by other CPUs.
928 	 * Pairs with a smp_rmb() in is_pamt_page().
929 	 */
930 	smp_wmb();
931 
932 	return ret;
933 }
934 
935 static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
936 {
937 	struct tdx_module_args args = {};
938 	u64 *tdmr_pa_array;
939 	size_t array_sz;
940 	int i, ret;
941 
942 	/*
943 	 * TDMRs are passed to the TDX module via an array of physical
944 	 * addresses of each TDMR.  The array itself also has certain
945 	 * alignment requirement.
946 	 */
947 	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
948 	array_sz = roundup_pow_of_two(array_sz);
949 	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
950 		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
951 
952 	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
953 	if (!tdmr_pa_array)
954 		return -ENOMEM;
955 
956 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
957 		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
958 
959 	args.rcx = __pa(tdmr_pa_array);
960 	args.rdx = tdmr_list->nr_consumed_tdmrs;
961 	args.r8 = global_keyid;
962 	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
963 
964 	/* Free the array as it is not required anymore. */
965 	kfree(tdmr_pa_array);
966 
967 	return ret;
968 }
969 
970 static int do_global_key_config(void *unused)
971 {
972 	struct tdx_module_args args = {};
973 
974 	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
975 }
976 
977 /*
978  * Attempt to configure the global KeyID on all physical packages.
979  *
980  * This requires running code on at least one CPU in each package.
981  * TDMR initialization) will fail will fail if any package in the
982  * system has no online CPUs.
983  *
984  * This code takes no affirmative steps to online CPUs.  Callers (aka.
985  * KVM) can ensure success by ensuring sufficient CPUs are online and
986  * can run SEAMCALLs.
987  */
988 static int config_global_keyid(void)
989 {
990 	cpumask_var_t packages;
991 	int cpu, ret = -EINVAL;
992 
993 	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
994 		return -ENOMEM;
995 
996 	/*
997 	 * Hardware doesn't guarantee cache coherency across different
998 	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
999 	 * (associated with KeyID 0) before the TDX module can use the
1000 	 * global KeyID to access the PAMT.  Given PAMTs are potentially
1001 	 * large (~1/256th of system RAM), just use WBINVD.
1002 	 */
1003 	wbinvd_on_all_cpus();
1004 
1005 	for_each_online_cpu(cpu) {
1006 		/*
1007 		 * The key configuration only needs to be done once per
1008 		 * package and will return an error if configured more
1009 		 * than once.  Avoid doing it multiple times per package.
1010 		 */
1011 		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1012 					packages))
1013 			continue;
1014 
1015 		/*
1016 		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
1017 		 * different cpus.  Do it one by one.
1018 		 */
1019 		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1020 		if (ret)
1021 			break;
1022 	}
1023 
1024 	free_cpumask_var(packages);
1025 	return ret;
1026 }
1027 
1028 static int init_tdmr(struct tdmr_info *tdmr)
1029 {
1030 	u64 next;
1031 
1032 	/*
1033 	 * Initializing a TDMR can be time consuming.  To avoid long
1034 	 * SEAMCALLs, the TDX module may only initialize a part of the
1035 	 * TDMR in each call.
1036 	 */
1037 	do {
1038 		struct tdx_module_args args = {
1039 			.rcx = tdmr->base,
1040 		};
1041 		int ret;
1042 
1043 		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1044 		if (ret)
1045 			return ret;
1046 		/*
1047 		 * RDX contains 'next-to-initialize' address if
1048 		 * TDH.SYS.TDMR.INIT did not fully complete and
1049 		 * should be retried.
1050 		 */
1051 		next = args.rdx;
1052 		cond_resched();
1053 		/* Keep making SEAMCALLs until the TDMR is done */
1054 	} while (next < tdmr->base + tdmr->size);
1055 
1056 	return 0;
1057 }
1058 
1059 static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1060 {
1061 	int i;
1062 
1063 	/*
1064 	 * This operation is costly.  It can be parallelized,
1065 	 * but keep it simple for now.
1066 	 */
1067 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1068 		int ret;
1069 
1070 		ret = init_tdmr(tdmr_entry(tdmr_list, i));
1071 		if (ret)
1072 			return ret;
1073 	}
1074 
1075 	return 0;
1076 }
1077 
1078 static int init_tdx_module(void)
1079 {
1080 	int ret;
1081 
1082 	ret = get_tdx_sys_info(&tdx_sysinfo);
1083 	if (ret)
1084 		return ret;
1085 
1086 	/* Check whether the kernel can support this module */
1087 	ret = check_features(&tdx_sysinfo);
1088 	if (ret)
1089 		return ret;
1090 
1091 	/*
1092 	 * To keep things simple, assume that all TDX-protected memory
1093 	 * will come from the page allocator.  Make sure all pages in the
1094 	 * page allocator are TDX-usable memory.
1095 	 *
1096 	 * Build the list of "TDX-usable" memory regions which cover all
1097 	 * pages in the page allocator to guarantee that.  Do it while
1098 	 * holding mem_hotplug_lock read-lock as the memory hotplug code
1099 	 * path reads the @tdx_memlist to reject any new memory.
1100 	 */
1101 	get_online_mems();
1102 
1103 	ret = build_tdx_memlist(&tdx_memlist);
1104 	if (ret)
1105 		goto out_put_tdxmem;
1106 
1107 	/* Allocate enough space for constructing TDMRs */
1108 	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
1109 	if (ret)
1110 		goto err_free_tdxmem;
1111 
1112 	/* Cover all TDX-usable memory regions in TDMRs */
1113 	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
1114 	if (ret)
1115 		goto err_free_tdmrs;
1116 
1117 	/* Pass the TDMRs and the global KeyID to the TDX module */
1118 	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1119 	if (ret)
1120 		goto err_free_pamts;
1121 
1122 	/* Config the key of global KeyID on all packages */
1123 	ret = config_global_keyid();
1124 	if (ret)
1125 		goto err_reset_pamts;
1126 
1127 	/* Initialize TDMRs to complete the TDX module initialization */
1128 	ret = init_tdmrs(&tdx_tdmr_list);
1129 	if (ret)
1130 		goto err_reset_pamts;
1131 
1132 	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1133 
1134 out_put_tdxmem:
1135 	/*
1136 	 * @tdx_memlist is written here and read at memory hotplug time.
1137 	 * Lock out memory hotplug code while building it.
1138 	 */
1139 	put_online_mems();
1140 	return ret;
1141 
1142 err_reset_pamts:
1143 	/*
1144 	 * Part of PAMTs may already have been initialized by the
1145 	 * TDX module.  Flush cache before returning PAMTs back
1146 	 * to the kernel.
1147 	 */
1148 	wbinvd_on_all_cpus();
1149 	tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
1150 err_free_pamts:
1151 	tdmrs_free_pamt_all(&tdx_tdmr_list);
1152 err_free_tdmrs:
1153 	free_tdmr_list(&tdx_tdmr_list);
1154 err_free_tdxmem:
1155 	free_tdx_memlist(&tdx_memlist);
1156 	goto out_put_tdxmem;
1157 }
1158 
1159 static int __tdx_enable(void)
1160 {
1161 	int ret;
1162 
1163 	ret = init_tdx_module();
1164 	if (ret) {
1165 		pr_err("module initialization failed (%d)\n", ret);
1166 		tdx_module_status = TDX_MODULE_ERROR;
1167 		return ret;
1168 	}
1169 
1170 	pr_info("module initialized\n");
1171 	tdx_module_status = TDX_MODULE_INITIALIZED;
1172 
1173 	return 0;
1174 }
1175 
1176 /**
1177  * tdx_enable - Enable TDX module to make it ready to run TDX guests
1178  *
1179  * This function assumes the caller has: 1) held read lock of CPU hotplug
1180  * lock to prevent any new cpu from becoming online; 2) done both VMXON
1181  * and tdx_cpu_enable() on all online cpus.
1182  *
1183  * This function requires there's at least one online cpu for each CPU
1184  * package to succeed.
1185  *
1186  * This function can be called in parallel by multiple callers.
1187  *
1188  * Return 0 if TDX is enabled successfully, otherwise error.
1189  */
1190 int tdx_enable(void)
1191 {
1192 	int ret;
1193 
1194 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1195 		return -ENODEV;
1196 
1197 	lockdep_assert_cpus_held();
1198 
1199 	mutex_lock(&tdx_module_lock);
1200 
1201 	switch (tdx_module_status) {
1202 	case TDX_MODULE_UNINITIALIZED:
1203 		ret = __tdx_enable();
1204 		break;
1205 	case TDX_MODULE_INITIALIZED:
1206 		/* Already initialized, great, tell the caller. */
1207 		ret = 0;
1208 		break;
1209 	default:
1210 		/* Failed to initialize in the previous attempts */
1211 		ret = -EINVAL;
1212 		break;
1213 	}
1214 
1215 	mutex_unlock(&tdx_module_lock);
1216 
1217 	return ret;
1218 }
1219 EXPORT_SYMBOL_GPL(tdx_enable);
1220 
1221 static bool is_pamt_page(unsigned long phys)
1222 {
1223 	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1224 	int i;
1225 
1226 	/* Ensure that all remote 'tdmr_list' writes are visible: */
1227 	smp_rmb();
1228 
1229 	/*
1230 	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
1231 	 * is initialized.  The 'tdmr_list' was initialized long ago
1232 	 * and is now read-only.
1233 	 */
1234 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1235 		unsigned long base, size;
1236 
1237 		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1238 
1239 		if (phys >= base && phys < (base + size))
1240 			return true;
1241 	}
1242 
1243 	return false;
1244 }
1245 
1246 /*
1247  * Return whether the memory page at the given physical address is TDX
1248  * private memory or not.
1249  *
1250  * This can be imprecise for two known reasons:
1251  * 1. PAMTs are private memory and exist before the TDX module is
1252  *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
1253  *    short window that occurs once per boot.
1254  * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1255  *    page.  However, the page can still cause #MC until it has been
1256  *    fully converted to shared using 64-byte writes like MOVDIR64B.
1257  *    Buggy hosts might still leave #MC-causing memory in place which
1258  *    this function can not detect.
1259  */
1260 static bool paddr_is_tdx_private(unsigned long phys)
1261 {
1262 	struct tdx_module_args args = {
1263 		.rcx = phys & PAGE_MASK,
1264 	};
1265 	u64 sret;
1266 
1267 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1268 		return false;
1269 
1270 	/* Get page type from the TDX module */
1271 	sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);
1272 
1273 	/*
1274 	 * The SEAMCALL will not return success unless there is a
1275 	 * working, "ready" TDX module.  Assume an absence of TDX
1276 	 * private pages until SEAMCALL is working.
1277 	 */
1278 	if (sret)
1279 		return false;
1280 
1281 	/*
1282 	 * SEAMCALL was successful -- read page type (via RCX):
1283 	 *
1284 	 *  - PT_NDA:	Page is not used by the TDX module
1285 	 *  - PT_RSVD:	Reserved for Non-TDX use
1286 	 *  - Others:	Page is used by the TDX module
1287 	 *
1288 	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
1289 	 * private memory.
1290 	 */
1291 	switch (args.rcx) {
1292 	case PT_NDA:
1293 		return false;
1294 	case PT_RSVD:
1295 		return is_pamt_page(phys);
1296 	default:
1297 		return true;
1298 	}
1299 }
1300 
1301 /*
1302  * Some TDX-capable CPUs have an erratum.  A write to TDX private
1303  * memory poisons that memory, and a subsequent read of that memory
1304  * triggers #MC.
1305  *
1306  * Help distinguish erratum-triggered #MCs from a normal hardware one.
1307  * Just print additional message to show such #MC may be result of the
1308  * erratum.
1309  */
1310 const char *tdx_dump_mce_info(struct mce *m)
1311 {
1312 	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1313 		return NULL;
1314 
1315 	if (!paddr_is_tdx_private(m->addr))
1316 		return NULL;
1317 
1318 	return "TDX private memory error. Possible kernel bug.";
1319 }
1320 
1321 static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1322 					    u32 *nr_tdx_keyids)
1323 {
1324 	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1325 	int ret;
1326 
1327 	/*
1328 	 * IA32_MKTME_KEYID_PARTIONING:
1329 	 *   Bit [31:0]:	Number of MKTME KeyIDs.
1330 	 *   Bit [63:32]:	Number of TDX private KeyIDs.
1331 	 */
1332 	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1333 			&_nr_tdx_keyids);
1334 	if (ret || !_nr_tdx_keyids)
1335 		return -EINVAL;
1336 
1337 	/* TDX KeyIDs start after the last MKTME KeyID. */
1338 	_tdx_keyid_start = _nr_mktme_keyids + 1;
1339 
1340 	*tdx_keyid_start = _tdx_keyid_start;
1341 	*nr_tdx_keyids = _nr_tdx_keyids;
1342 
1343 	return 0;
1344 }
1345 
1346 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1347 {
1348 	struct tdx_memblock *tmb;
1349 
1350 	/*
1351 	 * This check assumes that the start_pfn<->end_pfn range does not
1352 	 * cross multiple @tdx_memlist entries.  A single memory online
1353 	 * event across multiple memblocks (from which @tdx_memlist
1354 	 * entries are derived at the time of module initialization) is
1355 	 * not possible.  This is because memory offline/online is done
1356 	 * on granularity of 'struct memory_block', and the hotpluggable
1357 	 * memory region (one memblock) must be multiple of memory_block.
1358 	 */
1359 	list_for_each_entry(tmb, &tdx_memlist, list) {
1360 		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1361 			return true;
1362 	}
1363 	return false;
1364 }
1365 
1366 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1367 			       void *v)
1368 {
1369 	struct memory_notify *mn = v;
1370 
1371 	if (action != MEM_GOING_ONLINE)
1372 		return NOTIFY_OK;
1373 
1374 	/*
1375 	 * Empty list means TDX isn't enabled.  Allow any memory
1376 	 * to go online.
1377 	 */
1378 	if (list_empty(&tdx_memlist))
1379 		return NOTIFY_OK;
1380 
1381 	/*
1382 	 * The TDX memory configuration is static and can not be
1383 	 * changed.  Reject onlining any memory which is outside of
1384 	 * the static configuration whether it supports TDX or not.
1385 	 */
1386 	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1387 		return NOTIFY_OK;
1388 
1389 	return NOTIFY_BAD;
1390 }
1391 
1392 static struct notifier_block tdx_memory_nb = {
1393 	.notifier_call = tdx_memory_notifier,
1394 };
1395 
1396 static void __init check_tdx_erratum(void)
1397 {
1398 	/*
1399 	 * These CPUs have an erratum.  A partial write from non-TD
1400 	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1401 	 * private memory poisons that memory, and a subsequent read of
1402 	 * that memory triggers #MC.
1403 	 */
1404 	switch (boot_cpu_data.x86_vfm) {
1405 	case INTEL_SAPPHIRERAPIDS_X:
1406 	case INTEL_EMERALDRAPIDS_X:
1407 		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1408 	}
1409 }
1410 
1411 void __init tdx_init(void)
1412 {
1413 	u32 tdx_keyid_start, nr_tdx_keyids;
1414 	int err;
1415 
1416 	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1417 	if (err)
1418 		return;
1419 
1420 	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1421 			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1422 
1423 	/*
1424 	 * The TDX module itself requires one 'global KeyID' to protect
1425 	 * its metadata.  If there's only one TDX KeyID, there won't be
1426 	 * any left for TDX guests thus there's no point to enable TDX
1427 	 * at all.
1428 	 */
1429 	if (nr_tdx_keyids < 2) {
1430 		pr_err("initialization failed: too few private KeyIDs available.\n");
1431 		return;
1432 	}
1433 
1434 	/*
1435 	 * At this point, hibernation_available() indicates whether or
1436 	 * not hibernation support has been permanently disabled.
1437 	 */
1438 	if (hibernation_available()) {
1439 		pr_err("initialization failed: Hibernation support is enabled\n");
1440 		return;
1441 	}
1442 
1443 	err = register_memory_notifier(&tdx_memory_nb);
1444 	if (err) {
1445 		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1446 				err);
1447 		return;
1448 	}
1449 
1450 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1451 	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1452 	acpi_suspend_lowlevel = NULL;
1453 #endif
1454 
1455 	/*
1456 	 * Just use the first TDX KeyID as the 'global KeyID' and
1457 	 * leave the rest for TDX guests.
1458 	 */
1459 	tdx_global_keyid = tdx_keyid_start;
1460 	tdx_guest_keyid_start = tdx_keyid_start + 1;
1461 	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1462 
1463 	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1464 
1465 	check_tdx_erratum();
1466 }
1467 
1468 const struct tdx_sys_info *tdx_get_sysinfo(void)
1469 {
1470 	const struct tdx_sys_info *p = NULL;
1471 
1472 	/* Make sure all fields in @tdx_sysinfo have been populated */
1473 	mutex_lock(&tdx_module_lock);
1474 	if (tdx_module_status == TDX_MODULE_INITIALIZED)
1475 		p = (const struct tdx_sys_info *)&tdx_sysinfo;
1476 	mutex_unlock(&tdx_module_lock);
1477 
1478 	return p;
1479 }
1480 EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
1481 
1482 u32 tdx_get_nr_guest_keyids(void)
1483 {
1484 	return tdx_nr_guest_keyids;
1485 }
1486 EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
1487 
1488 int tdx_guest_keyid_alloc(void)
1489 {
1490 	return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
1491 			       tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
1492 			       GFP_KERNEL);
1493 }
1494 EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
1495 
1496 void tdx_guest_keyid_free(unsigned int keyid)
1497 {
1498 	ida_free(&tdx_guest_keyid_pool, keyid);
1499 }
1500 EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
1501 
1502 static inline u64 tdx_tdr_pa(struct tdx_td *td)
1503 {
1504 	return page_to_phys(td->tdr_page);
1505 }
1506 
1507 /*
1508  * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
1509  * a CLFLUSH of pages is required before handing them to the TDX module.
1510  * Be conservative and make the code simpler by doing the CLFLUSH
1511  * unconditionally.
1512  */
1513 static void tdx_clflush_page(struct page *page)
1514 {
1515 	clflush_cache_range(page_to_virt(page), PAGE_SIZE);
1516 }
1517 
1518 noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
1519 {
1520 	args->rcx = td->tdvpr_pa;
1521 
1522 	return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
1523 }
1524 EXPORT_SYMBOL_GPL(tdh_vp_enter);
1525 
1526 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
1527 {
1528 	struct tdx_module_args args = {
1529 		.rcx = page_to_phys(tdcs_page),
1530 		.rdx = tdx_tdr_pa(td),
1531 	};
1532 
1533 	tdx_clflush_page(tdcs_page);
1534 	return seamcall(TDH_MNG_ADDCX, &args);
1535 }
1536 EXPORT_SYMBOL_GPL(tdh_mng_addcx);
1537 
1538 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
1539 {
1540 	struct tdx_module_args args = {
1541 		.rcx = gpa,
1542 		.rdx = tdx_tdr_pa(td),
1543 		.r8 = page_to_phys(page),
1544 		.r9 = page_to_phys(source),
1545 	};
1546 	u64 ret;
1547 
1548 	tdx_clflush_page(page);
1549 	ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
1550 
1551 	*ext_err1 = args.rcx;
1552 	*ext_err2 = args.rdx;
1553 
1554 	return ret;
1555 }
1556 EXPORT_SYMBOL_GPL(tdh_mem_page_add);
1557 
1558 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1559 {
1560 	struct tdx_module_args args = {
1561 		.rcx = gpa | level,
1562 		.rdx = tdx_tdr_pa(td),
1563 		.r8 = page_to_phys(page),
1564 	};
1565 	u64 ret;
1566 
1567 	tdx_clflush_page(page);
1568 	ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);
1569 
1570 	*ext_err1 = args.rcx;
1571 	*ext_err2 = args.rdx;
1572 
1573 	return ret;
1574 }
1575 EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
1576 
1577 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
1578 {
1579 	struct tdx_module_args args = {
1580 		.rcx = page_to_phys(tdcx_page),
1581 		.rdx = vp->tdvpr_pa,
1582 	};
1583 
1584 	tdx_clflush_page(tdcx_page);
1585 	return seamcall(TDH_VP_ADDCX, &args);
1586 }
1587 EXPORT_SYMBOL_GPL(tdh_vp_addcx);
1588 
1589 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1590 {
1591 	struct tdx_module_args args = {
1592 		.rcx = gpa | level,
1593 		.rdx = tdx_tdr_pa(td),
1594 		.r8 = page_to_phys(page),
1595 	};
1596 	u64 ret;
1597 
1598 	tdx_clflush_page(page);
1599 	ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
1600 
1601 	*ext_err1 = args.rcx;
1602 	*ext_err2 = args.rdx;
1603 
1604 	return ret;
1605 }
1606 EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
1607 
1608 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
1609 {
1610 	struct tdx_module_args args = {
1611 		.rcx = gpa | level,
1612 		.rdx = tdx_tdr_pa(td),
1613 	};
1614 	u64 ret;
1615 
1616 	ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);
1617 
1618 	*ext_err1 = args.rcx;
1619 	*ext_err2 = args.rdx;
1620 
1621 	return ret;
1622 }
1623 EXPORT_SYMBOL_GPL(tdh_mem_range_block);
1624 
1625 u64 tdh_mng_key_config(struct tdx_td *td)
1626 {
1627 	struct tdx_module_args args = {
1628 		.rcx = tdx_tdr_pa(td),
1629 	};
1630 
1631 	return seamcall(TDH_MNG_KEY_CONFIG, &args);
1632 }
1633 EXPORT_SYMBOL_GPL(tdh_mng_key_config);
1634 
1635 u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
1636 {
1637 	struct tdx_module_args args = {
1638 		.rcx = tdx_tdr_pa(td),
1639 		.rdx = hkid,
1640 	};
1641 
1642 	tdx_clflush_page(td->tdr_page);
1643 	return seamcall(TDH_MNG_CREATE, &args);
1644 }
1645 EXPORT_SYMBOL_GPL(tdh_mng_create);
1646 
1647 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
1648 {
1649 	struct tdx_module_args args = {
1650 		.rcx = vp->tdvpr_pa,
1651 		.rdx = tdx_tdr_pa(td),
1652 	};
1653 
1654 	tdx_clflush_page(vp->tdvpr_page);
1655 	return seamcall(TDH_VP_CREATE, &args);
1656 }
1657 EXPORT_SYMBOL_GPL(tdh_vp_create);
1658 
1659 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
1660 {
1661 	struct tdx_module_args args = {
1662 		.rcx = tdx_tdr_pa(td),
1663 		.rdx = field,
1664 	};
1665 	u64 ret;
1666 
1667 	ret = seamcall_ret(TDH_MNG_RD, &args);
1668 
1669 	/* R8: Content of the field, or 0 in case of error. */
1670 	*data = args.r8;
1671 
1672 	return ret;
1673 }
1674 EXPORT_SYMBOL_GPL(tdh_mng_rd);
1675 
1676 u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
1677 {
1678 	struct tdx_module_args args = {
1679 		.rcx = gpa,
1680 		.rdx = tdx_tdr_pa(td),
1681 	};
1682 	u64 ret;
1683 
1684 	ret = seamcall_ret(TDH_MR_EXTEND, &args);
1685 
1686 	*ext_err1 = args.rcx;
1687 	*ext_err2 = args.rdx;
1688 
1689 	return ret;
1690 }
1691 EXPORT_SYMBOL_GPL(tdh_mr_extend);
1692 
1693 u64 tdh_mr_finalize(struct tdx_td *td)
1694 {
1695 	struct tdx_module_args args = {
1696 		.rcx = tdx_tdr_pa(td),
1697 	};
1698 
1699 	return seamcall(TDH_MR_FINALIZE, &args);
1700 }
1701 EXPORT_SYMBOL_GPL(tdh_mr_finalize);
1702 
1703 u64 tdh_vp_flush(struct tdx_vp *vp)
1704 {
1705 	struct tdx_module_args args = {
1706 		.rcx = vp->tdvpr_pa,
1707 	};
1708 
1709 	return seamcall(TDH_VP_FLUSH, &args);
1710 }
1711 EXPORT_SYMBOL_GPL(tdh_vp_flush);
1712 
1713 u64 tdh_mng_vpflushdone(struct tdx_td *td)
1714 {
1715 	struct tdx_module_args args = {
1716 		.rcx = tdx_tdr_pa(td),
1717 	};
1718 
1719 	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
1720 }
1721 EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
1722 
1723 u64 tdh_mng_key_freeid(struct tdx_td *td)
1724 {
1725 	struct tdx_module_args args = {
1726 		.rcx = tdx_tdr_pa(td),
1727 	};
1728 
1729 	return seamcall(TDH_MNG_KEY_FREEID, &args);
1730 }
1731 EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
1732 
1733 u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
1734 {
1735 	struct tdx_module_args args = {
1736 		.rcx = tdx_tdr_pa(td),
1737 		.rdx = td_params,
1738 	};
1739 	u64 ret;
1740 
1741 	ret = seamcall_ret(TDH_MNG_INIT, &args);
1742 
1743 	*extended_err = args.rcx;
1744 
1745 	return ret;
1746 }
1747 EXPORT_SYMBOL_GPL(tdh_mng_init);
1748 
1749 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
1750 {
1751 	struct tdx_module_args args = {
1752 		.rcx = vp->tdvpr_pa,
1753 		.rdx = field,
1754 	};
1755 	u64 ret;
1756 
1757 	ret = seamcall_ret(TDH_VP_RD, &args);
1758 
1759 	/* R8: Content of the field, or 0 in case of error. */
1760 	*data = args.r8;
1761 
1762 	return ret;
1763 }
1764 EXPORT_SYMBOL_GPL(tdh_vp_rd);
1765 
1766 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
1767 {
1768 	struct tdx_module_args args = {
1769 		.rcx = vp->tdvpr_pa,
1770 		.rdx = field,
1771 		.r8 = data,
1772 		.r9 = mask,
1773 	};
1774 
1775 	return seamcall(TDH_VP_WR, &args);
1776 }
1777 EXPORT_SYMBOL_GPL(tdh_vp_wr);
1778 
1779 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
1780 {
1781 	struct tdx_module_args args = {
1782 		.rcx = vp->tdvpr_pa,
1783 		.rdx = initial_rcx,
1784 		.r8 = x2apicid,
1785 	};
1786 
1787 	/* apicid requires version == 1. */
1788 	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
1789 }
1790 EXPORT_SYMBOL_GPL(tdh_vp_init);
1791 
1792 /*
1793  * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats.
1794  * So despite the names, they must be interpted specially as described by the spec. Return
1795  * them only for error reporting purposes.
1796  */
1797 u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size)
1798 {
1799 	struct tdx_module_args args = {
1800 		.rcx = page_to_phys(page),
1801 	};
1802 	u64 ret;
1803 
1804 	ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
1805 
1806 	*tdx_pt = args.rcx;
1807 	*tdx_owner = args.rdx;
1808 	*tdx_size = args.r8;
1809 
1810 	return ret;
1811 }
1812 EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
1813 
1814 u64 tdh_mem_track(struct tdx_td *td)
1815 {
1816 	struct tdx_module_args args = {
1817 		.rcx = tdx_tdr_pa(td),
1818 	};
1819 
1820 	return seamcall(TDH_MEM_TRACK, &args);
1821 }
1822 EXPORT_SYMBOL_GPL(tdh_mem_track);
1823 
1824 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
1825 {
1826 	struct tdx_module_args args = {
1827 		.rcx = gpa | level,
1828 		.rdx = tdx_tdr_pa(td),
1829 	};
1830 	u64 ret;
1831 
1832 	ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args);
1833 
1834 	*ext_err1 = args.rcx;
1835 	*ext_err2 = args.rdx;
1836 
1837 	return ret;
1838 }
1839 EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
1840 
1841 u64 tdh_phymem_cache_wb(bool resume)
1842 {
1843 	struct tdx_module_args args = {
1844 		.rcx = resume ? 1 : 0,
1845 	};
1846 
1847 	return seamcall(TDH_PHYMEM_CACHE_WB, &args);
1848 }
1849 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
1850 
1851 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
1852 {
1853 	struct tdx_module_args args = {};
1854 
1855 	args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
1856 
1857 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1858 }
1859 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
1860 
1861 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
1862 {
1863 	struct tdx_module_args args = {};
1864 
1865 	args.rcx = mk_keyed_paddr(hkid, page);
1866 
1867 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1868 }
1869 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
1870 
1871 #ifdef CONFIG_KEXEC_CORE
1872 void tdx_cpu_flush_cache_for_kexec(void)
1873 {
1874 	lockdep_assert_preemption_disabled();
1875 
1876 	if (!this_cpu_read(cache_state_incoherent))
1877 		return;
1878 
1879 	/*
1880 	 * Private memory cachelines need to be clean at the time of
1881 	 * kexec.  Write them back now, as the caller promises that
1882 	 * there should be no more SEAMCALLs on this CPU.
1883 	 */
1884 	wbinvd();
1885 	this_cpu_write(cache_state_incoherent, false);
1886 }
1887 EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
1888 #endif
1889