xref: /linux/arch/x86/virt/vmx/tdx/tdx.c (revision fa79e55d467366a2c52c68a261a0d6ea5f8a6534)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright(c) 2023 Intel Corporation.
4  *
5  * Intel Trusted Domain Extensions (TDX) support
6  */
7 
8 #include <asm/page_types.h>
9 #define pr_fmt(fmt)	"virt/tdx: " fmt
10 
11 #include <linux/types.h>
12 #include <linux/cache.h>
13 #include <linux/init.h>
14 #include <linux/errno.h>
15 #include <linux/printk.h>
16 #include <linux/cpu.h>
17 #include <linux/spinlock.h>
18 #include <linux/percpu-defs.h>
19 #include <linux/mutex.h>
20 #include <linux/list.h>
21 #include <linux/memblock.h>
22 #include <linux/memory.h>
23 #include <linux/minmax.h>
24 #include <linux/sizes.h>
25 #include <linux/pfn.h>
26 #include <linux/align.h>
27 #include <linux/sort.h>
28 #include <linux/log2.h>
29 #include <linux/acpi.h>
30 #include <linux/suspend.h>
31 #include <linux/idr.h>
32 #include <asm/page.h>
33 #include <asm/special_insns.h>
34 #include <asm/msr-index.h>
35 #include <asm/msr.h>
36 #include <asm/cpufeature.h>
37 #include <asm/tdx.h>
38 #include <asm/cpu_device_id.h>
39 #include <asm/processor.h>
40 #include <asm/mce.h>
41 #include "tdx.h"
42 
43 static u32 tdx_global_keyid __ro_after_init;
44 static u32 tdx_guest_keyid_start __ro_after_init;
45 static u32 tdx_nr_guest_keyids __ro_after_init;
46 
47 static DEFINE_IDA(tdx_guest_keyid_pool);
48 
49 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
50 
51 static struct tdmr_info_list tdx_tdmr_list;
52 
53 static enum tdx_module_status_t tdx_module_status;
54 static DEFINE_MUTEX(tdx_module_lock);
55 
56 /* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
57 static LIST_HEAD(tdx_memlist);
58 
59 static struct tdx_sys_info tdx_sysinfo;
60 
61 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
62 
63 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
64 {
65 	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
66 }
67 
68 static inline void seamcall_err_ret(u64 fn, u64 err,
69 				    struct tdx_module_args *args)
70 {
71 	seamcall_err(fn, err, args);
72 	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
73 			args->rcx, args->rdx, args->r8);
74 	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
75 			args->r9, args->r10, args->r11);
76 }
77 
78 static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
79 				 u64 fn, struct tdx_module_args *args)
80 {
81 	u64 sret = sc_retry(func, fn, args);
82 
83 	if (sret == TDX_SUCCESS)
84 		return 0;
85 
86 	if (sret == TDX_SEAMCALL_VMFAILINVALID)
87 		return -ENODEV;
88 
89 	if (sret == TDX_SEAMCALL_GP)
90 		return -EOPNOTSUPP;
91 
92 	if (sret == TDX_SEAMCALL_UD)
93 		return -EACCES;
94 
95 	err_func(fn, sret, args);
96 	return -EIO;
97 }
98 
99 #define seamcall_prerr(__fn, __args)						\
100 	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
101 
102 #define seamcall_prerr_ret(__fn, __args)					\
103 	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
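
/*
 * Illustrative sketch of how these wrappers are used (see
 * read_sys_metadata_field() below for the in-tree user).  sc_retry()
 * (from <asm/tdx.h>) transparently retries SEAMCALLs that report
 * running out of entropy, and sc_retry_prerr() converts the well-known
 * "module not present or not usable" error classes into errnos and
 * prints anything else:
 *
 *	struct tdx_module_args args = { .rdx = SOME_FIELD_ID };
 *
 *	if (!seamcall_prerr_ret(TDH_SYS_RD, &args))
 *		val = args.r8;
 *
 * SOME_FIELD_ID is a placeholder for a real global metadata field ID.
 */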
104 
105 /*
106  * Do the module global initialization once and return its result.
107  * It can be done on any cpu.  It's always called with interrupts
108  * disabled.
109  */
110 static int try_init_module_global(void)
111 {
112 	struct tdx_module_args args = {};
113 	static DEFINE_RAW_SPINLOCK(sysinit_lock);
114 	static bool sysinit_done;
115 	static int sysinit_ret;
116 
117 	lockdep_assert_irqs_disabled();
118 
119 	raw_spin_lock(&sysinit_lock);
120 
121 	if (sysinit_done)
122 		goto out;
123 
124 	/* RCX is module attributes and all bits are reserved */
125 	args.rcx = 0;
126 	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
127 
128 	/*
129 	 * The first SEAMCALL also detects the TDX module, thus
130 	 * it can fail because the TDX module is not loaded.
131 	 * Dump a message to let the user know.
132 	 */
133 	if (sysinit_ret == -ENODEV)
134 		pr_err("module not loaded\n");
135 
136 	sysinit_done = true;
137 out:
138 	raw_spin_unlock(&sysinit_lock);
139 	return sysinit_ret;
140 }
141 
142 /**
143  * tdx_cpu_enable - Enable TDX on local cpu
144  *
145  * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
146  * global initialization SEAMCALL if not done) on the local cpu to make
147  * this cpu ready to run any other SEAMCALLs.
148  *
149  * Always call this function via IPI function calls.
150  *
151  * Return 0 on success, otherwise errors.
152  */
153 int tdx_cpu_enable(void)
154 {
155 	struct tdx_module_args args = {};
156 	int ret;
157 
158 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
159 		return -ENODEV;
160 
161 	lockdep_assert_irqs_disabled();
162 
163 	if (__this_cpu_read(tdx_lp_initialized))
164 		return 0;
165 
166 	/*
167 	 * The TDX module global initialization is the very first step
168 	 * to enable TDX.  Need to do it first (if hasn't been done)
169 	 * before the per-cpu initialization.
170 	 */
171 	ret = try_init_module_global();
172 	if (ret)
173 		return ret;
174 
175 	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
176 	if (ret)
177 		return ret;
178 
179 	__this_cpu_write(tdx_lp_initialized, true);
180 
181 	return 0;
182 }
183 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
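
/*
 * Illustrative only (hypothetical wrapper, not part of this file):
 * because tdx_cpu_enable() must run with interrupts disabled, callers
 * typically invoke it from IPI context after doing VMXON on that cpu:
 *
 *	static void __tdx_cpu_enable_ipi(void *ret)
 *	{
 *		*(int *)ret = tdx_cpu_enable();
 *	}
 *
 *	smp_call_function_single(cpu, __tdx_cpu_enable_ipi, &ret, true);
 */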
184 
185 /*
186  * Add a memory region as a TDX memory block.  The caller must make sure
187  * all memory regions are added in address ascending order and don't
188  * overlap.
189  */
190 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
191 			    unsigned long end_pfn, int nid)
192 {
193 	struct tdx_memblock *tmb;
194 
195 	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
196 	if (!tmb)
197 		return -ENOMEM;
198 
199 	INIT_LIST_HEAD(&tmb->list);
200 	tmb->start_pfn = start_pfn;
201 	tmb->end_pfn = end_pfn;
202 	tmb->nid = nid;
203 
204 	/* @tmb_list is protected by mem_hotplug_lock */
205 	list_add_tail(&tmb->list, tmb_list);
206 	return 0;
207 }
208 
209 static void free_tdx_memlist(struct list_head *tmb_list)
210 {
211 	/* @tmb_list is protected by mem_hotplug_lock */
212 	while (!list_empty(tmb_list)) {
213 		struct tdx_memblock *tmb = list_first_entry(tmb_list,
214 				struct tdx_memblock, list);
215 
216 		list_del(&tmb->list);
217 		kfree(tmb);
218 	}
219 }
220 
221 /*
222  * Ensure that all memblock memory regions are convertible to TDX
223  * memory.  Once this has been established, stash the memblock
224  * ranges off in a secondary structure because memblock is modified
225  * in memory hotplug while TDX memory regions are fixed.
226  */
227 static int build_tdx_memlist(struct list_head *tmb_list)
228 {
229 	unsigned long start_pfn, end_pfn;
230 	int i, nid, ret;
231 
232 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
233 		/*
234 		 * The first 1MB is not reported as TDX convertible memory.
235 		 * Although the first 1MB is always reserved and won't end up
236 		 * in the page allocator, it is still in memblock's memory
237 		 * regions.  Skip them manually to exclude them as TDX memory.
238 		 */
239 		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
240 		if (start_pfn >= end_pfn)
241 			continue;
242 
243 		/*
244 		 * Add the memory regions as TDX memory.  The regions in
245 		 * memblock are already guaranteed to be in address
246 		 * ascending order and don't overlap.
247 		 */
248 		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
249 		if (ret)
250 			goto err;
251 	}
252 
253 	return 0;
254 err:
255 	free_tdx_memlist(tmb_list);
256 	return ret;
257 }
258 
259 static int read_sys_metadata_field(u64 field_id, u64 *data)
260 {
261 	struct tdx_module_args args = {};
262 	int ret;
263 
264 	/*
265 	 * TDH.SYS.RD -- reads one global metadata field
266 	 *  - RDX (in): the field to read
267 	 *  - R8 (out): the field data
268 	 */
269 	args.rdx = field_id;
270 	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
271 	if (ret)
272 		return ret;
273 
274 	*data = args.r8;
275 
276 	return 0;
277 }
278 
279 #include "tdx_global_metadata.c"
280 
281 static int check_features(struct tdx_sys_info *sysinfo)
282 {
283 	u64 tdx_features0 = sysinfo->features.tdx_features0;
284 
285 	if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
286 		pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
287 		return -EINVAL;
288 	}
289 
290 	return 0;
291 }
292 
293 /* Calculate the actual TDMR size */
294 static int tdmr_size_single(u16 max_reserved_per_tdmr)
295 {
296 	int tdmr_sz;
297 
298 	/*
299 	 * The actual size of TDMR depends on the maximum
300 	 * number of reserved areas.
301 	 */
302 	tdmr_sz = sizeof(struct tdmr_info);
303 	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
304 
305 	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
306 }
307 
308 static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
309 			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
310 {
311 	size_t tdmr_sz, tdmr_array_sz;
312 	void *tdmr_array;
313 
314 	tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
315 	tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
316 
317 	/*
318 	 * To keep things simple, allocate all TDMRs together.
319 	 * The buffer needs to be physically contiguous to make
320 	 * sure each TDMR is physically contiguous.
321 	 */
322 	tdmr_array = alloc_pages_exact(tdmr_array_sz,
323 			GFP_KERNEL | __GFP_ZERO);
324 	if (!tdmr_array)
325 		return -ENOMEM;
326 
327 	tdmr_list->tdmrs = tdmr_array;
328 
329 	/*
330 	 * Keep the size of TDMR to find the target TDMR
331 	 * at a given index in the TDMR list.
332 	 */
333 	tdmr_list->tdmr_sz = tdmr_sz;
334 	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
335 	tdmr_list->nr_consumed_tdmrs = 0;
336 
337 	return 0;
338 }
339 
340 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
341 {
342 	free_pages_exact(tdmr_list->tdmrs,
343 			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
344 }
345 
346 /* Get the TDMR from the list at the given index. */
347 static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
348 				    int idx)
349 {
350 	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
351 
352 	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
353 }
354 
355 #define TDMR_ALIGNMENT		SZ_1G
356 #define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
357 #define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)
358 
359 static inline u64 tdmr_end(struct tdmr_info *tdmr)
360 {
361 	return tdmr->base + tdmr->size;
362 }
363 
364 /*
365  * Take the memory referenced in @tmb_list and populate the
366  * preallocated @tdmr_list, following all the special alignment
367  * and size rules for TDMR.
368  */
369 static int fill_out_tdmrs(struct list_head *tmb_list,
370 			  struct tdmr_info_list *tdmr_list)
371 {
372 	struct tdx_memblock *tmb;
373 	int tdmr_idx = 0;
374 
375 	/*
376 	 * Loop over TDX memory regions and fill out TDMRs to cover them.
377 	 * To keep it simple, always try to use one TDMR to cover one
378 	 * memory region.
379 	 *
380 	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
381 	 * typically consumes fewer than 10 of those.  This code is
382 	 * dumb and simple and may use more TDMRs than is strictly
383 	 * required.
384 	 */
385 	list_for_each_entry(tmb, tmb_list, list) {
386 		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
387 		u64 start, end;
388 
389 		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
390 		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
391 
392 		/*
393 		 * A valid size indicates the current TDMR has already
394 		 * been filled out to cover the previous memory region(s).
395 		 */
396 		if (tdmr->size) {
397 			/*
398 			 * Loop to the next if the current memory region
399 			 * has already been fully covered.
400 			 */
401 			if (end <= tdmr_end(tdmr))
402 				continue;
403 
404 			/* Otherwise, skip the already covered part. */
405 			if (start < tdmr_end(tdmr))
406 				start = tdmr_end(tdmr);
407 
408 			/*
409 			 * Create a new TDMR to cover the current memory
410 			 * region, or the remaining part of it.
411 			 */
412 			tdmr_idx++;
413 			if (tdmr_idx >= tdmr_list->max_tdmrs) {
414 				pr_warn("initialization failed: TDMRs exhausted.\n");
415 				return -ENOSPC;
416 			}
417 
418 			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
419 		}
420 
421 		tdmr->base = start;
422 		tdmr->size = end - start;
423 	}
424 
425 	/* @tdmr_idx is always the index of the last valid TDMR. */
426 	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
427 
428 	/*
429 	 * Warn early that the kernel is about to run out of TDMRs.
430 	 *
431 	 * This is an indication that TDMR allocation has to be
432 	 * reworked to be smarter to avoid running into this limit.
433 	 */
434 	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
435 		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
436 				tdmr_list->nr_consumed_tdmrs,
437 				tdmr_list->max_tdmrs);
438 
439 	return 0;
440 }
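
/*
 * Worked example (illustrative numbers): with TDX memory blocks
 * [1MB, 3GB) and [4GB, 6GB), fill_out_tdmrs() produces two TDMRs
 * after 1GB alignment:
 *
 *	TDMR 0: [0, 3GB)	covers the first block
 *	TDMR 1: [4GB, 6GB)	covers the second block
 *
 * A following block whose aligned range is already covered by the
 * current TDMR consumes no new TDMR; a partially covered block starts
 * its new TDMR at tdmr_end() of the current one.
 */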
441 
442 /*
443  * Calculate PAMT size given a TDMR and a page size.  The returned
444  * PAMT size is always aligned up to 4K page boundary.
445  */
446 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
447 				      u16 pamt_entry_size)
448 {
449 	unsigned long pamt_sz, nr_pamt_entries;
450 
451 	switch (pgsz) {
452 	case TDX_PS_4K:
453 		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
454 		break;
455 	case TDX_PS_2M:
456 		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
457 		break;
458 	case TDX_PS_1G:
459 		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
460 		break;
461 	default:
462 		WARN_ON_ONCE(1);
463 		return 0;
464 	}
465 
466 	pamt_sz = nr_pamt_entries * pamt_entry_size;
467 	/* TDX requires the PAMT size to be 4K aligned */
468 	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
469 
470 	return pamt_sz;
471 }
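
/*
 * Worked example (illustrative, assuming a 16-byte PAMT entry size):
 * a 1GB TDMR contains 1GB / 4KB = 262144 4K pages, so its 4K PAMT is
 * 262144 * 16 = 4MB.  The 2M PAMT is 512 * 16 = 8KB and the 1G PAMT
 * is a single entry rounded up to 4KB, i.e. the 4K PAMT dominates at
 * roughly 16 / 4096 = ~0.4% of the covered memory.
 */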
472 
473 /*
474  * Locate a NUMA node which should hold the allocation of the @tdmr
475  * PAMT.  This node will have some memory covered by the TDMR.  The
476  * relative amount of memory covered is not considered.
477  */
478 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
479 {
480 	struct tdx_memblock *tmb;
481 
482 	/*
483 	 * A TDMR must cover at least part of one TMB.  That TMB will end
484 	 * after the TDMR begins.  But, that TMB may have started before
485 	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
486 	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
487 	 */
488 	list_for_each_entry(tmb, tmb_list, list) {
489 		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
490 			return tmb->nid;
491 	}
492 
493 	/*
494 	 * Fall back to allocating the TDMR's metadata from node 0 when
495 	 * no TDX memory block can be found.  This should never happen
496 	 * since TDMRs originate from TDX memory blocks.
497 	 */
498 	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
499 			tdmr->base, tdmr_end(tdmr));
500 	return 0;
501 }
502 
503 /*
504  * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
505  * within @tdmr, and set up PAMTs for @tdmr.
506  */
507 static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
508 			    struct list_head *tmb_list,
509 			    u16 pamt_entry_size[])
510 {
511 	unsigned long pamt_base[TDX_PS_NR];
512 	unsigned long pamt_size[TDX_PS_NR];
513 	unsigned long tdmr_pamt_base;
514 	unsigned long tdmr_pamt_size;
515 	struct page *pamt;
516 	int pgsz, nid;
517 
518 	nid = tdmr_get_nid(tdmr, tmb_list);
519 
520 	/*
521 	 * Calculate the PAMT size for each TDX supported page size
522 	 * and the total PAMT size.
523 	 */
524 	tdmr_pamt_size = 0;
525 	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
526 		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
527 					pamt_entry_size[pgsz]);
528 		tdmr_pamt_size += pamt_size[pgsz];
529 	}
530 
531 	/*
532 	 * Allocate one chunk of physically contiguous memory for all
533 	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
534 	 * in overlapped TDMRs.
535 	 */
536 	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
537 			nid, &node_online_map);
538 	if (!pamt)
539 		return -ENOMEM;
540 
541 	/*
542 	 * Break the contiguous allocation back up into the
543 	 * individual PAMTs for each page size.
544 	 */
545 	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
546 	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
547 		pamt_base[pgsz] = tdmr_pamt_base;
548 		tdmr_pamt_base += pamt_size[pgsz];
549 	}
550 
551 	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
552 	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
553 	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
554 	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
555 	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
556 	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
557 
558 	return 0;
559 }
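
/*
 * Resulting layout of the single contiguous allocation (illustrative):
 *
 *	pamt_4k_base = page_to_pfn(pamt) << PAGE_SHIFT
 *	pamt_2m_base = pamt_4k_base + pamt_4k_size
 *	pamt_1g_base = pamt_2m_base + pamt_2m_size
 *
 * tdmr_get_pamt() below relies on this: the 4K PAMT base is the start
 * of the whole allocation and the three sizes sum to its length.
 */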
560 
561 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
562 			  unsigned long *pamt_size)
563 {
564 	unsigned long pamt_bs, pamt_sz;
565 
566 	/*
567 	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
568 	 * should always point to the beginning of that allocation.
569 	 */
570 	pamt_bs = tdmr->pamt_4k_base;
571 	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
572 
573 	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
574 
575 	*pamt_base = pamt_bs;
576 	*pamt_size = pamt_sz;
577 }
578 
579 static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
580 		void (*pamt_func)(unsigned long base, unsigned long size))
581 {
582 	unsigned long pamt_base, pamt_size;
583 
584 	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
585 
586 	/* Do nothing if PAMT hasn't been allocated for this TDMR */
587 	if (!pamt_size)
588 		return;
589 
590 	if (WARN_ON_ONCE(!pamt_base))
591 		return;
592 
593 	pamt_func(pamt_base, pamt_size);
594 }
595 
596 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
597 {
598 	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
599 }
600 
601 static void tdmr_free_pamt(struct tdmr_info *tdmr)
602 {
603 	tdmr_do_pamt_func(tdmr, free_pamt);
604 }
605 
606 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
607 {
608 	int i;
609 
610 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
611 		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
612 }
613 
614 /* Allocate and set up PAMTs for all TDMRs */
615 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
616 				 struct list_head *tmb_list,
617 				 u16 pamt_entry_size[])
618 {
619 	int i, ret = 0;
620 
621 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
622 		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
623 				pamt_entry_size);
624 		if (ret)
625 			goto err;
626 	}
627 
628 	return 0;
629 err:
630 	tdmrs_free_pamt_all(tdmr_list);
631 	return ret;
632 }
633 
634 /*
635  * Convert TDX private pages back to normal by using MOVDIR64B to
636  * clear these pages.  Note this function doesn't flush the cache of
637  * these TDX private pages.  The caller must take care of that.
638  */
639 static void reset_tdx_pages(unsigned long base, unsigned long size)
640 {
641 	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
642 	unsigned long phys, end;
643 
644 	end = base + size;
645 	for (phys = base; phys < end; phys += 64)
646 		movdir64b(__va(phys), zero_page);
647 
648 	/*
649 	 * MOVDIR64B uses WC protocol.  Use memory barrier to
650 	 * make sure any later user of these pages sees the
651 	 * updated data.
652 	 */
653 	mb();
654 }
655 
656 static void tdmr_reset_pamt(struct tdmr_info *tdmr)
657 {
658 	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
659 }
660 
661 static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
662 {
663 	int i;
664 
665 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
666 		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
667 }
668 
669 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
670 {
671 	unsigned long pamt_size = 0;
672 	int i;
673 
674 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
675 		unsigned long base, size;
676 
677 		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
678 		pamt_size += size;
679 	}
680 
681 	return pamt_size / 1024;
682 }
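
/*
 * Back-of-the-envelope (illustrative, assuming 16-byte PAMT entries):
 * a host with 1TB of TDX-usable memory ends up with roughly
 * 1TB * 16 / 4096 = 4GB of PAMT, matching the "~1/256th of system RAM"
 * figure quoted in config_global_keyid() below.
 */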
683 
684 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
685 			      u64 size, u16 max_reserved_per_tdmr)
686 {
687 	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
688 	int idx = *p_idx;
689 
690 	/* Reserved area must be 4K aligned in offset and size */
691 	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
692 		return -EINVAL;
693 
694 	if (idx >= max_reserved_per_tdmr) {
695 		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
696 				tdmr->base, tdmr_end(tdmr));
697 		return -ENOSPC;
698 	}
699 
700 	/*
701 	 * Consume one reserved area per call.  Make no effort to
702 	 * optimize or reduce the number of reserved areas which are
703 	 * consumed by contiguous reserved areas, for instance.
704 	 */
705 	rsvd_areas[idx].offset = addr - tdmr->base;
706 	rsvd_areas[idx].size = size;
707 
708 	*p_idx = idx + 1;
709 
710 	return 0;
711 }
712 
713 /*
714  * Go through @tmb_list to find holes between memory areas.  If any of
715  * those holes fall within @tdmr, set up a TDMR reserved area to cover
716  * the hole.
717  */
718 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
719 				    struct tdmr_info *tdmr,
720 				    int *rsvd_idx,
721 				    u16 max_reserved_per_tdmr)
722 {
723 	struct tdx_memblock *tmb;
724 	u64 prev_end;
725 	int ret;
726 
727 	/*
728 	 * Start looking for reserved blocks at the
729 	 * beginning of the TDMR.
730 	 */
731 	prev_end = tdmr->base;
732 	list_for_each_entry(tmb, tmb_list, list) {
733 		u64 start, end;
734 
735 		start = PFN_PHYS(tmb->start_pfn);
736 		end   = PFN_PHYS(tmb->end_pfn);
737 
738 		/* Break if this region is after the TDMR */
739 		if (start >= tdmr_end(tdmr))
740 			break;
741 
742 		/* Exclude regions before this TDMR */
743 		if (end < tdmr->base)
744 			continue;
745 
746 		/*
747 		 * Skip over memory areas that
748 		 * have already been dealt with.
749 		 */
750 		if (start <= prev_end) {
751 			prev_end = end;
752 			continue;
753 		}
754 
755 		/* Add the hole before this region */
756 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
757 				start - prev_end,
758 				max_reserved_per_tdmr);
759 		if (ret)
760 			return ret;
761 
762 		prev_end = end;
763 	}
764 
765 	/* Add the hole after the last region if it exists. */
766 	if (prev_end < tdmr_end(tdmr)) {
767 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
768 				tdmr_end(tdmr) - prev_end,
769 				max_reserved_per_tdmr);
770 		if (ret)
771 			return ret;
772 	}
773 
774 	return 0;
775 }
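
/*
 * Worked example (illustrative numbers): for a TDMR covering [0, 3GB)
 * built from TDX memory blocks [1MB, 1GB) and [2GB, 3GB), the loop
 * above inserts two reserved areas for the holes:
 *
 *	[0, 1MB)	the hole below the first block
 *	[1GB, 2GB)	the gap between the two blocks
 *
 * No trailing reserved area is needed because the last block ends
 * exactly at tdmr_end().
 */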
776 
777 /*
778  * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
779  * overlaps with @tdmr, set up a TDMR reserved area to cover the
780  * overlapping part.
781  */
782 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
783 				    struct tdmr_info *tdmr,
784 				    int *rsvd_idx,
785 				    u16 max_reserved_per_tdmr)
786 {
787 	int i, ret;
788 
789 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
790 		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
791 		unsigned long pamt_base, pamt_size, pamt_end;
792 
793 		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
794 		/* Each TDMR must already have PAMT allocated */
795 		WARN_ON_ONCE(!pamt_size || !pamt_base);
796 
797 		pamt_end = pamt_base + pamt_size;
798 		/* Skip PAMTs outside of the given TDMR */
799 		if ((pamt_end <= tdmr->base) ||
800 				(pamt_base >= tdmr_end(tdmr)))
801 			continue;
802 
803 		/* Only mark the part within the TDMR as reserved */
804 		if (pamt_base < tdmr->base)
805 			pamt_base = tdmr->base;
806 		if (pamt_end > tdmr_end(tdmr))
807 			pamt_end = tdmr_end(tdmr);
808 
809 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
810 				pamt_end - pamt_base,
811 				max_reserved_per_tdmr);
812 		if (ret)
813 			return ret;
814 	}
815 
816 	return 0;
817 }
818 
819 /* Compare function called by sort() for TDMR reserved areas */
820 static int rsvd_area_cmp_func(const void *a, const void *b)
821 {
822 	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
823 	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
824 
825 	if (r1->offset + r1->size <= r2->offset)
826 		return -1;
827 	if (r1->offset >= r2->offset + r2->size)
828 		return 1;
829 
830 	/* Reserved areas cannot overlap.  The caller must guarantee that. */
831 	WARN_ON_ONCE(1);
832 	return -1;
833 }
834 
835 /*
836  * Populate reserved areas for the given @tdmr, including memory holes
837  * (via @tmb_list) and PAMTs (via @tdmr_list).
838  */
839 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
840 				    struct list_head *tmb_list,
841 				    struct tdmr_info_list *tdmr_list,
842 				    u16 max_reserved_per_tdmr)
843 {
844 	int ret, rsvd_idx = 0;
845 
846 	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
847 			max_reserved_per_tdmr);
848 	if (ret)
849 		return ret;
850 
851 	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
852 			max_reserved_per_tdmr);
853 	if (ret)
854 		return ret;
855 
856 	/* TDX requires reserved areas listed in address ascending order */
857 	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
858 			rsvd_area_cmp_func, NULL);
859 
860 	return 0;
861 }
862 
863 /*
864  * Populate reserved areas for all TDMRs in @tdmr_list, including memory
865  * holes (via @tmb_list) and PAMTs.
866  */
867 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
868 					 struct list_head *tmb_list,
869 					 u16 max_reserved_per_tdmr)
870 {
871 	int i;
872 
873 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
874 		int ret;
875 
876 		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
877 				tmb_list, tdmr_list, max_reserved_per_tdmr);
878 		if (ret)
879 			return ret;
880 	}
881 
882 	return 0;
883 }
884 
885 /*
886  * Construct a list of TDMRs on the preallocated space in @tdmr_list
887  * to cover all TDX memory regions in @tmb_list based on the TDX module
888  * TDMR global information in @sysinfo_tdmr.
889  */
890 static int construct_tdmrs(struct list_head *tmb_list,
891 			   struct tdmr_info_list *tdmr_list,
892 			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
893 {
894 	u16 pamt_entry_size[TDX_PS_NR] = {
895 		sysinfo_tdmr->pamt_4k_entry_size,
896 		sysinfo_tdmr->pamt_2m_entry_size,
897 		sysinfo_tdmr->pamt_1g_entry_size,
898 	};
899 	int ret;
900 
901 	ret = fill_out_tdmrs(tmb_list, tdmr_list);
902 	if (ret)
903 		return ret;
904 
905 	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
906 	if (ret)
907 		return ret;
908 
909 	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
910 			sysinfo_tdmr->max_reserved_per_tdmr);
911 	if (ret)
912 		tdmrs_free_pamt_all(tdmr_list);
913 
914 	/*
915 	 * The tdmr_info_list is read-only from here on out.
916 	 * Ensure that these writes are seen by other CPUs.
917 	 * Pairs with a smp_rmb() in is_pamt_page().
918 	 */
919 	smp_wmb();
920 
921 	return ret;
922 }
923 
924 static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
925 {
926 	struct tdx_module_args args = {};
927 	u64 *tdmr_pa_array;
928 	size_t array_sz;
929 	int i, ret;
930 
931 	/*
932 	 * TDMRs are passed to the TDX module via an array of physical
933 	 * addresses of each TDMR.  The array itself also has certain
934 	 * alignment requirement.
935 	 */
936 	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
937 	array_sz = roundup_pow_of_two(array_sz);
938 	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
939 		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
940 
941 	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
942 	if (!tdmr_pa_array)
943 		return -ENOMEM;
944 
945 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
946 		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
947 
948 	args.rcx = __pa(tdmr_pa_array);
949 	args.rdx = tdmr_list->nr_consumed_tdmrs;
950 	args.r8 = global_keyid;
951 	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
952 
953 	/* Free the array as it is not required anymore. */
954 	kfree(tdmr_pa_array);
955 
956 	return ret;
957 }
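
/*
 * Sizing example (illustrative): with 3 consumed TDMRs the PA array is
 * 24 bytes, rounded up to the next power of two (32), and then to
 * TDMR_INFO_PA_ARRAY_ALIGNMENT if that is larger.  Rounding the size
 * up to a power of two also makes kzalloc() return a naturally aligned
 * buffer, which is what satisfies the alignment requirement mentioned
 * above.
 */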
958 
959 static int do_global_key_config(void *unused)
960 {
961 	struct tdx_module_args args = {};
962 
963 	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
964 }
965 
966 /*
967  * Attempt to configure the global KeyID on all physical packages.
968  *
969  * This requires running code on at least one CPU in each package.
970  * Configuring the global KeyID (and thus TDMR initialization) will
971  * fail if any package in the system has no online CPUs.
972  *
973  * This code takes no affirmative steps to online CPUs.  Callers (aka.
974  * KVM) can ensure success by ensuring sufficient CPUs are online and
975  * can run SEAMCALLs.
976  */
977 static int config_global_keyid(void)
978 {
979 	cpumask_var_t packages;
980 	int cpu, ret = -EINVAL;
981 
982 	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
983 		return -ENOMEM;
984 
985 	/*
986 	 * Hardware doesn't guarantee cache coherency across different
987 	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
988 	 * (associated with KeyID 0) before the TDX module can use the
989 	 * global KeyID to access the PAMT.  Given PAMTs are potentially
990 	 * large (~1/256th of system RAM), just use WBINVD.
991 	 */
992 	wbinvd_on_all_cpus();
993 
994 	for_each_online_cpu(cpu) {
995 		/*
996 		 * The key configuration only needs to be done once per
997 		 * package and will return an error if configured more
998 		 * than once.  Avoid doing it multiple times per package.
999 		 */
1000 		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1001 					packages))
1002 			continue;
1003 
1004 		/*
1005 		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
1006 		 * different cpus.  Do it one by one.
1007 		 */
1008 		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1009 		if (ret)
1010 			break;
1011 	}
1012 
1013 	free_cpumask_var(packages);
1014 	return ret;
1015 }
1016 
1017 static int init_tdmr(struct tdmr_info *tdmr)
1018 {
1019 	u64 next;
1020 
1021 	/*
1022 	 * Initializing a TDMR can be time consuming.  To avoid long
1023 	 * SEAMCALLs, the TDX module may only initialize a part of the
1024 	 * TDMR in each call.
1025 	 */
1026 	do {
1027 		struct tdx_module_args args = {
1028 			.rcx = tdmr->base,
1029 		};
1030 		int ret;
1031 
1032 		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1033 		if (ret)
1034 			return ret;
1035 		/*
1036 		 * RDX contains 'next-to-initialize' address if
1037 		 * TDH.SYS.TDMR.INIT did not fully complete and
1038 		 * should be retried.
1039 		 */
1040 		next = args.rdx;
1041 		cond_resched();
1042 		/* Keep making SEAMCALLs until the TDMR is done */
1043 	} while (next < tdmr->base + tdmr->size);
1044 
1045 	return 0;
1046 }
1047 
1048 static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1049 {
1050 	int i;
1051 
1052 	/*
1053 	 * This operation is costly.  It can be parallelized,
1054 	 * but keep it simple for now.
1055 	 */
1056 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1057 		int ret;
1058 
1059 		ret = init_tdmr(tdmr_entry(tdmr_list, i));
1060 		if (ret)
1061 			return ret;
1062 	}
1063 
1064 	return 0;
1065 }
1066 
1067 static int init_tdx_module(void)
1068 {
1069 	int ret;
1070 
1071 	ret = get_tdx_sys_info(&tdx_sysinfo);
1072 	if (ret)
1073 		return ret;
1074 
1075 	/* Check whether the kernel can support this module */
1076 	ret = check_features(&tdx_sysinfo);
1077 	if (ret)
1078 		return ret;
1079 
1080 	/*
1081 	 * To keep things simple, assume that all TDX-protected memory
1082 	 * will come from the page allocator.  Make sure all pages in the
1083 	 * page allocator are TDX-usable memory.
1084 	 *
1085 	 * Build the list of "TDX-usable" memory regions which cover all
1086 	 * pages in the page allocator to guarantee that.  Do it while
1087 	 * holding mem_hotplug_lock read-lock as the memory hotplug code
1088 	 * path reads the @tdx_memlist to reject any new memory.
1089 	 */
1090 	get_online_mems();
1091 
1092 	ret = build_tdx_memlist(&tdx_memlist);
1093 	if (ret)
1094 		goto out_put_tdxmem;
1095 
1096 	/* Allocate enough space for constructing TDMRs */
1097 	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
1098 	if (ret)
1099 		goto err_free_tdxmem;
1100 
1101 	/* Cover all TDX-usable memory regions in TDMRs */
1102 	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
1103 	if (ret)
1104 		goto err_free_tdmrs;
1105 
1106 	/* Pass the TDMRs and the global KeyID to the TDX module */
1107 	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1108 	if (ret)
1109 		goto err_free_pamts;
1110 
1111 	/* Config the key of global KeyID on all packages */
1112 	ret = config_global_keyid();
1113 	if (ret)
1114 		goto err_reset_pamts;
1115 
1116 	/* Initialize TDMRs to complete the TDX module initialization */
1117 	ret = init_tdmrs(&tdx_tdmr_list);
1118 	if (ret)
1119 		goto err_reset_pamts;
1120 
1121 	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1122 
1123 out_put_tdxmem:
1124 	/*
1125 	 * @tdx_memlist is written here and read at memory hotplug time.
1126 	 * Lock out memory hotplug code while building it.
1127 	 */
1128 	put_online_mems();
1129 	return ret;
1130 
1131 err_reset_pamts:
1132 	/*
1133 	 * Part of PAMTs may already have been initialized by the
1134 	 * TDX module.  Flush cache before returning PAMTs back
1135 	 * to the kernel.
1136 	 */
1137 	wbinvd_on_all_cpus();
1138 	/*
1139 	 * According to the TDX hardware spec, if the platform
1140 	 * doesn't have the "partial write machine check"
1141 	 * erratum, any kernel read/write will never cause #MC
1142 	 * in kernel space, thus it's OK to not convert PAMTs
1143 	 * back to normal.  But do the conversion anyway here
1144 	 * as suggested by the TDX spec.
1145 	 */
1146 	tdmrs_reset_pamt_all(&tdx_tdmr_list);
1147 err_free_pamts:
1148 	tdmrs_free_pamt_all(&tdx_tdmr_list);
1149 err_free_tdmrs:
1150 	free_tdmr_list(&tdx_tdmr_list);
1151 err_free_tdxmem:
1152 	free_tdx_memlist(&tdx_memlist);
1153 	goto out_put_tdxmem;
1154 }
1155 
1156 static int __tdx_enable(void)
1157 {
1158 	int ret;
1159 
1160 	ret = init_tdx_module();
1161 	if (ret) {
1162 		pr_err("module initialization failed (%d)\n", ret);
1163 		tdx_module_status = TDX_MODULE_ERROR;
1164 		return ret;
1165 	}
1166 
1167 	pr_info("module initialized\n");
1168 	tdx_module_status = TDX_MODULE_INITIALIZED;
1169 
1170 	return 0;
1171 }
1172 
1173 /**
1174  * tdx_enable - Enable TDX module to make it ready to run TDX guests
1175  *
1176  * This function assumes the caller has: 1) held read lock of CPU hotplug
1177  * lock to prevent any new cpu from becoming online; 2) done both VMXON
1178  * and tdx_cpu_enable() on all online cpus.
1179  *
1180  * This function requires there's at least one online cpu for each CPU
1181  * package to succeed.
1182  *
1183  * This function can be called in parallel by multiple callers.
1184  *
1185  * Return 0 if TDX is enabled successfully, otherwise error.
1186  */
1187 int tdx_enable(void)
1188 {
1189 	int ret;
1190 
1191 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1192 		return -ENODEV;
1193 
1194 	lockdep_assert_cpus_held();
1195 
1196 	mutex_lock(&tdx_module_lock);
1197 
1198 	switch (tdx_module_status) {
1199 	case TDX_MODULE_UNINITIALIZED:
1200 		ret = __tdx_enable();
1201 		break;
1202 	case TDX_MODULE_INITIALIZED:
1203 		/* Already initialized, great, tell the caller. */
1204 		ret = 0;
1205 		break;
1206 	default:
1207 		/* Failed to initialize in the previous attempts */
1208 		ret = -EINVAL;
1209 		break;
1210 	}
1211 
1212 	mutex_unlock(&tdx_module_lock);
1213 
1214 	return ret;
1215 }
1216 EXPORT_SYMBOL_GPL(tdx_enable);
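
/*
 * Illustrative usage sketch (hypothetical caller; helper and variable
 * names are made up), mirroring the requirements documented above:
 *
 *	static atomic_t enable_failed;
 *
 *	static void vmxon_and_tdx_cpu_enable(void *unused)
 *	{
 *		... VMXON on this cpu first, then:
 *		if (tdx_cpu_enable())
 *			atomic_inc(&enable_failed);
 *	}
 *
 *	cpus_read_lock();
 *	on_each_cpu(vmxon_and_tdx_cpu_enable, NULL, 1);
 *	if (!atomic_read(&enable_failed))
 *		ret = tdx_enable();
 *	cpus_read_unlock();
 */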
1217 
1218 static bool is_pamt_page(unsigned long phys)
1219 {
1220 	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1221 	int i;
1222 
1223 	/* Ensure that all remote 'tdmr_list' writes are visible: */
1224 	smp_rmb();
1225 
1226 	/*
1227 	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
1228 	 * is initialized.  The 'tdmr_list' was initialized long ago
1229 	 * and is now read-only.
1230 	 */
1231 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1232 		unsigned long base, size;
1233 
1234 		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1235 
1236 		if (phys >= base && phys < (base + size))
1237 			return true;
1238 	}
1239 
1240 	return false;
1241 }
1242 
1243 /*
1244  * Return whether the memory page at the given physical address is TDX
1245  * private memory or not.
1246  *
1247  * This can be imprecise for two known reasons:
1248  * 1. PAMTs are private memory and exist before the TDX module is
1249  *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
1250  *    short window that occurs once per boot.
1251  * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1252  *    page.  However, the page can still cause #MC until it has been
1253  *    fully converted to shared using 64-byte writes like MOVDIR64B.
1254  *    Buggy hosts might still leave #MC-causing memory in place which
1255  *    this function can not detect.
1256  */
1257 static bool paddr_is_tdx_private(unsigned long phys)
1258 {
1259 	struct tdx_module_args args = {
1260 		.rcx = phys & PAGE_MASK,
1261 	};
1262 	u64 sret;
1263 
1264 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1265 		return false;
1266 
1267 	/* Get page type from the TDX module */
1268 	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
1269 
1270 	/*
1271 	 * The SEAMCALL will not return success unless there is a
1272 	 * working, "ready" TDX module.  Assume an absence of TDX
1273 	 * private pages until SEAMCALL is working.
1274 	 */
1275 	if (sret)
1276 		return false;
1277 
1278 	/*
1279 	 * SEAMCALL was successful -- read page type (via RCX):
1280 	 *
1281 	 *  - PT_NDA:	Page is not used by the TDX module
1282 	 *  - PT_RSVD:	Reserved for Non-TDX use
1283 	 *  - Others:	Page is used by the TDX module
1284 	 *
1285 	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
1286 	 * private memory.
1287 	 */
1288 	switch (args.rcx) {
1289 	case PT_NDA:
1290 		return false;
1291 	case PT_RSVD:
1292 		return is_pamt_page(phys);
1293 	default:
1294 		return true;
1295 	}
1296 }
1297 
1298 /*
1299  * Some TDX-capable CPUs have an erratum.  A write to TDX private
1300  * memory poisons that memory, and a subsequent read of that memory
1301  * triggers #MC.
1302  *
1303  * Help distinguish erratum-triggered #MCs from a normal hardware one.
1304  * Just print an additional message to show that such an #MC may be a
1305  * result of the erratum.
1306  */
1307 const char *tdx_dump_mce_info(struct mce *m)
1308 {
1309 	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1310 		return NULL;
1311 
1312 	if (!paddr_is_tdx_private(m->addr))
1313 		return NULL;
1314 
1315 	return "TDX private memory error. Possible kernel bug.";
1316 }
1317 
1318 static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1319 					    u32 *nr_tdx_keyids)
1320 {
1321 	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1322 	int ret;
1323 
1324 	/*
1325 	 * IA32_MKTME_KEYID_PARTITIONING:
1326 	 *   Bit [31:0]:	Number of MKTME KeyIDs.
1327 	 *   Bit [63:32]:	Number of TDX private KeyIDs.
1328 	 */
1329 	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1330 			&_nr_tdx_keyids);
1331 	if (ret || !_nr_tdx_keyids)
1332 		return -EINVAL;
1333 
1334 	/* TDX KeyIDs start after the last MKTME KeyID. */
1335 	_tdx_keyid_start = _nr_mktme_keyids + 1;
1336 
1337 	*tdx_keyid_start = _tdx_keyid_start;
1338 	*nr_tdx_keyids = _nr_tdx_keyids;
1339 
1340 	return 0;
1341 }
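
/*
 * Example (illustrative numbers): if the MSR reports 96 MKTME KeyIDs
 * and 32 TDX KeyIDs, then TDX private KeyIDs occupy [97, 129), i.e.
 * they start right after the last MKTME KeyID (KeyIDs 1..96 here).
 */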
1342 
1343 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1344 {
1345 	struct tdx_memblock *tmb;
1346 
1347 	/*
1348 	 * This check assumes that the start_pfn<->end_pfn range does not
1349 	 * cross multiple @tdx_memlist entries.  A single memory online
1350 	 * event across multiple memblocks (from which @tdx_memlist
1351 	 * entries are derived at the time of module initialization) is
1352 	 * not possible.  This is because memory offline/online is done
1353 	 * on granularity of 'struct memory_block', and the hotpluggable
1354 	 * memory region (one memblock) must be a multiple of memory_block.
1355 	 */
1356 	list_for_each_entry(tmb, &tdx_memlist, list) {
1357 		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1358 			return true;
1359 	}
1360 	return false;
1361 }
1362 
1363 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1364 			       void *v)
1365 {
1366 	struct memory_notify *mn = v;
1367 
1368 	if (action != MEM_GOING_ONLINE)
1369 		return NOTIFY_OK;
1370 
1371 	/*
1372 	 * Empty list means TDX isn't enabled.  Allow any memory
1373 	 * to go online.
1374 	 */
1375 	if (list_empty(&tdx_memlist))
1376 		return NOTIFY_OK;
1377 
1378 	/*
1379 	 * The TDX memory configuration is static and can not be
1380 	 * changed.  Reject onlining any memory which is outside of
1381 	 * the static configuration whether it supports TDX or not.
1382 	 */
1383 	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1384 		return NOTIFY_OK;
1385 
1386 	return NOTIFY_BAD;
1387 }
1388 
1389 static struct notifier_block tdx_memory_nb = {
1390 	.notifier_call = tdx_memory_notifier,
1391 };
1392 
1393 static void __init check_tdx_erratum(void)
1394 {
1395 	/*
1396 	 * These CPUs have an erratum.  A partial write from non-TD
1397 	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1398 	 * private memory poisons that memory, and a subsequent read of
1399 	 * that memory triggers #MC.
1400 	 */
1401 	switch (boot_cpu_data.x86_vfm) {
1402 	case INTEL_SAPPHIRERAPIDS_X:
1403 	case INTEL_EMERALDRAPIDS_X:
1404 		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1405 	}
1406 }
1407 
1408 void __init tdx_init(void)
1409 {
1410 	u32 tdx_keyid_start, nr_tdx_keyids;
1411 	int err;
1412 
1413 	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1414 	if (err)
1415 		return;
1416 
1417 	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1418 			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1419 
1420 	/*
1421 	 * The TDX module itself requires one 'global KeyID' to protect
1422 	 * its metadata.  If there's only one TDX KeyID, there won't be
1423 	 * any left for TDX guests, thus there's no point in enabling TDX
1424 	 * at all.
1425 	 */
1426 	if (nr_tdx_keyids < 2) {
1427 		pr_err("initialization failed: too few private KeyIDs available.\n");
1428 		return;
1429 	}
1430 
1431 	/*
1432 	 * At this point, hibernation_available() indicates whether or
1433 	 * not hibernation support has been permanently disabled.
1434 	 */
1435 	if (hibernation_available()) {
1436 		pr_err("initialization failed: Hibernation support is enabled\n");
1437 		return;
1438 	}
1439 
1440 	err = register_memory_notifier(&tdx_memory_nb);
1441 	if (err) {
1442 		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1443 				err);
1444 		return;
1445 	}
1446 
1447 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1448 	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1449 	acpi_suspend_lowlevel = NULL;
1450 #endif
1451 
1452 	/*
1453 	 * Just use the first TDX KeyID as the 'global KeyID' and
1454 	 * leave the rest for TDX guests.
1455 	 */
1456 	tdx_global_keyid = tdx_keyid_start;
1457 	tdx_guest_keyid_start = tdx_keyid_start + 1;
1458 	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1459 
1460 	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1461 
1462 	check_tdx_erratum();
1463 }
1464 
1465 const struct tdx_sys_info *tdx_get_sysinfo(void)
1466 {
1467 	const struct tdx_sys_info *p = NULL;
1468 
1469 	/* Make sure all fields in @tdx_sysinfo have been populated */
1470 	mutex_lock(&tdx_module_lock);
1471 	if (tdx_module_status == TDX_MODULE_INITIALIZED)
1472 		p = (const struct tdx_sys_info *)&tdx_sysinfo;
1473 	mutex_unlock(&tdx_module_lock);
1474 
1475 	return p;
1476 }
1477 EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
1478 
1479 u32 tdx_get_nr_guest_keyids(void)
1480 {
1481 	return tdx_nr_guest_keyids;
1482 }
1483 EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
1484 
1485 int tdx_guest_keyid_alloc(void)
1486 {
1487 	return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
1488 			       tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
1489 			       GFP_KERNEL);
1490 }
1491 EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
1492 
1493 void tdx_guest_keyid_free(unsigned int keyid)
1494 {
1495 	ida_free(&tdx_guest_keyid_pool, keyid);
1496 }
1497 EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
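
/*
 * Illustrative pairing (hypothetical caller): a VMM allocates one
 * guest KeyID per TD and returns it to the pool on teardown:
 *
 *	int keyid = tdx_guest_keyid_alloc();
 *	if (keyid < 0)
 *		return keyid;
 *	...
 *	tdx_guest_keyid_free(keyid);
 */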
1498 
1499 static inline u64 tdx_tdr_pa(struct tdx_td *td)
1500 {
1501 	return page_to_phys(td->tdr_page);
1502 }
1503 
1504 static inline u64 tdx_tdvpr_pa(struct tdx_vp *td)
1505 {
1506 	return page_to_phys(td->tdvpr_page);
1507 }
1508 
1509 /*
1510  * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
1511  * a CLFLUSH of pages is required before handing them to the TDX module.
1512  * Be conservative and make the code simpler by doing the CLFLUSH
1513  * unconditionally.
1514  */
1515 static void tdx_clflush_page(struct page *page)
1516 {
1517 	clflush_cache_range(page_to_virt(page), PAGE_SIZE);
1518 }
1519 
1520 noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
1521 {
1522 	args->rcx = tdx_tdvpr_pa(td);
1523 
1524 	return __seamcall_saved_ret(TDH_VP_ENTER, args);
1525 }
1526 EXPORT_SYMBOL_GPL(tdh_vp_enter);
1527 
1528 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
1529 {
1530 	struct tdx_module_args args = {
1531 		.rcx = page_to_phys(tdcs_page),
1532 		.rdx = tdx_tdr_pa(td),
1533 	};
1534 
1535 	tdx_clflush_page(tdcs_page);
1536 	return seamcall(TDH_MNG_ADDCX, &args);
1537 }
1538 EXPORT_SYMBOL_GPL(tdh_mng_addcx);
1539 
1540 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
1541 {
1542 	struct tdx_module_args args = {
1543 		.rcx = gpa,
1544 		.rdx = tdx_tdr_pa(td),
1545 		.r8 = page_to_phys(page),
1546 		.r9 = page_to_phys(source),
1547 	};
1548 	u64 ret;
1549 
1550 	tdx_clflush_page(page);
1551 	ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
1552 
1553 	*ext_err1 = args.rcx;
1554 	*ext_err2 = args.rdx;
1555 
1556 	return ret;
1557 }
1558 EXPORT_SYMBOL_GPL(tdh_mem_page_add);
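
/*
 * Illustrative only: the u64 returned by these tdh_*() wrappers is a
 * TDX status code, not an errno.  A hypothetical caller would check it
 * against TDX_SUCCESS and use the extended error outputs (RCX/RDX of
 * the SEAMCALL) purely for diagnostics:
 *
 *	u64 err = tdh_mem_page_add(td, gpa, page, src, &e1, &e2);
 *	if (err != TDX_SUCCESS)
 *		pr_err("TDH_MEM_PAGE_ADD failed: 0x%llx (0x%llx, 0x%llx)\n",
 *			err, e1, e2);
 */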
1559 
1560 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1561 {
1562 	struct tdx_module_args args = {
1563 		.rcx = gpa | level,
1564 		.rdx = tdx_tdr_pa(td),
1565 		.r8 = page_to_phys(page),
1566 	};
1567 	u64 ret;
1568 
1569 	tdx_clflush_page(page);
1570 	ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);
1571 
1572 	*ext_err1 = args.rcx;
1573 	*ext_err2 = args.rdx;
1574 
1575 	return ret;
1576 }
1577 EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
1578 
1579 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
1580 {
1581 	struct tdx_module_args args = {
1582 		.rcx = page_to_phys(tdcx_page),
1583 		.rdx = tdx_tdvpr_pa(vp),
1584 	};
1585 
1586 	tdx_clflush_page(tdcx_page);
1587 	return seamcall(TDH_VP_ADDCX, &args);
1588 }
1589 EXPORT_SYMBOL_GPL(tdh_vp_addcx);
1590 
1591 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1592 {
1593 	struct tdx_module_args args = {
1594 		.rcx = gpa | level,
1595 		.rdx = tdx_tdr_pa(td),
1596 		.r8 = page_to_phys(page),
1597 	};
1598 	u64 ret;
1599 
1600 	tdx_clflush_page(page);
1601 	ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
1602 
1603 	*ext_err1 = args.rcx;
1604 	*ext_err2 = args.rdx;
1605 
1606 	return ret;
1607 }
1608 EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
1609 
1610 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
1611 {
1612 	struct tdx_module_args args = {
1613 		.rcx = gpa | level,
1614 		.rdx = tdx_tdr_pa(td),
1615 	};
1616 	u64 ret;
1617 
1618 	ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);
1619 
1620 	*ext_err1 = args.rcx;
1621 	*ext_err2 = args.rdx;
1622 
1623 	return ret;
1624 }
1625 EXPORT_SYMBOL_GPL(tdh_mem_range_block);
1626 
1627 u64 tdh_mng_key_config(struct tdx_td *td)
1628 {
1629 	struct tdx_module_args args = {
1630 		.rcx = tdx_tdr_pa(td),
1631 	};
1632 
1633 	return seamcall(TDH_MNG_KEY_CONFIG, &args);
1634 }
1635 EXPORT_SYMBOL_GPL(tdh_mng_key_config);
1636 
1637 u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
1638 {
1639 	struct tdx_module_args args = {
1640 		.rcx = tdx_tdr_pa(td),
1641 		.rdx = hkid,
1642 	};
1643 
1644 	tdx_clflush_page(td->tdr_page);
1645 	return seamcall(TDH_MNG_CREATE, &args);
1646 }
1647 EXPORT_SYMBOL_GPL(tdh_mng_create);
1648 
1649 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
1650 {
1651 	struct tdx_module_args args = {
1652 		.rcx = tdx_tdvpr_pa(vp),
1653 		.rdx = tdx_tdr_pa(td),
1654 	};
1655 
1656 	tdx_clflush_page(vp->tdvpr_page);
1657 	return seamcall(TDH_VP_CREATE, &args);
1658 }
1659 EXPORT_SYMBOL_GPL(tdh_vp_create);
1660 
1661 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
1662 {
1663 	struct tdx_module_args args = {
1664 		.rcx = tdx_tdr_pa(td),
1665 		.rdx = field,
1666 	};
1667 	u64 ret;
1668 
1669 	ret = seamcall_ret(TDH_MNG_RD, &args);
1670 
1671 	/* R8: Content of the field, or 0 in case of error. */
1672 	*data = args.r8;
1673 
1674 	return ret;
1675 }
1676 EXPORT_SYMBOL_GPL(tdh_mng_rd);
1677 
1678 u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
1679 {
1680 	struct tdx_module_args args = {
1681 		.rcx = gpa,
1682 		.rdx = tdx_tdr_pa(td),
1683 	};
1684 	u64 ret;
1685 
1686 	ret = seamcall_ret(TDH_MR_EXTEND, &args);
1687 
1688 	*ext_err1 = args.rcx;
1689 	*ext_err2 = args.rdx;
1690 
1691 	return ret;
1692 }
1693 EXPORT_SYMBOL_GPL(tdh_mr_extend);
1694 
1695 u64 tdh_mr_finalize(struct tdx_td *td)
1696 {
1697 	struct tdx_module_args args = {
1698 		.rcx = tdx_tdr_pa(td),
1699 	};
1700 
1701 	return seamcall(TDH_MR_FINALIZE, &args);
1702 }
1703 EXPORT_SYMBOL_GPL(tdh_mr_finalize);
1704 
1705 u64 tdh_vp_flush(struct tdx_vp *vp)
1706 {
1707 	struct tdx_module_args args = {
1708 		.rcx = tdx_tdvpr_pa(vp),
1709 	};
1710 
1711 	return seamcall(TDH_VP_FLUSH, &args);
1712 }
1713 EXPORT_SYMBOL_GPL(tdh_vp_flush);
1714 
1715 u64 tdh_mng_vpflushdone(struct tdx_td *td)
1716 {
1717 	struct tdx_module_args args = {
1718 		.rcx = tdx_tdr_pa(td),
1719 	};
1720 
1721 	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
1722 }
1723 EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
1724 
1725 u64 tdh_mng_key_freeid(struct tdx_td *td)
1726 {
1727 	struct tdx_module_args args = {
1728 		.rcx = tdx_tdr_pa(td),
1729 	};
1730 
1731 	return seamcall(TDH_MNG_KEY_FREEID, &args);
1732 }
1733 EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
1734 
1735 u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
1736 {
1737 	struct tdx_module_args args = {
1738 		.rcx = tdx_tdr_pa(td),
1739 		.rdx = td_params,
1740 	};
1741 	u64 ret;
1742 
1743 	ret = seamcall_ret(TDH_MNG_INIT, &args);
1744 
1745 	*extended_err = args.rcx;
1746 
1747 	return ret;
1748 }
1749 EXPORT_SYMBOL_GPL(tdh_mng_init);
1750 
1751 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
1752 {
1753 	struct tdx_module_args args = {
1754 		.rcx = tdx_tdvpr_pa(vp),
1755 		.rdx = field,
1756 	};
1757 	u64 ret;
1758 
1759 	ret = seamcall_ret(TDH_VP_RD, &args);
1760 
1761 	/* R8: Content of the field, or 0 in case of error. */
1762 	*data = args.r8;
1763 
1764 	return ret;
1765 }
1766 EXPORT_SYMBOL_GPL(tdh_vp_rd);
1767 
1768 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
1769 {
1770 	struct tdx_module_args args = {
1771 		.rcx = tdx_tdvpr_pa(vp),
1772 		.rdx = field,
1773 		.r8 = data,
1774 		.r9 = mask,
1775 	};
1776 
1777 	return seamcall(TDH_VP_WR, &args);
1778 }
1779 EXPORT_SYMBOL_GPL(tdh_vp_wr);
1780 
1781 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
1782 {
1783 	struct tdx_module_args args = {
1784 		.rcx = tdx_tdvpr_pa(vp),
1785 		.rdx = initial_rcx,
1786 		.r8 = x2apicid,
1787 	};
1788 
1789 	/* apicid requires version == 1. */
1790 	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
1791 }
1792 EXPORT_SYMBOL_GPL(tdh_vp_init);
1793 
1794 /*
1795  * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX-defined formats.
1796  * So despite the names, they must be interpreted specially as described by the spec. Return
1797  * them only for error reporting purposes.
1798  */
1799 u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size)
1800 {
1801 	struct tdx_module_args args = {
1802 		.rcx = page_to_phys(page),
1803 	};
1804 	u64 ret;
1805 
1806 	ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
1807 
1808 	*tdx_pt = args.rcx;
1809 	*tdx_owner = args.rdx;
1810 	*tdx_size = args.r8;
1811 
1812 	return ret;
1813 }
1814 EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
1815 
1816 u64 tdh_mem_track(struct tdx_td *td)
1817 {
1818 	struct tdx_module_args args = {
1819 		.rcx = tdx_tdr_pa(td),
1820 	};
1821 
1822 	return seamcall(TDH_MEM_TRACK, &args);
1823 }
1824 EXPORT_SYMBOL_GPL(tdh_mem_track);
1825 
1826 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
1827 {
1828 	struct tdx_module_args args = {
1829 		.rcx = gpa | level,
1830 		.rdx = tdx_tdr_pa(td),
1831 	};
1832 	u64 ret;
1833 
1834 	ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args);
1835 
1836 	*ext_err1 = args.rcx;
1837 	*ext_err2 = args.rdx;
1838 
1839 	return ret;
1840 }
1841 EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
1842 
1843 u64 tdh_phymem_cache_wb(bool resume)
1844 {
1845 	struct tdx_module_args args = {
1846 		.rcx = resume ? 1 : 0,
1847 	};
1848 
1849 	return seamcall(TDH_PHYMEM_CACHE_WB, &args);
1850 }
1851 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
1852 
1853 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
1854 {
1855 	struct tdx_module_args args = {};
1856 
1857 	args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
1858 
1859 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1860 }
1861 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
1862 
1863 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
1864 {
1865 	struct tdx_module_args args = {};
1866 
1867 	args.rcx = mk_keyed_paddr(hkid, page);
1868 
1869 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1870 }
1871 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
1872