xref: /linux/arch/x86/virt/vmx/tdx/tdx.c (revision c94cd9508b1335b949fd13ebd269313c65492df0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright(c) 2023 Intel Corporation.
4  *
5  * Intel Trusted Domain Extensions (TDX) support
6  */
7 
8 #define pr_fmt(fmt)	"virt/tdx: " fmt
9 
10 #include <linux/types.h>
11 #include <linux/cache.h>
12 #include <linux/init.h>
13 #include <linux/errno.h>
14 #include <linux/printk.h>
15 #include <linux/cpu.h>
16 #include <linux/spinlock.h>
17 #include <linux/percpu-defs.h>
18 #include <linux/mutex.h>
19 #include <linux/list.h>
20 #include <linux/memblock.h>
21 #include <linux/memory.h>
22 #include <linux/minmax.h>
23 #include <linux/sizes.h>
24 #include <linux/pfn.h>
25 #include <linux/align.h>
26 #include <linux/sort.h>
27 #include <linux/log2.h>
28 #include <linux/acpi.h>
29 #include <linux/suspend.h>
30 #include <asm/page.h>
31 #include <asm/special_insns.h>
32 #include <asm/msr-index.h>
33 #include <asm/msr.h>
34 #include <asm/cpufeature.h>
35 #include <asm/tdx.h>
36 #include <asm/cpu_device_id.h>
37 #include <asm/processor.h>
38 #include <asm/mce.h>
39 #include "tdx.h"
40 
41 static u32 tdx_global_keyid __ro_after_init;
42 static u32 tdx_guest_keyid_start __ro_after_init;
43 static u32 tdx_nr_guest_keyids __ro_after_init;
44 
45 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
46 
47 static struct tdmr_info_list tdx_tdmr_list;
48 
49 static enum tdx_module_status_t tdx_module_status;
50 static DEFINE_MUTEX(tdx_module_lock);
51 
52 /* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
53 static LIST_HEAD(tdx_memlist);
54 
55 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
56 
57 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
58 {
59 	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
60 }
61 
62 static inline void seamcall_err_ret(u64 fn, u64 err,
63 				    struct tdx_module_args *args)
64 {
65 	seamcall_err(fn, err, args);
66 	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
67 			args->rcx, args->rdx, args->r8);
68 	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
69 			args->r9, args->r10, args->r11);
70 }
71 
72 static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
73 				 u64 fn, struct tdx_module_args *args)
74 {
75 	u64 sret = sc_retry(func, fn, args);
76 
77 	if (sret == TDX_SUCCESS)
78 		return 0;
79 
80 	if (sret == TDX_SEAMCALL_VMFAILINVALID)
81 		return -ENODEV;
82 
83 	if (sret == TDX_SEAMCALL_GP)
84 		return -EOPNOTSUPP;
85 
86 	if (sret == TDX_SEAMCALL_UD)
87 		return -EACCES;
88 
89 	err_func(fn, sret, args);
90 	return -EIO;
91 }
92 
/*
 * SEAMCALL through __seamcall; an unclassified failure is logged by
 * seamcall_err().  Evaluates to the errno from sc_retry_prerr().
 */
#define seamcall_prerr(_leaf, _regs)						\
	sc_retry_prerr(__seamcall, seamcall_err, (_leaf), (_regs))

/*
 * SEAMCALL through __seamcall_ret; an unclassified failure is logged
 * by seamcall_err_ret(), which also dumps registers from the args.
 */
#define seamcall_prerr_ret(_leaf, _regs)					\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (_leaf), (_regs))
98 
99 /*
100  * Do the module global initialization once and return its result.
101  * It can be done on any cpu.  It's always called with interrupts
102  * disabled.
103  */
104 static int try_init_module_global(void)
105 {
106 	struct tdx_module_args args = {};
107 	static DEFINE_RAW_SPINLOCK(sysinit_lock);
108 	static bool sysinit_done;
109 	static int sysinit_ret;
110 
111 	lockdep_assert_irqs_disabled();
112 
113 	raw_spin_lock(&sysinit_lock);
114 
115 	if (sysinit_done)
116 		goto out;
117 
118 	/* RCX is module attributes and all bits are reserved */
119 	args.rcx = 0;
120 	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
121 
122 	/*
123 	 * The first SEAMCALL also detects the TDX module, thus
124 	 * it can fail due to the TDX module is not loaded.
125 	 * Dump message to let the user know.
126 	 */
127 	if (sysinit_ret == -ENODEV)
128 		pr_err("module not loaded\n");
129 
130 	sysinit_done = true;
131 out:
132 	raw_spin_unlock(&sysinit_lock);
133 	return sysinit_ret;
134 }
135 
136 /**
137  * tdx_cpu_enable - Enable TDX on local cpu
138  *
139  * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
140  * global initialization SEAMCALL if not done) on local cpu to make this
141  * cpu be ready to run any other SEAMCALLs.
142  *
143  * Always call this function via IPI function calls.
144  *
145  * Return 0 on success, otherwise errors.
146  */
147 int tdx_cpu_enable(void)
148 {
149 	struct tdx_module_args args = {};
150 	int ret;
151 
152 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
153 		return -ENODEV;
154 
155 	lockdep_assert_irqs_disabled();
156 
157 	if (__this_cpu_read(tdx_lp_initialized))
158 		return 0;
159 
160 	/*
161 	 * The TDX module global initialization is the very first step
162 	 * to enable TDX.  Need to do it first (if hasn't been done)
163 	 * before the per-cpu initialization.
164 	 */
165 	ret = try_init_module_global();
166 	if (ret)
167 		return ret;
168 
169 	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
170 	if (ret)
171 		return ret;
172 
173 	__this_cpu_write(tdx_lp_initialized, true);
174 
175 	return 0;
176 }
177 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
178 
179 /*
180  * Add a memory region as a TDX memory block.  The caller must make sure
181  * all memory regions are added in address ascending order and don't
182  * overlap.
183  */
184 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
185 			    unsigned long end_pfn, int nid)
186 {
187 	struct tdx_memblock *tmb;
188 
189 	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
190 	if (!tmb)
191 		return -ENOMEM;
192 
193 	INIT_LIST_HEAD(&tmb->list);
194 	tmb->start_pfn = start_pfn;
195 	tmb->end_pfn = end_pfn;
196 	tmb->nid = nid;
197 
198 	/* @tmb_list is protected by mem_hotplug_lock */
199 	list_add_tail(&tmb->list, tmb_list);
200 	return 0;
201 }
202 
203 static void free_tdx_memlist(struct list_head *tmb_list)
204 {
205 	/* @tmb_list is protected by mem_hotplug_lock */
206 	while (!list_empty(tmb_list)) {
207 		struct tdx_memblock *tmb = list_first_entry(tmb_list,
208 				struct tdx_memblock, list);
209 
210 		list_del(&tmb->list);
211 		kfree(tmb);
212 	}
213 }
214 
215 /*
216  * Ensure that all memblock memory regions are convertible to TDX
217  * memory.  Once this has been established, stash the memblock
218  * ranges off in a secondary structure because memblock is modified
219  * in memory hotplug while TDX memory regions are fixed.
220  */
221 static int build_tdx_memlist(struct list_head *tmb_list)
222 {
223 	unsigned long start_pfn, end_pfn;
224 	int i, nid, ret;
225 
226 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
227 		/*
228 		 * The first 1MB is not reported as TDX convertible memory.
229 		 * Although the first 1MB is always reserved and won't end up
230 		 * to the page allocator, it is still in memblock's memory
231 		 * regions.  Skip them manually to exclude them as TDX memory.
232 		 */
233 		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
234 		if (start_pfn >= end_pfn)
235 			continue;
236 
237 		/*
238 		 * Add the memory regions as TDX memory.  The regions in
239 		 * memblock has already guaranteed they are in address
240 		 * ascending order and don't overlap.
241 		 */
242 		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
243 		if (ret)
244 			goto err;
245 	}
246 
247 	return 0;
248 err:
249 	free_tdx_memlist(tmb_list);
250 	return ret;
251 }
252 
253 static int read_sys_metadata_field(u64 field_id, u64 *data)
254 {
255 	struct tdx_module_args args = {};
256 	int ret;
257 
258 	/*
259 	 * TDH.SYS.RD -- reads one global metadata field
260 	 *  - RDX (in): the field to read
261 	 *  - R8 (out): the field data
262 	 */
263 	args.rdx = field_id;
264 	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
265 	if (ret)
266 		return ret;
267 
268 	*data = args.r8;
269 
270 	return 0;
271 }
272 
273 static int read_sys_metadata_field16(u64 field_id,
274 				     int offset,
275 				     struct tdx_tdmr_sysinfo *ts)
276 {
277 	u16 *ts_member = ((void *)ts) + offset;
278 	u64 tmp;
279 	int ret;
280 
281 	if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
282 			MD_FIELD_ID_ELE_SIZE_16BIT))
283 		return -EINVAL;
284 
285 	ret = read_sys_metadata_field(field_id, &tmp);
286 	if (ret)
287 		return ret;
288 
289 	*ts_member = tmp;
290 
291 	return 0;
292 }
293 
294 struct field_mapping {
295 	u64 field_id;
296 	int offset;
297 };
298 
299 #define TD_SYSINFO_MAP(_field_id, _offset) \
300 	{ .field_id = MD_FIELD_ID_##_field_id,	   \
301 	  .offset   = offsetof(struct tdx_tdmr_sysinfo, _offset) }
302 
303 /* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
304 static const struct field_mapping fields[] = {
305 	TD_SYSINFO_MAP(MAX_TDMRS,	      max_tdmrs),
306 	TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
307 	TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE,    pamt_entry_size[TDX_PS_4K]),
308 	TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE,    pamt_entry_size[TDX_PS_2M]),
309 	TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE,    pamt_entry_size[TDX_PS_1G]),
310 };
311 
312 static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
313 {
314 	int ret;
315 	int i;
316 
317 	/* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
318 	for (i = 0; i < ARRAY_SIZE(fields); i++) {
319 		ret = read_sys_metadata_field16(fields[i].field_id,
320 						fields[i].offset,
321 						tdmr_sysinfo);
322 		if (ret)
323 			return ret;
324 	}
325 
326 	return 0;
327 }
328 
329 /* Calculate the actual TDMR size */
330 static int tdmr_size_single(u16 max_reserved_per_tdmr)
331 {
332 	int tdmr_sz;
333 
334 	/*
335 	 * The actual size of TDMR depends on the maximum
336 	 * number of reserved areas.
337 	 */
338 	tdmr_sz = sizeof(struct tdmr_info);
339 	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
340 
341 	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
342 }
343 
344 static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
345 			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
346 {
347 	size_t tdmr_sz, tdmr_array_sz;
348 	void *tdmr_array;
349 
350 	tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
351 	tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;
352 
353 	/*
354 	 * To keep things simple, allocate all TDMRs together.
355 	 * The buffer needs to be physically contiguous to make
356 	 * sure each TDMR is physically contiguous.
357 	 */
358 	tdmr_array = alloc_pages_exact(tdmr_array_sz,
359 			GFP_KERNEL | __GFP_ZERO);
360 	if (!tdmr_array)
361 		return -ENOMEM;
362 
363 	tdmr_list->tdmrs = tdmr_array;
364 
365 	/*
366 	 * Keep the size of TDMR to find the target TDMR
367 	 * at a given index in the TDMR list.
368 	 */
369 	tdmr_list->tdmr_sz = tdmr_sz;
370 	tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
371 	tdmr_list->nr_consumed_tdmrs = 0;
372 
373 	return 0;
374 }
375 
376 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
377 {
378 	free_pages_exact(tdmr_list->tdmrs,
379 			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
380 }
381 
382 /* Get the TDMR from the list at the given index. */
383 static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
384 				    int idx)
385 {
386 	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
387 
388 	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
389 }
390 
/* TDMR base and size are rounded to 1G boundaries by fill_out_tdmrs() */
#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_pa)	ALIGN_DOWN((_pa), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_pa)	ALIGN((_pa), TDMR_ALIGNMENT)
394 
395 static inline u64 tdmr_end(struct tdmr_info *tdmr)
396 {
397 	return tdmr->base + tdmr->size;
398 }
399 
400 /*
401  * Take the memory referenced in @tmb_list and populate the
402  * preallocated @tdmr_list, following all the special alignment
403  * and size rules for TDMR.
404  */
405 static int fill_out_tdmrs(struct list_head *tmb_list,
406 			  struct tdmr_info_list *tdmr_list)
407 {
408 	struct tdx_memblock *tmb;
409 	int tdmr_idx = 0;
410 
411 	/*
412 	 * Loop over TDX memory regions and fill out TDMRs to cover them.
413 	 * To keep it simple, always try to use one TDMR to cover one
414 	 * memory region.
415 	 *
416 	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
417 	 * typically only consumes less than 10 of those.  This code is
418 	 * dumb and simple and may use more TMDRs than is strictly
419 	 * required.
420 	 */
421 	list_for_each_entry(tmb, tmb_list, list) {
422 		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
423 		u64 start, end;
424 
425 		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
426 		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
427 
428 		/*
429 		 * A valid size indicates the current TDMR has already
430 		 * been filled out to cover the previous memory region(s).
431 		 */
432 		if (tdmr->size) {
433 			/*
434 			 * Loop to the next if the current memory region
435 			 * has already been fully covered.
436 			 */
437 			if (end <= tdmr_end(tdmr))
438 				continue;
439 
440 			/* Otherwise, skip the already covered part. */
441 			if (start < tdmr_end(tdmr))
442 				start = tdmr_end(tdmr);
443 
444 			/*
445 			 * Create a new TDMR to cover the current memory
446 			 * region, or the remaining part of it.
447 			 */
448 			tdmr_idx++;
449 			if (tdmr_idx >= tdmr_list->max_tdmrs) {
450 				pr_warn("initialization failed: TDMRs exhausted.\n");
451 				return -ENOSPC;
452 			}
453 
454 			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
455 		}
456 
457 		tdmr->base = start;
458 		tdmr->size = end - start;
459 	}
460 
461 	/* @tdmr_idx is always the index of the last valid TDMR. */
462 	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
463 
464 	/*
465 	 * Warn early that kernel is about to run out of TDMRs.
466 	 *
467 	 * This is an indication that TDMR allocation has to be
468 	 * reworked to be smarter to not run into an issue.
469 	 */
470 	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
471 		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
472 				tdmr_list->nr_consumed_tdmrs,
473 				tdmr_list->max_tdmrs);
474 
475 	return 0;
476 }
477 
478 /*
479  * Calculate PAMT size given a TDMR and a page size.  The returned
480  * PAMT size is always aligned up to 4K page boundary.
481  */
482 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
483 				      u16 pamt_entry_size)
484 {
485 	unsigned long pamt_sz, nr_pamt_entries;
486 
487 	switch (pgsz) {
488 	case TDX_PS_4K:
489 		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
490 		break;
491 	case TDX_PS_2M:
492 		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
493 		break;
494 	case TDX_PS_1G:
495 		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
496 		break;
497 	default:
498 		WARN_ON_ONCE(1);
499 		return 0;
500 	}
501 
502 	pamt_sz = nr_pamt_entries * pamt_entry_size;
503 	/* TDX requires PAMT size must be 4K aligned */
504 	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
505 
506 	return pamt_sz;
507 }
508 
509 /*
510  * Locate a NUMA node which should hold the allocation of the @tdmr
511  * PAMT.  This node will have some memory covered by the TDMR.  The
512  * relative amount of memory covered is not considered.
513  */
514 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
515 {
516 	struct tdx_memblock *tmb;
517 
518 	/*
519 	 * A TDMR must cover at least part of one TMB.  That TMB will end
520 	 * after the TDMR begins.  But, that TMB may have started before
521 	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
522 	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
523 	 */
524 	list_for_each_entry(tmb, tmb_list, list) {
525 		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
526 			return tmb->nid;
527 	}
528 
529 	/*
530 	 * Fall back to allocating the TDMR's metadata from node 0 when
531 	 * no TDX memory block can be found.  This should never happen
532 	 * since TDMRs originate from TDX memory blocks.
533 	 */
534 	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
535 			tdmr->base, tdmr_end(tdmr));
536 	return 0;
537 }
538 
539 /*
540  * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
541  * within @tdmr, and set up PAMTs for @tdmr.
542  */
543 static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
544 			    struct list_head *tmb_list,
545 			    u16 pamt_entry_size[])
546 {
547 	unsigned long pamt_base[TDX_PS_NR];
548 	unsigned long pamt_size[TDX_PS_NR];
549 	unsigned long tdmr_pamt_base;
550 	unsigned long tdmr_pamt_size;
551 	struct page *pamt;
552 	int pgsz, nid;
553 
554 	nid = tdmr_get_nid(tdmr, tmb_list);
555 
556 	/*
557 	 * Calculate the PAMT size for each TDX supported page size
558 	 * and the total PAMT size.
559 	 */
560 	tdmr_pamt_size = 0;
561 	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
562 		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
563 					pamt_entry_size[pgsz]);
564 		tdmr_pamt_size += pamt_size[pgsz];
565 	}
566 
567 	/*
568 	 * Allocate one chunk of physically contiguous memory for all
569 	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
570 	 * in overlapped TDMRs.
571 	 */
572 	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
573 			nid, &node_online_map);
574 	if (!pamt)
575 		return -ENOMEM;
576 
577 	/*
578 	 * Break the contiguous allocation back up into the
579 	 * individual PAMTs for each page size.
580 	 */
581 	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
582 	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
583 		pamt_base[pgsz] = tdmr_pamt_base;
584 		tdmr_pamt_base += pamt_size[pgsz];
585 	}
586 
587 	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
588 	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
589 	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
590 	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
591 	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
592 	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
593 
594 	return 0;
595 }
596 
597 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
598 			  unsigned long *pamt_size)
599 {
600 	unsigned long pamt_bs, pamt_sz;
601 
602 	/*
603 	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
604 	 * should always point to the beginning of that allocation.
605 	 */
606 	pamt_bs = tdmr->pamt_4k_base;
607 	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
608 
609 	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
610 
611 	*pamt_base = pamt_bs;
612 	*pamt_size = pamt_sz;
613 }
614 
/*
 * Call @pamt_func on the PAMT range (base, size) of @tdmr.  Does
 * nothing when the TDMR has no PAMT (size 0); warns once and does
 * nothing when the size is non-zero but the base is 0.
 */
static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long start, len;

	tdmr_get_pamt(tdmr, &start, &len);

	/* No PAMT allocated for this TDMR, or an inconsistent one */
	if (!len || WARN_ON_ONCE(!start))
		return;

	pamt_func(start, len);
}
631 
632 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
633 {
634 	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
635 }
636 
637 static void tdmr_free_pamt(struct tdmr_info *tdmr)
638 {
639 	tdmr_do_pamt_func(tdmr, free_pamt);
640 }
641 
642 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
643 {
644 	int i;
645 
646 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
647 		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
648 }
649 
650 /* Allocate and set up PAMTs for all TDMRs */
651 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
652 				 struct list_head *tmb_list,
653 				 u16 pamt_entry_size[])
654 {
655 	int i, ret = 0;
656 
657 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
658 		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
659 				pamt_entry_size);
660 		if (ret)
661 			goto err;
662 	}
663 
664 	return 0;
665 err:
666 	tdmrs_free_pamt_all(tdmr_list);
667 	return ret;
668 }
669 
/*
 * Turn TDX private pages back into normal ones by clearing the physical
 * range [@base, @base + @size) with MOVDIR64B, 64 bytes at a time.
 * The cache of these pages is not flushed here; that is up to the
 * caller.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
	const void *zeros = (const void *)page_address(ZERO_PAGE(0));
	const unsigned long limit = base + size;
	unsigned long cur = base;

	while (cur < limit) {
		movdir64b(__va(cur), zeros);
		cur += 64;
	}

	/*
	 * MOVDIR64B uses WC protocol.  The barrier makes sure any
	 * later user of these pages sees the updated data.
	 */
	mb();
}
691 
692 static void tdmr_reset_pamt(struct tdmr_info *tdmr)
693 {
694 	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
695 }
696 
697 static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
698 {
699 	int i;
700 
701 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
702 		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
703 }
704 
705 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
706 {
707 	unsigned long pamt_size = 0;
708 	int i;
709 
710 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
711 		unsigned long base, size;
712 
713 		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
714 		pamt_size += size;
715 	}
716 
717 	return pamt_size / 1024;
718 }
719 
720 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
721 			      u64 size, u16 max_reserved_per_tdmr)
722 {
723 	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
724 	int idx = *p_idx;
725 
726 	/* Reserved area must be 4K aligned in offset and size */
727 	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
728 		return -EINVAL;
729 
730 	if (idx >= max_reserved_per_tdmr) {
731 		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
732 				tdmr->base, tdmr_end(tdmr));
733 		return -ENOSPC;
734 	}
735 
736 	/*
737 	 * Consume one reserved area per call.  Make no effort to
738 	 * optimize or reduce the number of reserved areas which are
739 	 * consumed by contiguous reserved areas, for instance.
740 	 */
741 	rsvd_areas[idx].offset = addr - tdmr->base;
742 	rsvd_areas[idx].size = size;
743 
744 	*p_idx = idx + 1;
745 
746 	return 0;
747 }
748 
749 /*
750  * Go through @tmb_list to find holes between memory areas.  If any of
751  * those holes fall within @tdmr, set up a TDMR reserved area to cover
752  * the hole.
753  */
754 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
755 				    struct tdmr_info *tdmr,
756 				    int *rsvd_idx,
757 				    u16 max_reserved_per_tdmr)
758 {
759 	struct tdx_memblock *tmb;
760 	u64 prev_end;
761 	int ret;
762 
763 	/*
764 	 * Start looking for reserved blocks at the
765 	 * beginning of the TDMR.
766 	 */
767 	prev_end = tdmr->base;
768 	list_for_each_entry(tmb, tmb_list, list) {
769 		u64 start, end;
770 
771 		start = PFN_PHYS(tmb->start_pfn);
772 		end   = PFN_PHYS(tmb->end_pfn);
773 
774 		/* Break if this region is after the TDMR */
775 		if (start >= tdmr_end(tdmr))
776 			break;
777 
778 		/* Exclude regions before this TDMR */
779 		if (end < tdmr->base)
780 			continue;
781 
782 		/*
783 		 * Skip over memory areas that
784 		 * have already been dealt with.
785 		 */
786 		if (start <= prev_end) {
787 			prev_end = end;
788 			continue;
789 		}
790 
791 		/* Add the hole before this region */
792 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
793 				start - prev_end,
794 				max_reserved_per_tdmr);
795 		if (ret)
796 			return ret;
797 
798 		prev_end = end;
799 	}
800 
801 	/* Add the hole after the last region if it exists. */
802 	if (prev_end < tdmr_end(tdmr)) {
803 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
804 				tdmr_end(tdmr) - prev_end,
805 				max_reserved_per_tdmr);
806 		if (ret)
807 			return ret;
808 	}
809 
810 	return 0;
811 }
812 
813 /*
814  * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
815  * overlaps with @tdmr, set up a TDMR reserved area to cover the
816  * overlapping part.
817  */
818 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
819 				    struct tdmr_info *tdmr,
820 				    int *rsvd_idx,
821 				    u16 max_reserved_per_tdmr)
822 {
823 	int i, ret;
824 
825 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
826 		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
827 		unsigned long pamt_base, pamt_size, pamt_end;
828 
829 		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
830 		/* Each TDMR must already have PAMT allocated */
831 		WARN_ON_ONCE(!pamt_size || !pamt_base);
832 
833 		pamt_end = pamt_base + pamt_size;
834 		/* Skip PAMTs outside of the given TDMR */
835 		if ((pamt_end <= tdmr->base) ||
836 				(pamt_base >= tdmr_end(tdmr)))
837 			continue;
838 
839 		/* Only mark the part within the TDMR as reserved */
840 		if (pamt_base < tdmr->base)
841 			pamt_base = tdmr->base;
842 		if (pamt_end > tdmr_end(tdmr))
843 			pamt_end = tdmr_end(tdmr);
844 
845 		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
846 				pamt_end - pamt_base,
847 				max_reserved_per_tdmr);
848 		if (ret)
849 			return ret;
850 	}
851 
852 	return 0;
853 }
854 
855 /* Compare function called by sort() for TDMR reserved areas */
856 static int rsvd_area_cmp_func(const void *a, const void *b)
857 {
858 	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
859 	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
860 
861 	if (r1->offset + r1->size <= r2->offset)
862 		return -1;
863 	if (r1->offset >= r2->offset + r2->size)
864 		return 1;
865 
866 	/* Reserved areas cannot overlap.  The caller must guarantee. */
867 	WARN_ON_ONCE(1);
868 	return -1;
869 }
870 
871 /*
872  * Populate reserved areas for the given @tdmr, including memory holes
873  * (via @tmb_list) and PAMTs (via @tdmr_list).
874  */
875 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
876 				    struct list_head *tmb_list,
877 				    struct tdmr_info_list *tdmr_list,
878 				    u16 max_reserved_per_tdmr)
879 {
880 	int ret, rsvd_idx = 0;
881 
882 	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
883 			max_reserved_per_tdmr);
884 	if (ret)
885 		return ret;
886 
887 	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
888 			max_reserved_per_tdmr);
889 	if (ret)
890 		return ret;
891 
892 	/* TDX requires reserved areas listed in address ascending order */
893 	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
894 			rsvd_area_cmp_func, NULL);
895 
896 	return 0;
897 }
898 
899 /*
900  * Populate reserved areas for all TDMRs in @tdmr_list, including memory
901  * holes (via @tmb_list) and PAMTs.
902  */
903 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
904 					 struct list_head *tmb_list,
905 					 u16 max_reserved_per_tdmr)
906 {
907 	int i;
908 
909 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
910 		int ret;
911 
912 		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
913 				tmb_list, tdmr_list, max_reserved_per_tdmr);
914 		if (ret)
915 			return ret;
916 	}
917 
918 	return 0;
919 }
920 
921 /*
922  * Construct a list of TDMRs on the preallocated space in @tdmr_list
923  * to cover all TDX memory regions in @tmb_list based on the TDX module
924  * TDMR global information in @tdmr_sysinfo.
925  */
926 static int construct_tdmrs(struct list_head *tmb_list,
927 			   struct tdmr_info_list *tdmr_list,
928 			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
929 {
930 	int ret;
931 
932 	ret = fill_out_tdmrs(tmb_list, tdmr_list);
933 	if (ret)
934 		return ret;
935 
936 	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
937 			tdmr_sysinfo->pamt_entry_size);
938 	if (ret)
939 		return ret;
940 
941 	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
942 			tdmr_sysinfo->max_reserved_per_tdmr);
943 	if (ret)
944 		tdmrs_free_pamt_all(tdmr_list);
945 
946 	/*
947 	 * The tdmr_info_list is read-only from here on out.
948 	 * Ensure that these writes are seen by other CPUs.
949 	 * Pairs with a smp_rmb() in is_pamt_page().
950 	 */
951 	smp_wmb();
952 
953 	return ret;
954 }
955 
956 static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
957 {
958 	struct tdx_module_args args = {};
959 	u64 *tdmr_pa_array;
960 	size_t array_sz;
961 	int i, ret;
962 
963 	/*
964 	 * TDMRs are passed to the TDX module via an array of physical
965 	 * addresses of each TDMR.  The array itself also has certain
966 	 * alignment requirement.
967 	 */
968 	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
969 	array_sz = roundup_pow_of_two(array_sz);
970 	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
971 		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
972 
973 	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
974 	if (!tdmr_pa_array)
975 		return -ENOMEM;
976 
977 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
978 		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
979 
980 	args.rcx = __pa(tdmr_pa_array);
981 	args.rdx = tdmr_list->nr_consumed_tdmrs;
982 	args.r8 = global_keyid;
983 	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
984 
985 	/* Free the array as it is not required anymore. */
986 	kfree(tdmr_pa_array);
987 
988 	return ret;
989 }
990 
991 static int do_global_key_config(void *unused)
992 {
993 	struct tdx_module_args args = {};
994 
995 	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
996 }
997 
998 /*
999  * Attempt to configure the global KeyID on all physical packages.
1000  *
1001  * This requires running code on at least one CPU in each package.
1002  * TDMR initialization) will fail will fail if any package in the
1003  * system has no online CPUs.
1004  *
1005  * This code takes no affirmative steps to online CPUs.  Callers (aka.
1006  * KVM) can ensure success by ensuring sufficient CPUs are online and
1007  * can run SEAMCALLs.
1008  */
1009 static int config_global_keyid(void)
1010 {
1011 	cpumask_var_t packages;
1012 	int cpu, ret = -EINVAL;
1013 
1014 	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
1015 		return -ENOMEM;
1016 
1017 	/*
1018 	 * Hardware doesn't guarantee cache coherency across different
1019 	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
1020 	 * (associated with KeyID 0) before the TDX module can use the
1021 	 * global KeyID to access the PAMT.  Given PAMTs are potentially
1022 	 * large (~1/256th of system RAM), just use WBINVD.
1023 	 */
1024 	wbinvd_on_all_cpus();
1025 
1026 	for_each_online_cpu(cpu) {
1027 		/*
1028 		 * The key configuration only needs to be done once per
1029 		 * package and will return an error if configured more
1030 		 * than once.  Avoid doing it multiple times per package.
1031 		 */
1032 		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1033 					packages))
1034 			continue;
1035 
1036 		/*
1037 		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
1038 		 * different cpus.  Do it one by one.
1039 		 */
1040 		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1041 		if (ret)
1042 			break;
1043 	}
1044 
1045 	free_cpumask_var(packages);
1046 	return ret;
1047 }
1048 
1049 static int init_tdmr(struct tdmr_info *tdmr)
1050 {
1051 	u64 next;
1052 
1053 	/*
1054 	 * Initializing a TDMR can be time consuming.  To avoid long
1055 	 * SEAMCALLs, the TDX module may only initialize a part of the
1056 	 * TDMR in each call.
1057 	 */
1058 	do {
1059 		struct tdx_module_args args = {
1060 			.rcx = tdmr->base,
1061 		};
1062 		int ret;
1063 
1064 		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1065 		if (ret)
1066 			return ret;
1067 		/*
1068 		 * RDX contains 'next-to-initialize' address if
1069 		 * TDH.SYS.TDMR.INIT did not fully complete and
1070 		 * should be retried.
1071 		 */
1072 		next = args.rdx;
1073 		cond_resched();
1074 		/* Keep making SEAMCALLs until the TDMR is done */
1075 	} while (next < tdmr->base + tdmr->size);
1076 
1077 	return 0;
1078 }
1079 
1080 static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1081 {
1082 	int i;
1083 
1084 	/*
1085 	 * This operation is costly.  It can be parallelized,
1086 	 * but keep it simple for now.
1087 	 */
1088 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1089 		int ret;
1090 
1091 		ret = init_tdmr(tdmr_entry(tdmr_list, i));
1092 		if (ret)
1093 			return ret;
1094 	}
1095 
1096 	return 0;
1097 }
1098 
1099 static int init_tdx_module(void)
1100 {
1101 	struct tdx_tdmr_sysinfo tdmr_sysinfo;
1102 	int ret;
1103 
1104 	/*
1105 	 * To keep things simple, assume that all TDX-protected memory
1106 	 * will come from the page allocator.  Make sure all pages in the
1107 	 * page allocator are TDX-usable memory.
1108 	 *
1109 	 * Build the list of "TDX-usable" memory regions which cover all
1110 	 * pages in the page allocator to guarantee that.  Do it while
1111 	 * holding mem_hotplug_lock read-lock as the memory hotplug code
1112 	 * path reads the @tdx_memlist to reject any new memory.
1113 	 */
1114 	get_online_mems();
1115 
1116 	ret = build_tdx_memlist(&tdx_memlist);
1117 	if (ret)
1118 		goto out_put_tdxmem;
1119 
1120 	ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
1121 	if (ret)
1122 		goto err_free_tdxmem;
1123 
1124 	/* Allocate enough space for constructing TDMRs */
1125 	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
1126 	if (ret)
1127 		goto err_free_tdxmem;
1128 
1129 	/* Cover all TDX-usable memory regions in TDMRs */
1130 	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
1131 	if (ret)
1132 		goto err_free_tdmrs;
1133 
1134 	/* Pass the TDMRs and the global KeyID to the TDX module */
1135 	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1136 	if (ret)
1137 		goto err_free_pamts;
1138 
1139 	/* Config the key of global KeyID on all packages */
1140 	ret = config_global_keyid();
1141 	if (ret)
1142 		goto err_reset_pamts;
1143 
1144 	/* Initialize TDMRs to complete the TDX module initialization */
1145 	ret = init_tdmrs(&tdx_tdmr_list);
1146 	if (ret)
1147 		goto err_reset_pamts;
1148 
1149 	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1150 
1151 out_put_tdxmem:
1152 	/*
1153 	 * @tdx_memlist is written here and read at memory hotplug time.
1154 	 * Lock out memory hotplug code while building it.
1155 	 */
1156 	put_online_mems();
1157 	return ret;
1158 
1159 err_reset_pamts:
1160 	/*
1161 	 * Part of PAMTs may already have been initialized by the
1162 	 * TDX module.  Flush cache before returning PAMTs back
1163 	 * to the kernel.
1164 	 */
1165 	wbinvd_on_all_cpus();
1166 	/*
1167 	 * According to the TDX hardware spec, if the platform
1168 	 * doesn't have the "partial write machine check"
1169 	 * erratum, any kernel read/write will never cause #MC
1170 	 * in kernel space, thus it's OK to not convert PAMTs
1171 	 * back to normal.  But do the conversion anyway here
1172 	 * as suggested by the TDX spec.
1173 	 */
1174 	tdmrs_reset_pamt_all(&tdx_tdmr_list);
1175 err_free_pamts:
1176 	tdmrs_free_pamt_all(&tdx_tdmr_list);
1177 err_free_tdmrs:
1178 	free_tdmr_list(&tdx_tdmr_list);
1179 err_free_tdxmem:
1180 	free_tdx_memlist(&tdx_memlist);
1181 	goto out_put_tdxmem;
1182 }
1183 
1184 static int __tdx_enable(void)
1185 {
1186 	int ret;
1187 
1188 	ret = init_tdx_module();
1189 	if (ret) {
1190 		pr_err("module initialization failed (%d)\n", ret);
1191 		tdx_module_status = TDX_MODULE_ERROR;
1192 		return ret;
1193 	}
1194 
1195 	pr_info("module initialized\n");
1196 	tdx_module_status = TDX_MODULE_INITIALIZED;
1197 
1198 	return 0;
1199 }
1200 
1201 /**
1202  * tdx_enable - Enable TDX module to make it ready to run TDX guests
1203  *
1204  * This function assumes the caller has: 1) held read lock of CPU hotplug
1205  * lock to prevent any new cpu from becoming online; 2) done both VMXON
1206  * and tdx_cpu_enable() on all online cpus.
1207  *
1208  * This function requires there's at least one online cpu for each CPU
1209  * package to succeed.
1210  *
1211  * This function can be called in parallel by multiple callers.
1212  *
1213  * Return 0 if TDX is enabled successfully, otherwise error.
1214  */
1215 int tdx_enable(void)
1216 {
1217 	int ret;
1218 
1219 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1220 		return -ENODEV;
1221 
1222 	lockdep_assert_cpus_held();
1223 
1224 	mutex_lock(&tdx_module_lock);
1225 
1226 	switch (tdx_module_status) {
1227 	case TDX_MODULE_UNINITIALIZED:
1228 		ret = __tdx_enable();
1229 		break;
1230 	case TDX_MODULE_INITIALIZED:
1231 		/* Already initialized, great, tell the caller. */
1232 		ret = 0;
1233 		break;
1234 	default:
1235 		/* Failed to initialize in the previous attempts */
1236 		ret = -EINVAL;
1237 		break;
1238 	}
1239 
1240 	mutex_unlock(&tdx_module_lock);
1241 
1242 	return ret;
1243 }
1244 EXPORT_SYMBOL_GPL(tdx_enable);
1245 
1246 static bool is_pamt_page(unsigned long phys)
1247 {
1248 	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1249 	int i;
1250 
1251 	/* Ensure that all remote 'tdmr_list' writes are visible: */
1252 	smp_rmb();
1253 
1254 	/*
1255 	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
1256 	 * is initialized.  The 'tdmr_list' was initialized long ago
1257 	 * and is now read-only.
1258 	 */
1259 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1260 		unsigned long base, size;
1261 
1262 		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1263 
1264 		if (phys >= base && phys < (base + size))
1265 			return true;
1266 	}
1267 
1268 	return false;
1269 }
1270 
1271 /*
1272  * Return whether the memory page at the given physical address is TDX
1273  * private memory or not.
1274  *
1275  * This can be imprecise for two known reasons:
1276  * 1. PAMTs are private memory and exist before the TDX module is
1277  *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
1278  *    short window that occurs once per boot.
1279  * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1280  *    page.  However, the page can still cause #MC until it has been
1281  *    fully converted to shared using 64-byte writes like MOVDIR64B.
1282  *    Buggy hosts might still leave #MC-causing memory in place which
1283  *    this function can not detect.
1284  */
1285 static bool paddr_is_tdx_private(unsigned long phys)
1286 {
1287 	struct tdx_module_args args = {
1288 		.rcx = phys & PAGE_MASK,
1289 	};
1290 	u64 sret;
1291 
1292 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1293 		return false;
1294 
1295 	/* Get page type from the TDX module */
1296 	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
1297 
1298 	/*
1299 	 * The SEAMCALL will not return success unless there is a
1300 	 * working, "ready" TDX module.  Assume an absence of TDX
1301 	 * private pages until SEAMCALL is working.
1302 	 */
1303 	if (sret)
1304 		return false;
1305 
1306 	/*
1307 	 * SEAMCALL was successful -- read page type (via RCX):
1308 	 *
1309 	 *  - PT_NDA:	Page is not used by the TDX module
1310 	 *  - PT_RSVD:	Reserved for Non-TDX use
1311 	 *  - Others:	Page is used by the TDX module
1312 	 *
1313 	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
1314 	 * private memory.
1315 	 */
1316 	switch (args.rcx) {
1317 	case PT_NDA:
1318 		return false;
1319 	case PT_RSVD:
1320 		return is_pamt_page(phys);
1321 	default:
1322 		return true;
1323 	}
1324 }
1325 
1326 /*
1327  * Some TDX-capable CPUs have an erratum.  A write to TDX private
1328  * memory poisons that memory, and a subsequent read of that memory
1329  * triggers #MC.
1330  *
1331  * Help distinguish erratum-triggered #MCs from a normal hardware one.
1332  * Just print additional message to show such #MC may be result of the
1333  * erratum.
1334  */
1335 const char *tdx_dump_mce_info(struct mce *m)
1336 {
1337 	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1338 		return NULL;
1339 
1340 	if (!paddr_is_tdx_private(m->addr))
1341 		return NULL;
1342 
1343 	return "TDX private memory error. Possible kernel bug.";
1344 }
1345 
1346 static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1347 					    u32 *nr_tdx_keyids)
1348 {
1349 	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1350 	int ret;
1351 
1352 	/*
1353 	 * IA32_MKTME_KEYID_PARTIONING:
1354 	 *   Bit [31:0]:	Number of MKTME KeyIDs.
1355 	 *   Bit [63:32]:	Number of TDX private KeyIDs.
1356 	 */
1357 	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1358 			&_nr_tdx_keyids);
1359 	if (ret || !_nr_tdx_keyids)
1360 		return -EINVAL;
1361 
1362 	/* TDX KeyIDs start after the last MKTME KeyID. */
1363 	_tdx_keyid_start = _nr_mktme_keyids + 1;
1364 
1365 	*tdx_keyid_start = _tdx_keyid_start;
1366 	*nr_tdx_keyids = _nr_tdx_keyids;
1367 
1368 	return 0;
1369 }
1370 
1371 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1372 {
1373 	struct tdx_memblock *tmb;
1374 
1375 	/*
1376 	 * This check assumes that the start_pfn<->end_pfn range does not
1377 	 * cross multiple @tdx_memlist entries.  A single memory online
1378 	 * event across multiple memblocks (from which @tdx_memlist
1379 	 * entries are derived at the time of module initialization) is
1380 	 * not possible.  This is because memory offline/online is done
1381 	 * on granularity of 'struct memory_block', and the hotpluggable
1382 	 * memory region (one memblock) must be multiple of memory_block.
1383 	 */
1384 	list_for_each_entry(tmb, &tdx_memlist, list) {
1385 		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1386 			return true;
1387 	}
1388 	return false;
1389 }
1390 
1391 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1392 			       void *v)
1393 {
1394 	struct memory_notify *mn = v;
1395 
1396 	if (action != MEM_GOING_ONLINE)
1397 		return NOTIFY_OK;
1398 
1399 	/*
1400 	 * Empty list means TDX isn't enabled.  Allow any memory
1401 	 * to go online.
1402 	 */
1403 	if (list_empty(&tdx_memlist))
1404 		return NOTIFY_OK;
1405 
1406 	/*
1407 	 * The TDX memory configuration is static and can not be
1408 	 * changed.  Reject onlining any memory which is outside of
1409 	 * the static configuration whether it supports TDX or not.
1410 	 */
1411 	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1412 		return NOTIFY_OK;
1413 
1414 	return NOTIFY_BAD;
1415 }
1416 
1417 static struct notifier_block tdx_memory_nb = {
1418 	.notifier_call = tdx_memory_notifier,
1419 };
1420 
1421 static void __init check_tdx_erratum(void)
1422 {
1423 	/*
1424 	 * These CPUs have an erratum.  A partial write from non-TD
1425 	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1426 	 * private memory poisons that memory, and a subsequent read of
1427 	 * that memory triggers #MC.
1428 	 */
1429 	switch (boot_cpu_data.x86_vfm) {
1430 	case INTEL_SAPPHIRERAPIDS_X:
1431 	case INTEL_EMERALDRAPIDS_X:
1432 		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1433 	}
1434 }
1435 
1436 void __init tdx_init(void)
1437 {
1438 	u32 tdx_keyid_start, nr_tdx_keyids;
1439 	int err;
1440 
1441 	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1442 	if (err)
1443 		return;
1444 
1445 	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1446 			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1447 
1448 	/*
1449 	 * The TDX module itself requires one 'global KeyID' to protect
1450 	 * its metadata.  If there's only one TDX KeyID, there won't be
1451 	 * any left for TDX guests thus there's no point to enable TDX
1452 	 * at all.
1453 	 */
1454 	if (nr_tdx_keyids < 2) {
1455 		pr_err("initialization failed: too few private KeyIDs available.\n");
1456 		return;
1457 	}
1458 
1459 	/*
1460 	 * At this point, hibernation_available() indicates whether or
1461 	 * not hibernation support has been permanently disabled.
1462 	 */
1463 	if (hibernation_available()) {
1464 		pr_err("initialization failed: Hibernation support is enabled\n");
1465 		return;
1466 	}
1467 
1468 	err = register_memory_notifier(&tdx_memory_nb);
1469 	if (err) {
1470 		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1471 				err);
1472 		return;
1473 	}
1474 
1475 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1476 	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1477 	acpi_suspend_lowlevel = NULL;
1478 #endif
1479 
1480 	/*
1481 	 * Just use the first TDX KeyID as the 'global KeyID' and
1482 	 * leave the rest for TDX guests.
1483 	 */
1484 	tdx_global_keyid = tdx_keyid_start;
1485 	tdx_guest_keyid_start = tdx_keyid_start + 1;
1486 	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1487 
1488 	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1489 
1490 	check_tdx_erratum();
1491 }
1492