// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
			args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
			args->r9, args->r10, args->r11);
}

static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
				 u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args)	\
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args)	\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail because the TDX module is not loaded.
	 * Dump a message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do the one-time TDX module per-cpu initialization SEAMCALL (and the TDX
 * module global initialization SEAMCALL if not done) on the local cpu to
 * make this cpu ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Need to do it first (if it hasn't been done)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
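
/*
 * Illustrative caller sketch (not taken from an in-tree user): the TDX
 * user, e.g. KVM, is expected to do VMXON and then tdx_cpu_enable() on
 * each online CPU with interrupts disabled, e.g. from an IPI callback,
 * before calling tdx_enable():
 *
 *	static void enable_tdx_on_cpu(void *failed)
 *	{
 *		// VMXON must already be done on this CPU (not shown).
 *		if (tdx_cpu_enable())
 *			*(bool *)failed = true;
 *	}
 *
 *	on_each_cpu(enable_tdx_on_cpu, &failed, true);
 */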

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip them manually to exclude them as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  Memblock has
		 * already guaranteed the regions are in address ascending
		 * order and don't overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

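/*
 * Read one 16-bit global metadata field and store it into the member at
 * @offset of the given 'struct tdx_tdmr_sysinfo'.
 */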
static int read_sys_metadata_field16(u64 field_id,
				     int offset,
				     struct tdx_tdmr_sysinfo *ts)
{
	u16 *ts_member = ((void *)ts) + offset;
	u64 tmp;
	int ret;

	if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
			MD_FIELD_ID_ELE_SIZE_16BIT))
		return -EINVAL;

	ret = read_sys_metadata_field(field_id, &tmp);
	if (ret)
		return ret;

	*ts_member = tmp;

	return 0;
}

struct field_mapping {
	u64 field_id;
	int offset;
};

#define TD_SYSINFO_MAP(_field_id, _offset) \
	{ .field_id = MD_FIELD_ID_##_field_id, \
	  .offset = offsetof(struct tdx_tdmr_sysinfo, _offset) }

/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
static const struct field_mapping fields[] = {
	TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs),
	TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
	TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE, pamt_entry_size[TDX_PS_4K]),
	TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE, pamt_entry_size[TDX_PS_2M]),
	TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_entry_size[TDX_PS_1G]),
};

static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;
	int i;

	/* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
	for (i = 0; i < ARRAY_SIZE(fields); i++) {
		ret = read_sys_metadata_field16(fields[i].field_id,
						fields[i].offset,
						tdmr_sysinfo);
		if (ret)
			return ret;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

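/*
 * Allocate enough space to hold the maximum number of TDMRs the TDX
 * module supports, with each TDMR entry padded to TDMR_INFO_ALIGNMENT.
 */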
static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of one TDMR so the target TDMR at a given
	 * index in the TDMR list can be located.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

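/*
 * TDMRs must be 1G aligned and their sizes are in 1G granularity.
 * Memory regions are therefore expanded to 1G boundaries when being
 * covered by TDMRs, with any non-TDX-usable parts marked as reserved
 * areas within the TDMR.
 */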
#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically consumes fewer than 10 of those.  This code is dumb
	 * and simple and may use more TDMRs than is strictly required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
				      u16 pamt_entry_size)
{
	unsigned long pamt_sz, nr_pamt_entries;

	switch (pgsz) {
	case TDX_PS_4K:
		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
		break;
	case TDX_PS_2M:
		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
		break;
	case TDX_PS_1G:
		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);

	return pamt_sz;
}

/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
	struct tdx_memblock *tmb;

	/*
	 * A TDMR must cover at least part of one TMB.  That TMB will end
	 * after the TDMR begins.  But, that TMB may have started before
	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
			return tmb->nid;
	}

	/*
	 * Fall back to allocating the TDMR's metadata from node 0 when
	 * no TDX memory block can be found.  This should never happen
	 * since TDMRs originate from TDX memory blocks.
	 */
	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
			tdmr->base, tdmr_end(tdmr));
	return 0;
}

/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
			    struct list_head *tmb_list,
			    u16 pamt_entry_size[])
{
	unsigned long pamt_base[TDX_PS_NR];
	unsigned long pamt_size[TDX_PS_NR];
	unsigned long tdmr_pamt_base;
	unsigned long tdmr_pamt_size;
	struct page *pamt;
	int pgsz, nid;

	nid = tdmr_get_nid(tdmr, tmb_list);

	/*
	 * Calculate the PAMT size for each TDX supported page size
	 * and the total PAMT size.
	 */
	tdmr_pamt_size = 0;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
					pamt_entry_size[pgsz]);
		tdmr_pamt_size += pamt_size[pgsz];
	}

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
	 * in overlapped TDMRs.
	 */
	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
			nid, &node_online_map);
	if (!pamt)
		return -ENOMEM;

	/*
	 * Break the contiguous allocation back up into the
	 * individual PAMTs for each page size.
	 */
	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_base[pgsz] = tdmr_pamt_base;
		tdmr_pamt_base += pamt_size[pgsz];
	}

	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

	return 0;
}

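/*
 * Return the base and the total size of the (physically contiguous)
 * PAMT allocation backing @tdmr.
 */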
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	unsigned long pamt_bs, pamt_sz;

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
	 * should always point to the beginning of that allocation.
	 */
	pamt_bs = tdmr->pamt_4k_base;
	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;

	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));

	*pamt_base = pamt_bs;
	*pamt_size = pamt_sz;
}

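/*
 * Invoke @pamt_func on the PAMT range of @tdmr, skipping TDMRs whose
 * PAMT has not been allocated.
 */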
static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	pamt_func(pamt_base, pamt_size);
}

static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
{
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, free_pamt);
}

static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}

/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
				 struct list_head *tmb_list,
				 u16 pamt_entry_size[])
{
	int i, ret = 0;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
				pamt_entry_size);
		if (ret)
			goto err;
	}

	return 0;
err:
	tdmrs_free_pamt_all(tdmr_list);
	return ret;
}

/*
 * Convert TDX private pages back to normal by using MOVDIR64B to
 * clear these pages.  Note this function doesn't flush the cache of
 * these TDX private pages.  The caller should make sure of that.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses WC protocol.  Use a memory barrier to
	 * make sure any later user of these pages sees the
	 * updated data.
	 */
	mb();
}

static void tdmr_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
}

static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
}

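/* Sum the PAMT sizes of all consumed TDMRs, in kilobytes. */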
static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

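/*
 * Record one reserved area [@addr, @addr + @size) in @tdmr's reserved
 * area array at index *@p_idx, and advance the index.
 */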
static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
			      u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	int i, ret;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
		unsigned long pamt_base, pamt_size, pamt_end;

		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
		/* Each TDMR must already have PAMT allocated */
		WARN_ON_ONCE(!pamt_size || !pamt_base);

		pamt_end = pamt_base + pamt_size;
		/* Skip PAMTs outside of the given TDMR */
		if ((pamt_end <= tdmr->base) ||
				(pamt_base >= tdmr_end(tdmr)))
			continue;

		/* Only mark the part within the TDMR as reserved */
		if (pamt_base < tdmr->base)
			pamt_base = tdmr->base;
		if (pamt_end > tdmr_end(tdmr))
			pamt_end = tdmr_end(tdmr);

		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
				pamt_end - pamt_base,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

	if (r1->offset + r1->size <= r2->offset)
		return -1;
	if (r1->offset >= r2->offset + r2->size)
		return 1;

	/* Reserved areas cannot overlap.  The caller must guarantee that. */
	WARN_ON_ONCE(1);
	return -1;
}

/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
				    struct list_head *tmb_list,
				    struct tdmr_info_list *tdmr_list,
				    u16 max_reserved_per_tdmr)
{
	int ret, rsvd_idx = 0;

	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	/* TDX requires reserved areas listed in address ascending order */
	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
			rsvd_area_cmp_func, NULL);

	return 0;
}

/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
					 struct list_head *tmb_list,
					 u16 max_reserved_per_tdmr)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
				tmb_list, tdmr_list, max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @tdmr_sysinfo.
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
			tdmr_sysinfo->pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			tdmr_sysinfo->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

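/*
 * Pass the constructed TDMRs and the global KeyID to the TDX module via
 * TDH.SYS.CONFIG:
 *  - RCX (in): physical address of an array of TDMR physical addresses
 *  - RDX (in): number of TDMRs in the array
 *  - R8  (in): the global KeyID
 */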
static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has an
	 * alignment requirement.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

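/*
 * TDH.SYS.KEY.CONFIG takes no input.  It configures the global KeyID on
 * the package of the CPU it runs on, so it is invoked via
 * smp_call_on_cpu() once per package below.
 */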
static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has
 * no online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

static int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains the 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

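/*
 * Bring the TDX module to a fully functional state:
 *
 *  1. Build the list of TDX-usable memory regions.
 *  2. Read TDMR-related global metadata from the TDX module.
 *  3. Allocate and construct TDMRs (including PAMTs and reserved areas)
 *     to cover all TDX-usable memory.
 *  4. Configure the TDMRs and the global KeyID (TDH.SYS.CONFIG).
 *  5. Configure the global KeyID on all packages (TDH.SYS.KEY.CONFIG).
 *  6. Initialize all TDMRs (TDH.SYS.TDMR.INIT).
 */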
static int init_tdx_module(void)
{
	struct tdx_tdmr_sysinfo tdmr_sysinfo;
	int ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Config the key of global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

out_put_tdxmem:
	/*
	 * @tdx_memlist is written here and read at memory hotplug time.
	 * Lock out memory hotplug code while building it.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of the PAMTs may already have been initialized by the
	 * TDX module.  Flush the cache before returning the PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	/*
	 * According to the TDX hardware spec, if the platform
	 * doesn't have the "partial write machine check"
	 * erratum, any kernel read/write will never cause #MC
	 * in kernel space, thus it's OK to not convert PAMTs
	 * back to normal.  But do the conversion anyway here
	 * as suggested by the TDX spec.
	 */
	tdmrs_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

static int __tdx_enable(void)
{
	int ret;

	ret = init_tdx_module();
	if (ret) {
		pr_err("module initialization failed (%d)\n", ret);
		tdx_module_status = TDX_MODULE_ERROR;
		return ret;
	}

	pr_info("module initialized\n");
	tdx_module_status = TDX_MODULE_INITIALIZED;

	return 0;
}

/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * This function requires there's at least one online cpu for each CPU
 * package to succeed.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_cpus_held();

	mutex_lock(&tdx_module_lock);

	switch (tdx_module_status) {
	case TDX_MODULE_UNINITIALIZED:
		ret = __tdx_enable();
		break;
	case TDX_MODULE_INITIALIZED:
		/* Already initialized, great, tell the caller. */
		ret = 0;
		break;
	default:
		/* Failed to initialize in the previous attempts */
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&tdx_module_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);

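/* Return true if @phys falls within the PAMT of any consumed TDMR. */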
static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);

		if (phys >= base && phys < (base + size))
			return true;
	}

	return false;
}

/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from normal hardware ones.
 * Just print an additional message to show that such an #MC may be a
 * result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:  Number of MKTME KeyIDs.
	 *   Bit [63:32]: Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * on granularity of 'struct memory_block', and the hotpluggable
	 * memory region (one memblock) must be multiple of memory_block.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
	 */
	if (list_empty(&tdx_memlist))
		return NOTIFY_OK;

	/*
	 * The TDX memory configuration is static and can not be
	 * changed.  Reject onlining any memory which is outside of
	 * the static configuration whether it supports TDX or not.
	 */
	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
		return NOTIFY_OK;

	return NOTIFY_BAD;
}

static struct notifier_block tdx_memory_nb = {
	.notifier_call = tdx_memory_notifier,
};

static void __init check_tdx_erratum(void)
{
	/*
	 * These CPUs have an erratum.  A partial write from non-TD
	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
	 * private memory poisons that memory, and a subsequent read of
	 * that memory triggers #MC.
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_SAPPHIRERAPIDS_X:
	case INTEL_EMERALDRAPIDS_X:
		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
	}
}

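/*
 * tdx_init() detects BIOS-enabled TDX at boot: it records the TDX
 * private KeyID range, reserves the first KeyID as the global KeyID,
 * and registers the memory notifier.  The TDX module itself is
 * initialized later, on demand, via tdx_enable().
 */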
void __init tdx_init(void)
{
	u32 tdx_keyid_start, nr_tdx_keyids;
	int err;

	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
	if (err)
		return;

	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);

	/*
	 * The TDX module itself requires one 'global KeyID' to protect
	 * its metadata.  If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, thus there's no point in enabling
	 * TDX at all.
	 */
	if (nr_tdx_keyids < 2) {
		pr_err("initialization failed: too few private KeyIDs available.\n");
		return;
	}

	/*
	 * At this point, hibernation_available() indicates whether or
	 * not hibernation support has been permanently disabled.
	 */
	if (hibernation_available()) {
		pr_err("initialization failed: Hibernation support is enabled\n");
		return;
	}

	err = register_memory_notifier(&tdx_memory_nb);
	if (err) {
		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
				err);
		return;
	}

#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
	acpi_suspend_lowlevel = NULL;
#endif

	/*
	 * Just use the first TDX KeyID as the 'global KeyID' and
	 * leave the rest for TDX guests.
	 */
	tdx_global_keyid = tdx_keyid_start;
	tdx_guest_keyid_start = tdx_keyid_start + 1;
	tdx_nr_guest_keyids = nr_tdx_keyids - 1;

	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

	check_tdx_erratum();
}