1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright(c) 2023 Intel Corporation.
4 *
5 * Intel Trusted Domain Extensions (TDX) support
6 */
7
9 #define pr_fmt(fmt) "virt/tdx: " fmt
10
11 #include <linux/types.h>
12 #include <linux/cache.h>
13 #include <linux/init.h>
14 #include <linux/errno.h>
15 #include <linux/printk.h>
16 #include <linux/cpu.h>
17 #include <linux/spinlock.h>
18 #include <linux/percpu-defs.h>
19 #include <linux/mutex.h>
20 #include <linux/list.h>
21 #include <linux/memblock.h>
22 #include <linux/memory.h>
23 #include <linux/minmax.h>
24 #include <linux/sizes.h>
25 #include <linux/pfn.h>
26 #include <linux/align.h>
27 #include <linux/sort.h>
28 #include <linux/log2.h>
29 #include <linux/acpi.h>
30 #include <linux/suspend.h>
31 #include <linux/idr.h>
32 #include <asm/page.h>
33 #include <asm/special_insns.h>
34 #include <asm/msr-index.h>
35 #include <asm/msr.h>
36 #include <asm/cpufeature.h>
37 #include <asm/tdx.h>
38 #include <asm/cpu_device_id.h>
39 #include <asm/processor.h>
40 #include <asm/mce.h>
41 #include "tdx.h"
42
43 static u32 tdx_global_keyid __ro_after_init;
44 static u32 tdx_guest_keyid_start __ro_after_init;
45 static u32 tdx_nr_guest_keyids __ro_after_init;
46
47 static DEFINE_IDA(tdx_guest_keyid_pool);
48
49 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
50
51 static struct tdmr_info_list tdx_tdmr_list;
52
53 static enum tdx_module_status_t tdx_module_status;
54 static DEFINE_MUTEX(tdx_module_lock);
55
56 /* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
57 static LIST_HEAD(tdx_memlist);
58
59 static struct tdx_sys_info tdx_sysinfo;
60
61 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
62
static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
64 {
65 pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
66 }
67
static inline void seamcall_err_ret(u64 fn, u64 err,
69 struct tdx_module_args *args)
70 {
71 seamcall_err(fn, err, args);
72 pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
73 args->rcx, args->rdx, args->r8);
74 pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
75 args->r9, args->r10, args->r11);
76 }
77
static __always_inline int sc_retry_prerr(sc_func_t func,
79 sc_err_func_t err_func,
80 u64 fn, struct tdx_module_args *args)
81 {
82 u64 sret = sc_retry(func, fn, args);
83
84 if (sret == TDX_SUCCESS)
85 return 0;
86
87 if (sret == TDX_SEAMCALL_VMFAILINVALID)
88 return -ENODEV;
89
90 if (sret == TDX_SEAMCALL_GP)
91 return -EOPNOTSUPP;
92
93 if (sret == TDX_SEAMCALL_UD)
94 return -EACCES;
95
96 err_func(fn, sret, args);
97 return -EIO;
98 }
99
100 #define seamcall_prerr(__fn, __args) \
101 sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
102
103 #define seamcall_prerr_ret(__fn, __args) \
104 sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
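
/*
 * Illustrative sketch only (SOME_FIELD_ID and consume() are placeholders,
 * not names used by this file): a SEAMCALL that only takes inputs goes
 * through seamcall_prerr(), while one whose register outputs matter (for
 * use or for error reporting) goes through seamcall_prerr_ret() and reads
 * the results back from @args afterwards:
 *
 *	struct tdx_module_args args = { .rdx = SOME_FIELD_ID };
 *
 *	if (!seamcall_prerr_ret(TDH_SYS_RD, &args))
 *		consume(args.r8);	// hypothetical consumer of R8
 */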
105
106 /*
107 * Do the module global initialization once and return its result.
108 * It can be done on any cpu. It's always called with interrupts
109 * disabled.
110 */
static int try_init_module_global(void)
112 {
113 struct tdx_module_args args = {};
114 static DEFINE_RAW_SPINLOCK(sysinit_lock);
115 static bool sysinit_done;
116 static int sysinit_ret;
117
118 lockdep_assert_irqs_disabled();
119
120 raw_spin_lock(&sysinit_lock);
121
122 if (sysinit_done)
123 goto out;
124
125 /* RCX is module attributes and all bits are reserved */
126 args.rcx = 0;
127 sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
128
129 /*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail if the TDX module is not loaded.
	 * Dump a message to let the user know.
133 */
134 if (sysinit_ret == -ENODEV)
135 pr_err("module not loaded\n");
136
137 sysinit_done = true;
138 out:
139 raw_spin_unlock(&sysinit_lock);
140 return sysinit_ret;
141 }
142
143 /**
144 * tdx_cpu_enable - Enable TDX on local cpu
145 *
146 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
147 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu ready to run any other SEAMCALLs.
149 *
150 * Always call this function via IPI function calls.
151 *
152 * Return 0 on success, otherwise errors.
153 */
int tdx_cpu_enable(void)
155 {
156 struct tdx_module_args args = {};
157 int ret;
158
159 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
160 return -ENODEV;
161
162 lockdep_assert_irqs_disabled();
163
164 if (__this_cpu_read(tdx_lp_initialized))
165 return 0;
166
167 /*
168 * The TDX module global initialization is the very first step
	 * to enable TDX. Need to do it first (if it hasn't been done)
170 * before the per-cpu initialization.
171 */
172 ret = try_init_module_global();
173 if (ret)
174 return ret;
175
176 ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
177 if (ret)
178 return ret;
179
180 __this_cpu_write(tdx_lp_initialized, true);
181
182 return 0;
183 }
184 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
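
/*
 * Illustrative caller sketch (the helper name is hypothetical): because
 * tdx_cpu_enable() must run on the target CPU with interrupts disabled
 * and with VMX already turned on, a caller such as KVM typically invokes
 * it via an IPI-style function call after doing VMXON everywhere:
 *
 *	static void do_tdx_cpu_enable(void *failed)	// hypothetical
 *	{
 *		if (tdx_cpu_enable())
 *			*(bool *)failed = true;
 *	}
 *
 *	// with VMXON already done on all online CPUs:
 *	on_each_cpu(do_tdx_cpu_enable, &failed, 1);
 */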
185
186 /*
187 * Add a memory region as a TDX memory block. The caller must make sure
188 * all memory regions are added in address ascending order and don't
189 * overlap.
190 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
192 unsigned long end_pfn, int nid)
193 {
194 struct tdx_memblock *tmb;
195
196 tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
197 if (!tmb)
198 return -ENOMEM;
199
200 INIT_LIST_HEAD(&tmb->list);
201 tmb->start_pfn = start_pfn;
202 tmb->end_pfn = end_pfn;
203 tmb->nid = nid;
204
205 /* @tmb_list is protected by mem_hotplug_lock */
206 list_add_tail(&tmb->list, tmb_list);
207 return 0;
208 }
209
static void free_tdx_memlist(struct list_head *tmb_list)
211 {
212 /* @tmb_list is protected by mem_hotplug_lock */
213 while (!list_empty(tmb_list)) {
214 struct tdx_memblock *tmb = list_first_entry(tmb_list,
215 struct tdx_memblock, list);
216
217 list_del(&tmb->list);
218 kfree(tmb);
219 }
220 }
221
222 /*
223 * Ensure that all memblock memory regions are convertible to TDX
224 * memory. Once this has been established, stash the memblock
225 * ranges off in a secondary structure because memblock is modified
226 * in memory hotplug while TDX memory regions are fixed.
227 */
static int build_tdx_memlist(struct list_head *tmb_list)
229 {
230 unsigned long start_pfn, end_pfn;
231 int i, nid, ret;
232
233 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
234 /*
235 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions. Skip it manually to exclude it from TDX memory.
239 */
240 start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
241 if (start_pfn >= end_pfn)
242 continue;
243
244 /*
		 * Add the memory regions as TDX memory. Memblock has
		 * already guaranteed the regions are in ascending
		 * address order and don't overlap.
248 */
249 ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
250 if (ret)
251 goto err;
252 }
253
254 return 0;
255 err:
256 free_tdx_memlist(tmb_list);
257 return ret;
258 }
259
static int read_sys_metadata_field(u64 field_id, u64 *data)
261 {
262 struct tdx_module_args args = {};
263 int ret;
264
265 /*
266 * TDH.SYS.RD -- reads one global metadata field
267 * - RDX (in): the field to read
268 * - R8 (out): the field data
269 */
270 args.rdx = field_id;
271 ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
272 if (ret)
273 return ret;
274
275 *data = args.r8;
276
277 return 0;
278 }
279
280 #include "tdx_global_metadata.c"
281
static int check_features(struct tdx_sys_info *sysinfo)
283 {
284 u64 tdx_features0 = sysinfo->features.tdx_features0;
285
286 if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
287 pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
288 return -EINVAL;
289 }
290
291 return 0;
292 }
293
294 /* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
296 {
297 int tdmr_sz;
298
299 /*
300 * The actual size of TDMR depends on the maximum
301 * number of reserved areas.
302 */
303 tdmr_sz = sizeof(struct tdmr_info);
304 tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
305
306 return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
307 }
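
/*
 * Worked example with assumed (module-reported) numbers: with a 64-byte
 * struct tdmr_info, 16-byte reserved-area entries and
 * max_reserved_per_tdmr == 16:
 *
 *	64 + 16 * 16 = 320 bytes
 *	ALIGN(320, TDMR_INFO_ALIGNMENT) = 512 bytes per TDMR entry
 *
 * (assuming TDMR_INFO_ALIGNMENT is 512).
 */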
308
static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
310 struct tdx_sys_info_tdmr *sysinfo_tdmr)
311 {
312 size_t tdmr_sz, tdmr_array_sz;
313 void *tdmr_array;
314
315 tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
316 tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
317
318 /*
319 * To keep things simple, allocate all TDMRs together.
320 * The buffer needs to be physically contiguous to make
321 * sure each TDMR is physically contiguous.
322 */
323 tdmr_array = alloc_pages_exact(tdmr_array_sz,
324 GFP_KERNEL | __GFP_ZERO);
325 if (!tdmr_array)
326 return -ENOMEM;
327
328 tdmr_list->tdmrs = tdmr_array;
329
330 /*
	 * Keep the per-TDMR size so the target TDMR at a given
	 * index in the TDMR list can be found.
333 */
334 tdmr_list->tdmr_sz = tdmr_sz;
335 tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
336 tdmr_list->nr_consumed_tdmrs = 0;
337
338 return 0;
339 }
340
static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
342 {
343 free_pages_exact(tdmr_list->tdmrs,
344 tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
345 }
346
347 /* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
349 int idx)
350 {
351 int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
352
353 return (void *)tdmr_list->tdmrs + tdmr_info_offset;
354 }
355
356 #define TDMR_ALIGNMENT SZ_1G
357 #define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
358 #define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT)
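
/*
 * Example of the 1G alignment rules above (addresses are made up):
 * a convertible region [0x100000, 0x7ff00000) is covered by a TDMR of
 * [TDMR_ALIGN_DOWN(0x100000), TDMR_ALIGN_UP(0x7ff00000)) =
 * [0x0, 0x80000000), i.e. a 2GB TDMR for slightly less than 2GB of
 * memory. The uncovered head and tail become reserved areas later.
 */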
359
static inline u64 tdmr_end(struct tdmr_info *tdmr)
361 {
362 return tdmr->base + tdmr->size;
363 }
364
365 /*
366 * Take the memory referenced in @tmb_list and populate the
367 * preallocated @tdmr_list, following all the special alignment
368 * and size rules for TDMR.
369 */
static int fill_out_tdmrs(struct list_head *tmb_list,
371 struct tdmr_info_list *tdmr_list)
372 {
373 struct tdx_memblock *tmb;
374 int tdmr_idx = 0;
375
376 /*
377 * Loop over TDX memory regions and fill out TDMRs to cover them.
378 * To keep it simple, always try to use one TDMR to cover one
379 * memory region.
380 *
381 * In practice TDX supports at least 64 TDMRs. A 2-socket system
	 * typically consumes fewer than 10 of those. This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
385 */
386 list_for_each_entry(tmb, tmb_list, list) {
387 struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
388 u64 start, end;
389
390 start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
391 end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
392
393 /*
394 * A valid size indicates the current TDMR has already
395 * been filled out to cover the previous memory region(s).
396 */
397 if (tdmr->size) {
398 /*
399 * Loop to the next if the current memory region
400 * has already been fully covered.
401 */
402 if (end <= tdmr_end(tdmr))
403 continue;
404
405 /* Otherwise, skip the already covered part. */
406 if (start < tdmr_end(tdmr))
407 start = tdmr_end(tdmr);
408
409 /*
410 * Create a new TDMR to cover the current memory
411 * region, or the remaining part of it.
412 */
413 tdmr_idx++;
414 if (tdmr_idx >= tdmr_list->max_tdmrs) {
415 pr_warn("initialization failed: TDMRs exhausted.\n");
416 return -ENOSPC;
417 }
418
419 tdmr = tdmr_entry(tdmr_list, tdmr_idx);
420 }
421
422 tdmr->base = start;
423 tdmr->size = end - start;
424 }
425
426 /* @tdmr_idx is always the index of the last valid TDMR. */
427 tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
428
429 /*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to avoid running into this limit.
434 */
435 if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
436 pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
437 tdmr_list->nr_consumed_tdmrs,
438 tdmr_list->max_tdmrs);
439
440 return 0;
441 }
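
/*
 * Illustrative walk-through of fill_out_tdmrs() (addresses made up):
 *
 *	TMB0 [0x00100000, 0x7ff00000) -> TDMR0 [0x00000000, 0x80000000)
 *	TMB1 [0x80000000, 0xc0000000) -> not covered by TDMR0 (its end is
 *					 past tdmr_end(TDMR0)) and starts
 *					 right at tdmr_end(TDMR0), so
 *					 TDMR1 [0x80000000, 0xc0000000)
 *
 * Gaps between TMBs that end up inside a TDMR are handled later as
 * reserved areas.
 */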
442
443 /*
444 * Calculate PAMT size given a TDMR and a page size. The returned
445 * PAMT size is always aligned up to 4K page boundary.
446 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
448 u16 pamt_entry_size)
449 {
450 unsigned long pamt_sz, nr_pamt_entries;
451
452 switch (pgsz) {
453 case TDX_PS_4K:
454 nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
455 break;
456 case TDX_PS_2M:
457 nr_pamt_entries = tdmr->size >> PMD_SHIFT;
458 break;
459 case TDX_PS_1G:
460 nr_pamt_entries = tdmr->size >> PUD_SHIFT;
461 break;
462 default:
463 WARN_ON_ONCE(1);
464 return 0;
465 }
466
467 pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
469 pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
470
471 return pamt_sz;
472 }
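
/*
 * Worked example (assuming the module reports a 16-byte 4K PAMT entry,
 * which is typical): a 1GB TDMR contains 1GB / 4KB = 262144 4K pages, so
 * its 4K PAMT is 262144 * 16 = 4MB, i.e. roughly 1/256th of the covered
 * range. The 2M and 1G PAMTs add comparatively little on top of that.
 */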
473
474 /*
475 * Locate a NUMA node which should hold the allocation of the @tdmr
476 * PAMT. This node will have some memory covered by the TDMR. The
477 * relative amount of memory covered is not considered.
478 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
480 {
481 struct tdx_memblock *tmb;
482
483 /*
484 * A TDMR must cover at least part of one TMB. That TMB will end
485 * after the TDMR begins. But, that TMB may have started before
486 * the TDMR. Find the next 'tmb' that _ends_ after this TDMR
487 * begins. Ignore 'tmb' start addresses. They are irrelevant.
488 */
489 list_for_each_entry(tmb, tmb_list, list) {
490 if (tmb->end_pfn > PHYS_PFN(tdmr->base))
491 return tmb->nid;
492 }
493
494 /*
495 * Fall back to allocating the TDMR's metadata from node 0 when
496 * no TDX memory block can be found. This should never happen
497 * since TDMRs originate from TDX memory blocks.
498 */
499 pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
500 tdmr->base, tdmr_end(tdmr));
501 return 0;
502 }
503
504 /*
505 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
506 * within @tdmr, and set up PAMTs for @tdmr.
507 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
509 struct list_head *tmb_list,
510 u16 pamt_entry_size[])
511 {
512 unsigned long pamt_base[TDX_PS_NR];
513 unsigned long pamt_size[TDX_PS_NR];
514 unsigned long tdmr_pamt_base;
515 unsigned long tdmr_pamt_size;
516 struct page *pamt;
517 int pgsz, nid;
518
519 nid = tdmr_get_nid(tdmr, tmb_list);
520
521 /*
522 * Calculate the PAMT size for each TDX supported page size
523 * and the total PAMT size.
524 */
525 tdmr_pamt_size = 0;
526 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
527 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
528 pamt_entry_size[pgsz]);
529 tdmr_pamt_size += pamt_size[pgsz];
530 }
531
532 /*
533 * Allocate one chunk of physically contiguous memory for all
534 * PAMTs. This helps minimize the PAMT's use of reserved areas
535 * in overlapped TDMRs.
536 */
537 pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
538 nid, &node_online_map);
539 if (!pamt)
540 return -ENOMEM;
541
542 /*
543 * Break the contiguous allocation back up into the
544 * individual PAMTs for each page size.
545 */
546 tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
547 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
548 pamt_base[pgsz] = tdmr_pamt_base;
549 tdmr_pamt_base += pamt_size[pgsz];
550 }
551
552 tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
553 tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
554 tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
555 tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
556 tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
557 tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
558
559 return 0;
560 }
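
/*
 * Layout of the single contiguous allocation set up above:
 *
 *	pamt_4k_base                        pamt_2m_base  pamt_1g_base
 *	|<----------- pamt_4k_size ------->|<- 2m size ->|<- 1g size ->|
 *
 * tdmr_get_pamt() below relies on exactly this ordering: the 4K PAMT
 * base is the start of the allocation and the three sizes sum to its
 * total length.
 */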
561
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
563 unsigned long *pamt_size)
564 {
565 unsigned long pamt_bs, pamt_sz;
566
567 /*
568 * The PAMT was allocated in one contiguous unit. The 4K PAMT
569 * should always point to the beginning of that allocation.
570 */
571 pamt_bs = tdmr->pamt_4k_base;
572 pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
573
574 WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
575
576 *pamt_base = pamt_bs;
577 *pamt_size = pamt_sz;
578 }
579
static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
581 void (*pamt_func)(unsigned long base, unsigned long size))
582 {
583 unsigned long pamt_base, pamt_size;
584
585 tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
586
587 /* Do nothing if PAMT hasn't been allocated for this TDMR */
588 if (!pamt_size)
589 return;
590
591 if (WARN_ON_ONCE(!pamt_base))
592 return;
593
594 pamt_func(pamt_base, pamt_size);
595 }
596
static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
598 {
599 free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
600 }
601
static void tdmr_free_pamt(struct tdmr_info *tdmr)
603 {
604 tdmr_do_pamt_func(tdmr, free_pamt);
605 }
606
static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
608 {
609 int i;
610
611 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
612 tdmr_free_pamt(tdmr_entry(tdmr_list, i));
613 }
614
615 /* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
617 struct list_head *tmb_list,
618 u16 pamt_entry_size[])
619 {
620 int i, ret = 0;
621
622 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
623 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
624 pamt_entry_size);
625 if (ret)
626 goto err;
627 }
628
629 return 0;
630 err:
631 tdmrs_free_pamt_all(tdmr_list);
632 return ret;
633 }
634
635 /*
636 * Convert TDX private pages back to normal by using MOVDIR64B to
637 * clear these pages. Note this function doesn't flush cache of
638 * these TDX private pages. The caller should make sure of that.
639 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
641 {
642 const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
643 unsigned long phys, end;
644
645 end = base + size;
646 for (phys = base; phys < end; phys += 64)
647 movdir64b(__va(phys), zero_page);
648
649 /*
	 * MOVDIR64B uses the WC protocol. Use a memory barrier to
651 * make sure any later user of these pages sees the
652 * updated data.
653 */
654 mb();
655 }
656
static void tdmr_reset_pamt(struct tdmr_info *tdmr)
658 {
659 tdmr_do_pamt_func(tdmr, reset_tdx_pages);
660 }
661
static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
663 {
664 int i;
665
666 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
667 tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
668 }
669
static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
671 {
672 unsigned long pamt_size = 0;
673 int i;
674
675 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
676 unsigned long base, size;
677
678 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
679 pamt_size += size;
680 }
681
682 return pamt_size / 1024;
683 }
684
static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
686 u64 size, u16 max_reserved_per_tdmr)
687 {
688 struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
689 int idx = *p_idx;
690
691 /* Reserved area must be 4K aligned in offset and size */
692 if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
693 return -EINVAL;
694
695 if (idx >= max_reserved_per_tdmr) {
696 pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
697 tdmr->base, tdmr_end(tdmr));
698 return -ENOSPC;
699 }
700
701 /*
702 * Consume one reserved area per call. Make no effort to
703 * optimize or reduce the number of reserved areas which are
704 * consumed by contiguous reserved areas, for instance.
705 */
706 rsvd_areas[idx].offset = addr - tdmr->base;
707 rsvd_areas[idx].size = size;
708
709 *p_idx = idx + 1;
710
711 return 0;
712 }
713
714 /*
715 * Go through @tmb_list to find holes between memory areas. If any of
716 * those holes fall within @tdmr, set up a TDMR reserved area to cover
717 * the hole.
718 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
720 struct tdmr_info *tdmr,
721 int *rsvd_idx,
722 u16 max_reserved_per_tdmr)
723 {
724 struct tdx_memblock *tmb;
725 u64 prev_end;
726 int ret;
727
728 /*
729 * Start looking for reserved blocks at the
730 * beginning of the TDMR.
731 */
732 prev_end = tdmr->base;
733 list_for_each_entry(tmb, tmb_list, list) {
734 u64 start, end;
735
736 start = PFN_PHYS(tmb->start_pfn);
737 end = PFN_PHYS(tmb->end_pfn);
738
739 /* Break if this region is after the TDMR */
740 if (start >= tdmr_end(tdmr))
741 break;
742
743 /* Exclude regions before this TDMR */
744 if (end < tdmr->base)
745 continue;
746
747 /*
748 * Skip over memory areas that
749 * have already been dealt with.
750 */
751 if (start <= prev_end) {
752 prev_end = end;
753 continue;
754 }
755
756 /* Add the hole before this region */
757 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
758 start - prev_end,
759 max_reserved_per_tdmr);
760 if (ret)
761 return ret;
762
763 prev_end = end;
764 }
765
766 /* Add the hole after the last region if it exists. */
767 if (prev_end < tdmr_end(tdmr)) {
768 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
769 tdmr_end(tdmr) - prev_end,
770 max_reserved_per_tdmr);
771 if (ret)
772 return ret;
773 }
774
775 return 0;
776 }
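
/*
 * Illustrative example (addresses made up): for a TDMR [0x0, 0x80000000)
 * built from TMBs [0x100000, 0x40000000) and [0x50000000, 0x7ff00000),
 * the loop above emits reserved areas covering:
 *
 *	[0x00000000, 0x00100000)	hole before the first TMB
 *	[0x40000000, 0x50000000)	hole between the two TMBs
 *	[0x7ff00000, 0x80000000)	tail after the last TMB
 */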
777
778 /*
779 * Go through @tdmr_list to find all PAMTs. If any of those PAMTs
780 * overlaps with @tdmr, set up a TDMR reserved area to cover the
781 * overlapping part.
782 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
784 struct tdmr_info *tdmr,
785 int *rsvd_idx,
786 u16 max_reserved_per_tdmr)
787 {
788 int i, ret;
789
790 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
791 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
792 unsigned long pamt_base, pamt_size, pamt_end;
793
794 tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
795 /* Each TDMR must already have PAMT allocated */
796 WARN_ON_ONCE(!pamt_size || !pamt_base);
797
798 pamt_end = pamt_base + pamt_size;
799 /* Skip PAMTs outside of the given TDMR */
800 if ((pamt_end <= tdmr->base) ||
801 (pamt_base >= tdmr_end(tdmr)))
802 continue;
803
804 /* Only mark the part within the TDMR as reserved */
805 if (pamt_base < tdmr->base)
806 pamt_base = tdmr->base;
807 if (pamt_end > tdmr_end(tdmr))
808 pamt_end = tdmr_end(tdmr);
809
810 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
811 pamt_end - pamt_base,
812 max_reserved_per_tdmr);
813 if (ret)
814 return ret;
815 }
816
817 return 0;
818 }
819
820 /* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
822 {
823 struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
824 struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
825
826 if (r1->offset + r1->size <= r2->offset)
827 return -1;
828 if (r1->offset >= r2->offset + r2->size)
829 return 1;
830
	/* Reserved areas cannot overlap. The caller must guarantee that. */
832 WARN_ON_ONCE(1);
833 return -1;
834 }
835
836 /*
837 * Populate reserved areas for the given @tdmr, including memory holes
838 * (via @tmb_list) and PAMTs (via @tdmr_list).
839 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
841 struct list_head *tmb_list,
842 struct tdmr_info_list *tdmr_list,
843 u16 max_reserved_per_tdmr)
844 {
845 int ret, rsvd_idx = 0;
846
847 ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
848 max_reserved_per_tdmr);
849 if (ret)
850 return ret;
851
852 ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
853 max_reserved_per_tdmr);
854 if (ret)
855 return ret;
856
857 /* TDX requires reserved areas listed in address ascending order */
858 sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
859 rsvd_area_cmp_func, NULL);
860
861 return 0;
862 }
863
864 /*
865 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
866 * holes (via @tmb_list) and PAMTs.
867 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
869 struct list_head *tmb_list,
870 u16 max_reserved_per_tdmr)
871 {
872 int i;
873
874 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
875 int ret;
876
877 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
878 tmb_list, tdmr_list, max_reserved_per_tdmr);
879 if (ret)
880 return ret;
881 }
882
883 return 0;
884 }
885
886 /*
887 * Construct a list of TDMRs on the preallocated space in @tdmr_list
888 * to cover all TDX memory regions in @tmb_list based on the TDX module
889 * TDMR global information in @sysinfo_tdmr.
890 */
static int construct_tdmrs(struct list_head *tmb_list,
892 struct tdmr_info_list *tdmr_list,
893 struct tdx_sys_info_tdmr *sysinfo_tdmr)
894 {
895 u16 pamt_entry_size[TDX_PS_NR] = {
896 sysinfo_tdmr->pamt_4k_entry_size,
897 sysinfo_tdmr->pamt_2m_entry_size,
898 sysinfo_tdmr->pamt_1g_entry_size,
899 };
900 int ret;
901
902 ret = fill_out_tdmrs(tmb_list, tdmr_list);
903 if (ret)
904 return ret;
905
906 ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
907 if (ret)
908 return ret;
909
910 ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
911 sysinfo_tdmr->max_reserved_per_tdmr);
912 if (ret)
913 tdmrs_free_pamt_all(tdmr_list);
914
915 /*
916 * The tdmr_info_list is read-only from here on out.
917 * Ensure that these writes are seen by other CPUs.
918 * Pairs with a smp_rmb() in is_pamt_page().
919 */
920 smp_wmb();
921
922 return ret;
923 }
924
static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
926 {
927 struct tdx_module_args args = {};
928 u64 *tdmr_pa_array;
929 size_t array_sz;
930 int i, ret;
931
932 /*
933 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR. The array itself also has a certain
	 * alignment requirement.
936 */
937 array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
938 array_sz = roundup_pow_of_two(array_sz);
939 if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
940 array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
941
942 tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
943 if (!tdmr_pa_array)
944 return -ENOMEM;
945
946 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
947 tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
948
949 args.rcx = __pa(tdmr_pa_array);
950 args.rdx = tdmr_list->nr_consumed_tdmrs;
951 args.r8 = global_keyid;
952 ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
953
954 /* Free the array as it is not required anymore. */
955 kfree(tdmr_pa_array);
956
957 return ret;
958 }
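
/*
 * Example of the array sizing above (constants assumed, not read from
 * the module): with 3 consumed TDMRs the raw array is 3 * 8 = 24 bytes,
 * rounded up to a power of two (32) and then to
 * TDMR_INFO_PA_ARRAY_ALIGNMENT (assumed 512). Allocating a power-of-two
 * size also gives the natural alignment kmalloc() guarantees for such
 * sizes, which is what satisfies the TDX module's alignment requirement.
 */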
959
static int do_global_key_config(void *unused)
961 {
962 struct tdx_module_args args = {};
963
964 return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
965 }
966
967 /*
968 * Attempt to configure the global KeyID on all physical packages.
969 *
970 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the
972 * system has no online CPUs.
973 *
974 * This code takes no affirmative steps to online CPUs. Callers (aka.
975 * KVM) can ensure success by ensuring sufficient CPUs are online and
976 * can run SEAMCALLs.
977 */
static int config_global_keyid(void)
979 {
980 cpumask_var_t packages;
981 int cpu, ret = -EINVAL;
982
983 if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
984 return -ENOMEM;
985
986 /*
987 * Hardware doesn't guarantee cache coherency across different
988 * KeyIDs. The kernel needs to flush PAMT's dirty cachelines
989 * (associated with KeyID 0) before the TDX module can use the
990 * global KeyID to access the PAMT. Given PAMTs are potentially
991 * large (~1/256th of system RAM), just use WBINVD.
992 */
993 wbinvd_on_all_cpus();
994
995 for_each_online_cpu(cpu) {
996 /*
997 * The key configuration only needs to be done once per
998 * package and will return an error if configured more
999 * than once. Avoid doing it multiple times per package.
1000 */
1001 if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1002 packages))
1003 continue;
1004
1005 /*
1006 * TDH.SYS.KEY.CONFIG cannot run concurrently on
1007 * different cpus. Do it one by one.
1008 */
1009 ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1010 if (ret)
1011 break;
1012 }
1013
1014 free_cpumask_var(packages);
1015 return ret;
1016 }
1017
static int init_tdmr(struct tdmr_info *tdmr)
1019 {
1020 u64 next;
1021
1022 /*
1023 * Initializing a TDMR can be time consuming. To avoid long
1024 * SEAMCALLs, the TDX module may only initialize a part of the
1025 * TDMR in each call.
1026 */
1027 do {
1028 struct tdx_module_args args = {
1029 .rcx = tdmr->base,
1030 };
1031 int ret;
1032
1033 ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1034 if (ret)
1035 return ret;
1036 /*
1037 * RDX contains 'next-to-initialize' address if
1038 * TDH.SYS.TDMR.INIT did not fully complete and
1039 * should be retried.
1040 */
1041 next = args.rdx;
1042 cond_resched();
1043 /* Keep making SEAMCALLs until the TDMR is done */
1044 } while (next < tdmr->base + tdmr->size);
1045
1046 return 0;
1047 }
1048
static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1050 {
1051 int i;
1052
1053 /*
1054 * This operation is costly. It can be parallelized,
1055 * but keep it simple for now.
1056 */
1057 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1058 int ret;
1059
1060 ret = init_tdmr(tdmr_entry(tdmr_list, i));
1061 if (ret)
1062 return ret;
1063 }
1064
1065 return 0;
1066 }
1067
static int init_tdx_module(void)
1069 {
1070 int ret;
1071
1072 ret = get_tdx_sys_info(&tdx_sysinfo);
1073 if (ret)
1074 return ret;
1075
1076 /* Check whether the kernel can support this module */
1077 ret = check_features(&tdx_sysinfo);
1078 if (ret)
1079 return ret;
1080
1081 /*
1082 * To keep things simple, assume that all TDX-protected memory
1083 * will come from the page allocator. Make sure all pages in the
1084 * page allocator are TDX-usable memory.
1085 *
1086 * Build the list of "TDX-usable" memory regions which cover all
1087 * pages in the page allocator to guarantee that. Do it while
1088 * holding mem_hotplug_lock read-lock as the memory hotplug code
1089 * path reads the @tdx_memlist to reject any new memory.
1090 */
1091 get_online_mems();
1092
1093 ret = build_tdx_memlist(&tdx_memlist);
1094 if (ret)
1095 goto out_put_tdxmem;
1096
1097 /* Allocate enough space for constructing TDMRs */
1098 ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
1099 if (ret)
1100 goto err_free_tdxmem;
1101
1102 /* Cover all TDX-usable memory regions in TDMRs */
1103 ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
1104 if (ret)
1105 goto err_free_tdmrs;
1106
1107 /* Pass the TDMRs and the global KeyID to the TDX module */
1108 ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1109 if (ret)
1110 goto err_free_pamts;
1111
1112 /* Config the key of global KeyID on all packages */
1113 ret = config_global_keyid();
1114 if (ret)
1115 goto err_reset_pamts;
1116
1117 /* Initialize TDMRs to complete the TDX module initialization */
1118 ret = init_tdmrs(&tdx_tdmr_list);
1119 if (ret)
1120 goto err_reset_pamts;
1121
1122 pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1123
1124 out_put_tdxmem:
1125 /*
1126 * @tdx_memlist is written here and read at memory hotplug time.
1127 * Lock out memory hotplug code while building it.
1128 */
1129 put_online_mems();
1130 return ret;
1131
1132 err_reset_pamts:
1133 /*
1134 * Part of PAMTs may already have been initialized by the
1135 * TDX module. Flush cache before returning PAMTs back
1136 * to the kernel.
1137 */
1138 wbinvd_on_all_cpus();
1139 /*
1140 * According to the TDX hardware spec, if the platform
1141 * doesn't have the "partial write machine check"
1142 * erratum, any kernel read/write will never cause #MC
1143 * in kernel space, thus it's OK to not convert PAMTs
1144 * back to normal. But do the conversion anyway here
1145 * as suggested by the TDX spec.
1146 */
1147 tdmrs_reset_pamt_all(&tdx_tdmr_list);
1148 err_free_pamts:
1149 tdmrs_free_pamt_all(&tdx_tdmr_list);
1150 err_free_tdmrs:
1151 free_tdmr_list(&tdx_tdmr_list);
1152 err_free_tdxmem:
1153 free_tdx_memlist(&tdx_memlist);
1154 goto out_put_tdxmem;
1155 }
1156
static int __tdx_enable(void)
1158 {
1159 int ret;
1160
1161 ret = init_tdx_module();
1162 if (ret) {
1163 pr_err("module initialization failed (%d)\n", ret);
1164 tdx_module_status = TDX_MODULE_ERROR;
1165 return ret;
1166 }
1167
1168 pr_info("module initialized\n");
1169 tdx_module_status = TDX_MODULE_INITIALIZED;
1170
1171 return 0;
1172 }
1173
1174 /**
1175 * tdx_enable - Enable TDX module to make it ready to run TDX guests
1176 *
1177 * This function assumes the caller has: 1) held read lock of CPU hotplug
1178 * lock to prevent any new cpu from becoming online; 2) done both VMXON
1179 * and tdx_cpu_enable() on all online cpus.
1180 *
1181 * This function requires there's at least one online cpu for each CPU
1182 * package to succeed.
1183 *
1184 * This function can be called in parallel by multiple callers.
1185 *
1186 * Return 0 if TDX is enabled successfully, otherwise error.
1187 */
int tdx_enable(void)
1189 {
1190 int ret;
1191
1192 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1193 return -ENODEV;
1194
1195 lockdep_assert_cpus_held();
1196
1197 mutex_lock(&tdx_module_lock);
1198
1199 switch (tdx_module_status) {
1200 case TDX_MODULE_UNINITIALIZED:
1201 ret = __tdx_enable();
1202 break;
1203 case TDX_MODULE_INITIALIZED:
1204 /* Already initialized, great, tell the caller. */
1205 ret = 0;
1206 break;
1207 default:
1208 /* Failed to initialize in the previous attempts */
1209 ret = -EINVAL;
1210 break;
1211 }
1212
1213 mutex_unlock(&tdx_module_lock);
1214
1215 return ret;
1216 }
1217 EXPORT_SYMBOL_GPL(tdx_enable);
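
/*
 * Illustrative caller sketch (the helper name is hypothetical): per the
 * kerneldoc above, a caller such as KVM is expected to do roughly:
 *
 *	cpus_read_lock();
 *	// VMXON + tdx_cpu_enable() on every online CPU first:
 *	on_each_cpu(vmxon_and_tdx_cpu_enable, &failed, 1);
 *	if (!failed)
 *		ret = tdx_enable();
 *	cpus_read_unlock();
 */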
1218
static bool is_pamt_page(unsigned long phys)
1220 {
1221 struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1222 int i;
1223
1224 /* Ensure that all remote 'tdmr_list' writes are visible: */
1225 smp_rmb();
1226
1227 /*
1228 * The TDX module is no longer returning TDX_SYS_NOT_READY and
1229 * is initialized. The 'tdmr_list' was initialized long ago
1230 * and is now read-only.
1231 */
1232 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1233 unsigned long base, size;
1234
1235 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1236
1237 if (phys >= base && phys < (base + size))
1238 return true;
1239 }
1240
1241 return false;
1242 }
1243
1244 /*
1245 * Return whether the memory page at the given physical address is TDX
1246 * private memory or not.
1247 *
1248 * This can be imprecise for two known reasons:
1249 * 1. PAMTs are private memory and exist before the TDX module is
1250 * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively
1251 * short window that occurs once per boot.
1252 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1253 * page. However, the page can still cause #MC until it has been
1254 * fully converted to shared using 64-byte writes like MOVDIR64B.
1255 * Buggy hosts might still leave #MC-causing memory in place which
1256 * this function can not detect.
1257 */
static bool paddr_is_tdx_private(unsigned long phys)
1259 {
1260 struct tdx_module_args args = {
1261 .rcx = phys & PAGE_MASK,
1262 };
1263 u64 sret;
1264
1265 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1266 return false;
1267
1268 /* Get page type from the TDX module */
1269 sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
1270
1271 /*
1272 * The SEAMCALL will not return success unless there is a
1273 * working, "ready" TDX module. Assume an absence of TDX
1274 * private pages until SEAMCALL is working.
1275 */
1276 if (sret)
1277 return false;
1278
1279 /*
1280 * SEAMCALL was successful -- read page type (via RCX):
1281 *
1282 * - PT_NDA: Page is not used by the TDX module
1283 * - PT_RSVD: Reserved for Non-TDX use
1284 * - Others: Page is used by the TDX module
1285 *
1286 * Note PAMT pages are marked as PT_RSVD but they are also TDX
1287 * private memory.
1288 */
1289 switch (args.rcx) {
1290 case PT_NDA:
1291 return false;
1292 case PT_RSVD:
1293 return is_pamt_page(phys);
1294 default:
1295 return true;
1296 }
1297 }
1298
1299 /*
1300 * Some TDX-capable CPUs have an erratum. A write to TDX private
1301 * memory poisons that memory, and a subsequent read of that memory
1302 * triggers #MC.
1303 *
1304 * Help distinguish erratum-triggered #MCs from a normal hardware one.
 * Just print an additional message to show that such an #MC may be a
 * result of the erratum.
1307 */
const char *tdx_dump_mce_info(struct mce *m)
1309 {
1310 if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1311 return NULL;
1312
1313 if (!paddr_is_tdx_private(m->addr))
1314 return NULL;
1315
1316 return "TDX private memory error. Possible kernel bug.";
1317 }
1318
static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1320 u32 *nr_tdx_keyids)
1321 {
1322 u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1323 int ret;
1324
1325 /*
	 * IA32_MKTME_KEYID_PARTITIONING:
1327 * Bit [31:0]: Number of MKTME KeyIDs.
1328 * Bit [63:32]: Number of TDX private KeyIDs.
1329 */
1330 ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1331 &_nr_tdx_keyids);
1332 if (ret || !_nr_tdx_keyids)
1333 return -EINVAL;
1334
1335 /* TDX KeyIDs start after the last MKTME KeyID. */
1336 _tdx_keyid_start = _nr_mktme_keyids + 1;
1337
1338 *tdx_keyid_start = _tdx_keyid_start;
1339 *nr_tdx_keyids = _nr_tdx_keyids;
1340
1341 return 0;
1342 }
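
/*
 * Worked example (MSR value made up): if the MSR reads back
 * 0x0000000400000003 then _nr_mktme_keyids = 3 and _nr_tdx_keyids = 4,
 * so MKTME owns KeyIDs 1-3 and TDX owns KeyIDs 4-7, giving
 * *tdx_keyid_start = 4 and *nr_tdx_keyids = 4.
 */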
1343
static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1345 {
1346 struct tdx_memblock *tmb;
1347
1348 /*
1349 * This check assumes that the start_pfn<->end_pfn range does not
1350 * cross multiple @tdx_memlist entries. A single memory online
1351 * event across multiple memblocks (from which @tdx_memlist
1352 * entries are derived at the time of module initialization) is
1353 * not possible. This is because memory offline/online is done
1354 * on granularity of 'struct memory_block', and the hotpluggable
1355 * memory region (one memblock) must be multiple of memory_block.
1356 */
1357 list_for_each_entry(tmb, &tdx_memlist, list) {
1358 if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1359 return true;
1360 }
1361 return false;
1362 }
1363
static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1365 void *v)
1366 {
1367 struct memory_notify *mn = v;
1368
1369 if (action != MEM_GOING_ONLINE)
1370 return NOTIFY_OK;
1371
1372 /*
1373 * Empty list means TDX isn't enabled. Allow any memory
1374 * to go online.
1375 */
1376 if (list_empty(&tdx_memlist))
1377 return NOTIFY_OK;
1378
1379 /*
1380 * The TDX memory configuration is static and can not be
1381 * changed. Reject onlining any memory which is outside of
1382 * the static configuration whether it supports TDX or not.
1383 */
1384 if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1385 return NOTIFY_OK;
1386
1387 return NOTIFY_BAD;
1388 }
1389
1390 static struct notifier_block tdx_memory_nb = {
1391 .notifier_call = tdx_memory_notifier,
1392 };
1393
static void __init check_tdx_erratum(void)
1395 {
1396 /*
1397 * These CPUs have an erratum. A partial write from non-TD
1398 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1399 * private memory poisons that memory, and a subsequent read of
1400 * that memory triggers #MC.
1401 */
1402 switch (boot_cpu_data.x86_vfm) {
1403 case INTEL_SAPPHIRERAPIDS_X:
1404 case INTEL_EMERALDRAPIDS_X:
1405 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1406 }
1407 }
1408
void __init tdx_init(void)
1410 {
1411 u32 tdx_keyid_start, nr_tdx_keyids;
1412 int err;
1413
1414 err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1415 if (err)
1416 return;
1417
1418 pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1419 tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1420
1421 /*
1422 * The TDX module itself requires one 'global KeyID' to protect
1423 * its metadata. If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, so there's no point in enabling
	 * TDX at all.
1426 */
1427 if (nr_tdx_keyids < 2) {
1428 pr_err("initialization failed: too few private KeyIDs available.\n");
1429 return;
1430 }
1431
1432 /*
1433 * At this point, hibernation_available() indicates whether or
1434 * not hibernation support has been permanently disabled.
1435 */
1436 if (hibernation_available()) {
1437 pr_err("initialization failed: Hibernation support is enabled\n");
1438 return;
1439 }
1440
1441 err = register_memory_notifier(&tdx_memory_nb);
1442 if (err) {
1443 pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1444 err);
1445 return;
1446 }
1447
1448 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1449 pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1450 acpi_suspend_lowlevel = NULL;
1451 #endif
1452
1453 /*
1454 * Just use the first TDX KeyID as the 'global KeyID' and
1455 * leave the rest for TDX guests.
1456 */
1457 tdx_global_keyid = tdx_keyid_start;
1458 tdx_guest_keyid_start = tdx_keyid_start + 1;
1459 tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1460
1461 setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1462
1463 check_tdx_erratum();
1464 }
1465
const struct tdx_sys_info *tdx_get_sysinfo(void)
1467 {
1468 const struct tdx_sys_info *p = NULL;
1469
1470 /* Make sure all fields in @tdx_sysinfo have been populated */
1471 mutex_lock(&tdx_module_lock);
1472 if (tdx_module_status == TDX_MODULE_INITIALIZED)
1473 p = (const struct tdx_sys_info *)&tdx_sysinfo;
1474 mutex_unlock(&tdx_module_lock);
1475
1476 return p;
1477 }
1478 EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
1479
u32 tdx_get_nr_guest_keyids(void)
1481 {
1482 return tdx_nr_guest_keyids;
1483 }
1484 EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
1485
int tdx_guest_keyid_alloc(void)
1487 {
1488 return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
1489 tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
1490 GFP_KERNEL);
1491 }
1492 EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
1493
void tdx_guest_keyid_free(unsigned int keyid)
1495 {
1496 ida_free(&tdx_guest_keyid_pool, keyid);
1497 }
1498 EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
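
/*
 * Typical usage sketch for the two helpers above (illustrative, error
 * handling trimmed): a VMM allocates one guest KeyID per TD and returns
 * it when the TD is torn down:
 *
 *	int keyid = tdx_guest_keyid_alloc();
 *
 *	if (keyid < 0)
 *		return keyid;
 *	// ... TDH.MNG.CREATE with this HKID, run the TD, tear it down ...
 *	tdx_guest_keyid_free(keyid);
 */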
1499
static inline u64 tdx_tdr_pa(struct tdx_td *td)
1501 {
1502 return page_to_phys(td->tdr_page);
1503 }
1504
static inline u64 tdx_tdvpr_pa(struct tdx_vp *td)
1506 {
1507 return page_to_phys(td->tdvpr_page);
1508 }
1509
1510 /*
1511 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
1512 * a CLFLUSH of pages is required before handing them to the TDX module.
1513 * Be conservative and make the code simpler by doing the CLFLUSH
1514 * unconditionally.
1515 */
static void tdx_clflush_page(struct page *page)
1517 {
1518 clflush_cache_range(page_to_virt(page), PAGE_SIZE);
1519 }
1520
noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
1522 {
1523 args->rcx = tdx_tdvpr_pa(td);
1524
1525 return __seamcall_saved_ret(TDH_VP_ENTER, args);
1526 }
1527 EXPORT_SYMBOL_GPL(tdh_vp_enter);
1528
u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
1530 {
1531 struct tdx_module_args args = {
1532 .rcx = page_to_phys(tdcs_page),
1533 .rdx = tdx_tdr_pa(td),
1534 };
1535
1536 tdx_clflush_page(tdcs_page);
1537 return seamcall(TDH_MNG_ADDCX, &args);
1538 }
1539 EXPORT_SYMBOL_GPL(tdh_mng_addcx);
1540
u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
1542 {
1543 struct tdx_module_args args = {
1544 .rcx = gpa,
1545 .rdx = tdx_tdr_pa(td),
1546 .r8 = page_to_phys(page),
1547 .r9 = page_to_phys(source),
1548 };
1549 u64 ret;
1550
1551 tdx_clflush_page(page);
1552 ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
1553
1554 *ext_err1 = args.rcx;
1555 *ext_err2 = args.rdx;
1556
1557 return ret;
1558 }
1559 EXPORT_SYMBOL_GPL(tdh_mem_page_add);
1560
u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1562 {
1563 struct tdx_module_args args = {
1564 .rcx = gpa | level,
1565 .rdx = tdx_tdr_pa(td),
1566 .r8 = page_to_phys(page),
1567 };
1568 u64 ret;
1569
1570 tdx_clflush_page(page);
1571 ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);
1572
1573 *ext_err1 = args.rcx;
1574 *ext_err2 = args.rdx;
1575
1576 return ret;
1577 }
1578 EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
1579
u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
1581 {
1582 struct tdx_module_args args = {
1583 .rcx = page_to_phys(tdcx_page),
1584 .rdx = tdx_tdvpr_pa(vp),
1585 };
1586
1587 tdx_clflush_page(tdcx_page);
1588 return seamcall(TDH_VP_ADDCX, &args);
1589 }
1590 EXPORT_SYMBOL_GPL(tdh_vp_addcx);
1591
u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1593 {
1594 struct tdx_module_args args = {
1595 .rcx = gpa | level,
1596 .rdx = tdx_tdr_pa(td),
1597 .r8 = page_to_phys(page),
1598 };
1599 u64 ret;
1600
1601 tdx_clflush_page(page);
1602 ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
1603
1604 *ext_err1 = args.rcx;
1605 *ext_err2 = args.rdx;
1606
1607 return ret;
1608 }
1609 EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
1610
u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
1612 {
1613 struct tdx_module_args args = {
1614 .rcx = gpa | level,
1615 .rdx = tdx_tdr_pa(td),
1616 };
1617 u64 ret;
1618
1619 ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);
1620
1621 *ext_err1 = args.rcx;
1622 *ext_err2 = args.rdx;
1623
1624 return ret;
1625 }
1626 EXPORT_SYMBOL_GPL(tdh_mem_range_block);
1627
u64 tdh_mng_key_config(struct tdx_td *td)
1629 {
1630 struct tdx_module_args args = {
1631 .rcx = tdx_tdr_pa(td),
1632 };
1633
1634 return seamcall(TDH_MNG_KEY_CONFIG, &args);
1635 }
1636 EXPORT_SYMBOL_GPL(tdh_mng_key_config);
1637
u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
1639 {
1640 struct tdx_module_args args = {
1641 .rcx = tdx_tdr_pa(td),
1642 .rdx = hkid,
1643 };
1644
1645 tdx_clflush_page(td->tdr_page);
1646 return seamcall(TDH_MNG_CREATE, &args);
1647 }
1648 EXPORT_SYMBOL_GPL(tdh_mng_create);
1649
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
1651 {
1652 struct tdx_module_args args = {
1653 .rcx = tdx_tdvpr_pa(vp),
1654 .rdx = tdx_tdr_pa(td),
1655 };
1656
1657 tdx_clflush_page(vp->tdvpr_page);
1658 return seamcall(TDH_VP_CREATE, &args);
1659 }
1660 EXPORT_SYMBOL_GPL(tdh_vp_create);
1661
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
1663 {
1664 struct tdx_module_args args = {
1665 .rcx = tdx_tdr_pa(td),
1666 .rdx = field,
1667 };
1668 u64 ret;
1669
1670 ret = seamcall_ret(TDH_MNG_RD, &args);
1671
1672 /* R8: Content of the field, or 0 in case of error. */
1673 *data = args.r8;
1674
1675 return ret;
1676 }
1677 EXPORT_SYMBOL_GPL(tdh_mng_rd);
1678
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
1680 {
1681 struct tdx_module_args args = {
1682 .rcx = gpa,
1683 .rdx = tdx_tdr_pa(td),
1684 };
1685 u64 ret;
1686
1687 ret = seamcall_ret(TDH_MR_EXTEND, &args);
1688
1689 *ext_err1 = args.rcx;
1690 *ext_err2 = args.rdx;
1691
1692 return ret;
1693 }
1694 EXPORT_SYMBOL_GPL(tdh_mr_extend);
1695
u64 tdh_mr_finalize(struct tdx_td *td)
1697 {
1698 struct tdx_module_args args = {
1699 .rcx = tdx_tdr_pa(td),
1700 };
1701
1702 return seamcall(TDH_MR_FINALIZE, &args);
1703 }
1704 EXPORT_SYMBOL_GPL(tdh_mr_finalize);
1705
u64 tdh_vp_flush(struct tdx_vp *vp)
1707 {
1708 struct tdx_module_args args = {
1709 .rcx = tdx_tdvpr_pa(vp),
1710 };
1711
1712 return seamcall(TDH_VP_FLUSH, &args);
1713 }
1714 EXPORT_SYMBOL_GPL(tdh_vp_flush);
1715
u64 tdh_mng_vpflushdone(struct tdx_td *td)
1717 {
1718 struct tdx_module_args args = {
1719 .rcx = tdx_tdr_pa(td),
1720 };
1721
1722 return seamcall(TDH_MNG_VPFLUSHDONE, &args);
1723 }
1724 EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
1725
u64 tdh_mng_key_freeid(struct tdx_td *td)
1727 {
1728 struct tdx_module_args args = {
1729 .rcx = tdx_tdr_pa(td),
1730 };
1731
1732 return seamcall(TDH_MNG_KEY_FREEID, &args);
1733 }
1734 EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
1735
u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
1737 {
1738 struct tdx_module_args args = {
1739 .rcx = tdx_tdr_pa(td),
1740 .rdx = td_params,
1741 };
1742 u64 ret;
1743
1744 ret = seamcall_ret(TDH_MNG_INIT, &args);
1745
1746 *extended_err = args.rcx;
1747
1748 return ret;
1749 }
1750 EXPORT_SYMBOL_GPL(tdh_mng_init);
1751
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
1753 {
1754 struct tdx_module_args args = {
1755 .rcx = tdx_tdvpr_pa(vp),
1756 .rdx = field,
1757 };
1758 u64 ret;
1759
1760 ret = seamcall_ret(TDH_VP_RD, &args);
1761
1762 /* R8: Content of the field, or 0 in case of error. */
1763 *data = args.r8;
1764
1765 return ret;
1766 }
1767 EXPORT_SYMBOL_GPL(tdh_vp_rd);
1768
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
1770 {
1771 struct tdx_module_args args = {
1772 .rcx = tdx_tdvpr_pa(vp),
1773 .rdx = field,
1774 .r8 = data,
1775 .r9 = mask,
1776 };
1777
1778 return seamcall(TDH_VP_WR, &args);
1779 }
1780 EXPORT_SYMBOL_GPL(tdh_vp_wr);
1781
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
1783 {
1784 struct tdx_module_args args = {
1785 .rcx = tdx_tdvpr_pa(vp),
1786 .rdx = initial_rcx,
1787 .r8 = x2apicid,
1788 };
1789
1790 /* apicid requires version == 1. */
1791 return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
1792 }
1793 EXPORT_SYMBOL_GPL(tdh_vp_init);
1794
1795 /*
 * The TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX-defined
 * formats. So despite the names, they must be interpreted specially as described by
 * the spec. Return them only for error reporting purposes.
1799 */
u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size)
1801 {
1802 struct tdx_module_args args = {
1803 .rcx = page_to_phys(page),
1804 };
1805 u64 ret;
1806
1807 ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
1808
1809 *tdx_pt = args.rcx;
1810 *tdx_owner = args.rdx;
1811 *tdx_size = args.r8;
1812
1813 return ret;
1814 }
1815 EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
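
/*
 * Illustrative teardown sketch (hedged; the exact flow is the caller's,
 * e.g. KVM's, responsibility): a page owned by the TDX module is
 * typically reclaimed with TDH.PHYMEM.PAGE.RECLAIM, flushed with the
 * keyed WBINVD helper further below, and only then returned to the page
 * allocator:
 *
 *	u64 pt, owner, size;
 *
 *	if (tdh_phymem_page_reclaim(page, &pt, &owner, &size))
 *		return;		// better to leak than to risk an #MC
 *	tdh_phymem_page_wbinvd_hkid(hkid, page);	// hkid: TD's KeyID
 *	__free_page(page);
 */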
1816
u64 tdh_mem_track(struct tdx_td *td)
1818 {
1819 struct tdx_module_args args = {
1820 .rcx = tdx_tdr_pa(td),
1821 };
1822
1823 return seamcall(TDH_MEM_TRACK, &args);
1824 }
1825 EXPORT_SYMBOL_GPL(tdh_mem_track);
1826
u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
1828 {
1829 struct tdx_module_args args = {
1830 .rcx = gpa | level,
1831 .rdx = tdx_tdr_pa(td),
1832 };
1833 u64 ret;
1834
1835 ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args);
1836
1837 *ext_err1 = args.rcx;
1838 *ext_err2 = args.rdx;
1839
1840 return ret;
1841 }
1842 EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
1843
u64 tdh_phymem_cache_wb(bool resume)
1845 {
1846 struct tdx_module_args args = {
1847 .rcx = resume ? 1 : 0,
1848 };
1849
1850 return seamcall(TDH_PHYMEM_CACHE_WB, &args);
1851 }
1852 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
1853
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
1855 {
1856 struct tdx_module_args args = {};
1857
1858 args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
1859
1860 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1861 }
1862 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
1863
u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
1865 {
1866 struct tdx_module_args args = {};
1867
1868 args.rcx = mk_keyed_paddr(hkid, page);
1869
1870 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1871 }
1872 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
1873