// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt) "virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <linux/idr.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_IDA(tdx_guest_keyid_pool);

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

static struct tdx_sys_info tdx_sysinfo;

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
        pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
                                    struct tdx_module_args *args)
{
        seamcall_err(fn, err, args);
        pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
                        args->rcx, args->rdx, args->r8);
        pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
                        args->r9, args->r10, args->r11);
}

static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
                                 u64 fn, struct tdx_module_args *args)
{
        u64 sret = sc_retry(func, fn, args);

        if (sret == TDX_SUCCESS)
                return 0;

        if (sret == TDX_SEAMCALL_VMFAILINVALID)
                return -ENODEV;

        if (sret == TDX_SEAMCALL_GP)
                return -EOPNOTSUPP;

        if (sret == TDX_SEAMCALL_UD)
                return -EACCES;

        err_func(fn, sret, args);
        return -EIO;
}

#define seamcall_prerr(__fn, __args) \
        sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args) \
        sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
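 * The result is cached in sysinit_done/sysinit_ret, so only the first
 * caller actually issues TDH.SYS.INIT; later callers simply return the
 * cached result.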
 */
static int try_init_module_global(void)
{
        struct tdx_module_args args = {};
        static DEFINE_RAW_SPINLOCK(sysinit_lock);
        static bool sysinit_done;
        static int sysinit_ret;

        lockdep_assert_irqs_disabled();

        raw_spin_lock(&sysinit_lock);

        if (sysinit_done)
                goto out;

        /* RCX is module attributes and all bits are reserved */
        args.rcx = 0;
        sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

        /*
         * The first SEAMCALL also detects the TDX module, thus
         * it can fail because the TDX module is not loaded.
         * Dump a message to let the user know.
         */
        if (sysinit_ret == -ENODEV)
                pr_err("module not loaded\n");

        sysinit_done = true;
out:
        raw_spin_unlock(&sysinit_lock);
        return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu be ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
        struct tdx_module_args args = {};
        int ret;

        if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
                return -ENODEV;

        lockdep_assert_irqs_disabled();

        if (__this_cpu_read(tdx_lp_initialized))
                return 0;

        /*
         * The TDX module global initialization is the very first step
         * to enable TDX.  Need to do it first (if it hasn't been done)
         * before the per-cpu initialization.
         */
        ret = try_init_module_global();
        if (ret)
                return ret;

        ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
        if (ret)
                return ret;

        __this_cpu_write(tdx_lp_initialized, true);

        return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
                            unsigned long end_pfn, int nid)
{
        struct tdx_memblock *tmb;

        tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
        if (!tmb)
                return -ENOMEM;

        INIT_LIST_HEAD(&tmb->list);
        tmb->start_pfn = start_pfn;
        tmb->end_pfn = end_pfn;
        tmb->nid = nid;

        /* @tmb_list is protected by mem_hotplug_lock */
        list_add_tail(&tmb->list, tmb_list);
        return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
        /* @tmb_list is protected by mem_hotplug_lock */
        while (!list_empty(tmb_list)) {
                struct tdx_memblock *tmb = list_first_entry(tmb_list,
                                struct tdx_memblock, list);

                list_del(&tmb->list);
                kfree(tmb);
        }
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
        unsigned long start_pfn, end_pfn;
        int i, nid, ret;

        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
                /*
                 * The first 1MB is not reported as TDX convertible memory.
                 * Although the first 1MB is always reserved and won't end up
                 * in the page allocator, it is still in memblock's memory
                 * regions.  Skip them manually to exclude them as TDX memory.
                 */
                start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
                if (start_pfn >= end_pfn)
                        continue;

                /*
                 * Add the memory regions as TDX memory.  Memblock has
                 * already guaranteed that the regions are in address
                 * ascending order and don't overlap.
                 */
                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
                if (ret)
                        goto err;
        }

        return 0;
err:
        free_tdx_memlist(tmb_list);
        return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
        struct tdx_module_args args = {};
        int ret;

        /*
         * TDH.SYS.RD -- reads one global metadata field
         *  - RDX (in): the field to read
         *  - R8 (out): the field data
         */
        args.rdx = field_id;
        ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
        if (ret)
                return ret;

        *data = args.r8;

        return 0;
}

#include "tdx_global_metadata.c"

static int check_features(struct tdx_sys_info *sysinfo)
{
        u64 tdx_features0 = sysinfo->features.tdx_features0;

        if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
                pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
                return -EINVAL;
        }

        return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
        int tdmr_sz;

        /*
         * The actual size of TDMR depends on the maximum
         * number of reserved areas.
         */
        tdmr_sz = sizeof(struct tdmr_info);
        tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

        return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
                           struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
        size_t tdmr_sz, tdmr_array_sz;
        void *tdmr_array;

        tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
        tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;

        /*
         * To keep things simple, allocate all TDMRs together.
         * The buffer needs to be physically contiguous to make
         * sure each TDMR is physically contiguous.
         */
        tdmr_array = alloc_pages_exact(tdmr_array_sz,
                        GFP_KERNEL | __GFP_ZERO);
        if (!tdmr_array)
                return -ENOMEM;

        tdmr_list->tdmrs = tdmr_array;

        /*
         * Keep the size of TDMR to find the target TDMR
         * at a given index in the TDMR list.
         */
        tdmr_list->tdmr_sz = tdmr_sz;
        tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
        tdmr_list->nr_consumed_tdmrs = 0;

        return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
        free_pages_exact(tdmr_list->tdmrs,
                        tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/*
 * Get the TDMR from the list at the given index.
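 * Each entry is tdmr_list->tdmr_sz bytes, so the entry at @idx starts
 * at byte offset idx * tdmr_sz within the flat TDMR array.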
 */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
                                    int idx)
{
        int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

        return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT          SZ_1G
#define TDMR_ALIGN_DOWN(_addr)  ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)    ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
        return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
                          struct tdmr_info_list *tdmr_list)
{
        struct tdx_memblock *tmb;
        int tdmr_idx = 0;

        /*
         * Loop over TDX memory regions and fill out TDMRs to cover them.
         * To keep it simple, always try to use one TDMR to cover one
         * memory region.
         *
         * In practice TDX supports at least 64 TDMRs.  A 2-socket system
         * typically only consumes less than 10 of those.  This code is
         * dumb and simple and may use more TDMRs than is strictly
         * required.
         */
        list_for_each_entry(tmb, tmb_list, list) {
                struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
                u64 start, end;

                start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
                end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

                /*
                 * A valid size indicates the current TDMR has already
                 * been filled out to cover the previous memory region(s).
                 */
                if (tdmr->size) {
                        /*
                         * Loop to the next if the current memory region
                         * has already been fully covered.
                         */
                        if (end <= tdmr_end(tdmr))
                                continue;

                        /* Otherwise, skip the already covered part. */
                        if (start < tdmr_end(tdmr))
                                start = tdmr_end(tdmr);

                        /*
                         * Create a new TDMR to cover the current memory
                         * region, or the remaining part of it.
                         */
                        tdmr_idx++;
                        if (tdmr_idx >= tdmr_list->max_tdmrs) {
                                pr_warn("initialization failed: TDMRs exhausted.\n");
                                return -ENOSPC;
                        }

                        tdmr = tdmr_entry(tdmr_list, tdmr_idx);
                }

                tdmr->base = start;
                tdmr->size = end - start;
        }

        /* @tdmr_idx is always the index of the last valid TDMR. */
        tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

        /*
         * Warn early that the kernel is about to run out of TDMRs.
         *
         * This is an indication that TDMR allocation has to be
         * reworked to be smarter to not run into an issue.
         */
        if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
                pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
                                tdmr_list->nr_consumed_tdmrs,
                                tdmr_list->max_tdmrs);

        return 0;
}

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
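 *
 * For example, assuming a hypothetical 16-byte PAMT entry, a 1GB TDMR
 * needs (1GB / 4KB) * 16B = 4MB of PAMT for the 4K page size alone;
 * the real entry sizes come from the TDX module via @pamt_entry_size.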
445 */ 446 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, 447 u16 pamt_entry_size) 448 { 449 unsigned long pamt_sz, nr_pamt_entries; 450 451 switch (pgsz) { 452 case TDX_PS_4K: 453 nr_pamt_entries = tdmr->size >> PAGE_SHIFT; 454 break; 455 case TDX_PS_2M: 456 nr_pamt_entries = tdmr->size >> PMD_SHIFT; 457 break; 458 case TDX_PS_1G: 459 nr_pamt_entries = tdmr->size >> PUD_SHIFT; 460 break; 461 default: 462 WARN_ON_ONCE(1); 463 return 0; 464 } 465 466 pamt_sz = nr_pamt_entries * pamt_entry_size; 467 /* TDX requires PAMT size must be 4K aligned */ 468 pamt_sz = ALIGN(pamt_sz, PAGE_SIZE); 469 470 return pamt_sz; 471 } 472 473 /* 474 * Locate a NUMA node which should hold the allocation of the @tdmr 475 * PAMT. This node will have some memory covered by the TDMR. The 476 * relative amount of memory covered is not considered. 477 */ 478 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) 479 { 480 struct tdx_memblock *tmb; 481 482 /* 483 * A TDMR must cover at least part of one TMB. That TMB will end 484 * after the TDMR begins. But, that TMB may have started before 485 * the TDMR. Find the next 'tmb' that _ends_ after this TDMR 486 * begins. Ignore 'tmb' start addresses. They are irrelevant. 487 */ 488 list_for_each_entry(tmb, tmb_list, list) { 489 if (tmb->end_pfn > PHYS_PFN(tdmr->base)) 490 return tmb->nid; 491 } 492 493 /* 494 * Fall back to allocating the TDMR's metadata from node 0 when 495 * no TDX memory block can be found. This should never happen 496 * since TDMRs originate from TDX memory blocks. 497 */ 498 pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n", 499 tdmr->base, tdmr_end(tdmr)); 500 return 0; 501 } 502 503 /* 504 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list 505 * within @tdmr, and set up PAMTs for @tdmr. 506 */ 507 static int tdmr_set_up_pamt(struct tdmr_info *tdmr, 508 struct list_head *tmb_list, 509 u16 pamt_entry_size[]) 510 { 511 unsigned long pamt_base[TDX_PS_NR]; 512 unsigned long pamt_size[TDX_PS_NR]; 513 unsigned long tdmr_pamt_base; 514 unsigned long tdmr_pamt_size; 515 struct page *pamt; 516 int pgsz, nid; 517 518 nid = tdmr_get_nid(tdmr, tmb_list); 519 520 /* 521 * Calculate the PAMT size for each TDX supported page size 522 * and the total PAMT size. 523 */ 524 tdmr_pamt_size = 0; 525 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { 526 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, 527 pamt_entry_size[pgsz]); 528 tdmr_pamt_size += pamt_size[pgsz]; 529 } 530 531 /* 532 * Allocate one chunk of physically contiguous memory for all 533 * PAMTs. This helps minimize the PAMT's use of reserved areas 534 * in overlapped TDMRs. 535 */ 536 pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, 537 nid, &node_online_map); 538 if (!pamt) 539 return -ENOMEM; 540 541 /* 542 * Break the contiguous allocation back up into the 543 * individual PAMTs for each page size. 
544 */ 545 tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; 546 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { 547 pamt_base[pgsz] = tdmr_pamt_base; 548 tdmr_pamt_base += pamt_size[pgsz]; 549 } 550 551 tdmr->pamt_4k_base = pamt_base[TDX_PS_4K]; 552 tdmr->pamt_4k_size = pamt_size[TDX_PS_4K]; 553 tdmr->pamt_2m_base = pamt_base[TDX_PS_2M]; 554 tdmr->pamt_2m_size = pamt_size[TDX_PS_2M]; 555 tdmr->pamt_1g_base = pamt_base[TDX_PS_1G]; 556 tdmr->pamt_1g_size = pamt_size[TDX_PS_1G]; 557 558 return 0; 559 } 560 561 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, 562 unsigned long *pamt_size) 563 { 564 unsigned long pamt_bs, pamt_sz; 565 566 /* 567 * The PAMT was allocated in one contiguous unit. The 4K PAMT 568 * should always point to the beginning of that allocation. 569 */ 570 pamt_bs = tdmr->pamt_4k_base; 571 pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size; 572 573 WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK)); 574 575 *pamt_base = pamt_bs; 576 *pamt_size = pamt_sz; 577 } 578 579 static void tdmr_do_pamt_func(struct tdmr_info *tdmr, 580 void (*pamt_func)(unsigned long base, unsigned long size)) 581 { 582 unsigned long pamt_base, pamt_size; 583 584 tdmr_get_pamt(tdmr, &pamt_base, &pamt_size); 585 586 /* Do nothing if PAMT hasn't been allocated for this TDMR */ 587 if (!pamt_size) 588 return; 589 590 if (WARN_ON_ONCE(!pamt_base)) 591 return; 592 593 pamt_func(pamt_base, pamt_size); 594 } 595 596 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size) 597 { 598 free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); 599 } 600 601 static void tdmr_free_pamt(struct tdmr_info *tdmr) 602 { 603 tdmr_do_pamt_func(tdmr, free_pamt); 604 } 605 606 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) 607 { 608 int i; 609 610 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) 611 tdmr_free_pamt(tdmr_entry(tdmr_list, i)); 612 } 613 614 /* Allocate and set up PAMTs for all TDMRs */ 615 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, 616 struct list_head *tmb_list, 617 u16 pamt_entry_size[]) 618 { 619 int i, ret = 0; 620 621 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 622 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list, 623 pamt_entry_size); 624 if (ret) 625 goto err; 626 } 627 628 return 0; 629 err: 630 tdmrs_free_pamt_all(tdmr_list); 631 return ret; 632 } 633 634 /* 635 * Convert TDX private pages back to normal by using MOVDIR64B to 636 * clear these pages. Note this function doesn't flush cache of 637 * these TDX private pages. The caller should make sure of that. 638 */ 639 static void reset_tdx_pages(unsigned long base, unsigned long size) 640 { 641 const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); 642 unsigned long phys, end; 643 644 end = base + size; 645 for (phys = base; phys < end; phys += 64) 646 movdir64b(__va(phys), zero_page); 647 648 /* 649 * MOVDIR64B uses WC protocol. Use memory barrier to 650 * make sure any later user of these pages sees the 651 * updated data. 
652 */ 653 mb(); 654 } 655 656 static void tdmr_reset_pamt(struct tdmr_info *tdmr) 657 { 658 tdmr_do_pamt_func(tdmr, reset_tdx_pages); 659 } 660 661 static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list) 662 { 663 int i; 664 665 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) 666 tdmr_reset_pamt(tdmr_entry(tdmr_list, i)); 667 } 668 669 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) 670 { 671 unsigned long pamt_size = 0; 672 int i; 673 674 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 675 unsigned long base, size; 676 677 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); 678 pamt_size += size; 679 } 680 681 return pamt_size / 1024; 682 } 683 684 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, 685 u64 size, u16 max_reserved_per_tdmr) 686 { 687 struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas; 688 int idx = *p_idx; 689 690 /* Reserved area must be 4K aligned in offset and size */ 691 if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK)) 692 return -EINVAL; 693 694 if (idx >= max_reserved_per_tdmr) { 695 pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n", 696 tdmr->base, tdmr_end(tdmr)); 697 return -ENOSPC; 698 } 699 700 /* 701 * Consume one reserved area per call. Make no effort to 702 * optimize or reduce the number of reserved areas which are 703 * consumed by contiguous reserved areas, for instance. 704 */ 705 rsvd_areas[idx].offset = addr - tdmr->base; 706 rsvd_areas[idx].size = size; 707 708 *p_idx = idx + 1; 709 710 return 0; 711 } 712 713 /* 714 * Go through @tmb_list to find holes between memory areas. If any of 715 * those holes fall within @tdmr, set up a TDMR reserved area to cover 716 * the hole. 717 */ 718 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, 719 struct tdmr_info *tdmr, 720 int *rsvd_idx, 721 u16 max_reserved_per_tdmr) 722 { 723 struct tdx_memblock *tmb; 724 u64 prev_end; 725 int ret; 726 727 /* 728 * Start looking for reserved blocks at the 729 * beginning of the TDMR. 730 */ 731 prev_end = tdmr->base; 732 list_for_each_entry(tmb, tmb_list, list) { 733 u64 start, end; 734 735 start = PFN_PHYS(tmb->start_pfn); 736 end = PFN_PHYS(tmb->end_pfn); 737 738 /* Break if this region is after the TDMR */ 739 if (start >= tdmr_end(tdmr)) 740 break; 741 742 /* Exclude regions before this TDMR */ 743 if (end < tdmr->base) 744 continue; 745 746 /* 747 * Skip over memory areas that 748 * have already been dealt with. 749 */ 750 if (start <= prev_end) { 751 prev_end = end; 752 continue; 753 } 754 755 /* Add the hole before this region */ 756 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, 757 start - prev_end, 758 max_reserved_per_tdmr); 759 if (ret) 760 return ret; 761 762 prev_end = end; 763 } 764 765 /* Add the hole after the last region if it exists. */ 766 if (prev_end < tdmr_end(tdmr)) { 767 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, 768 tdmr_end(tdmr) - prev_end, 769 max_reserved_per_tdmr); 770 if (ret) 771 return ret; 772 } 773 774 return 0; 775 } 776 777 /* 778 * Go through @tdmr_list to find all PAMTs. If any of those PAMTs 779 * overlaps with @tdmr, set up a TDMR reserved area to cover the 780 * overlapping part. 
781 */ 782 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, 783 struct tdmr_info *tdmr, 784 int *rsvd_idx, 785 u16 max_reserved_per_tdmr) 786 { 787 int i, ret; 788 789 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 790 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i); 791 unsigned long pamt_base, pamt_size, pamt_end; 792 793 tdmr_get_pamt(tmp, &pamt_base, &pamt_size); 794 /* Each TDMR must already have PAMT allocated */ 795 WARN_ON_ONCE(!pamt_size || !pamt_base); 796 797 pamt_end = pamt_base + pamt_size; 798 /* Skip PAMTs outside of the given TDMR */ 799 if ((pamt_end <= tdmr->base) || 800 (pamt_base >= tdmr_end(tdmr))) 801 continue; 802 803 /* Only mark the part within the TDMR as reserved */ 804 if (pamt_base < tdmr->base) 805 pamt_base = tdmr->base; 806 if (pamt_end > tdmr_end(tdmr)) 807 pamt_end = tdmr_end(tdmr); 808 809 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base, 810 pamt_end - pamt_base, 811 max_reserved_per_tdmr); 812 if (ret) 813 return ret; 814 } 815 816 return 0; 817 } 818 819 /* Compare function called by sort() for TDMR reserved areas */ 820 static int rsvd_area_cmp_func(const void *a, const void *b) 821 { 822 struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; 823 struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; 824 825 if (r1->offset + r1->size <= r2->offset) 826 return -1; 827 if (r1->offset >= r2->offset + r2->size) 828 return 1; 829 830 /* Reserved areas cannot overlap. The caller must guarantee. */ 831 WARN_ON_ONCE(1); 832 return -1; 833 } 834 835 /* 836 * Populate reserved areas for the given @tdmr, including memory holes 837 * (via @tmb_list) and PAMTs (via @tdmr_list). 838 */ 839 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, 840 struct list_head *tmb_list, 841 struct tdmr_info_list *tdmr_list, 842 u16 max_reserved_per_tdmr) 843 { 844 int ret, rsvd_idx = 0; 845 846 ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx, 847 max_reserved_per_tdmr); 848 if (ret) 849 return ret; 850 851 ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx, 852 max_reserved_per_tdmr); 853 if (ret) 854 return ret; 855 856 /* TDX requires reserved areas listed in address ascending order */ 857 sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area), 858 rsvd_area_cmp_func, NULL); 859 860 return 0; 861 } 862 863 /* 864 * Populate reserved areas for all TDMRs in @tdmr_list, including memory 865 * holes (via @tmb_list) and PAMTs. 866 */ 867 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, 868 struct list_head *tmb_list, 869 u16 max_reserved_per_tdmr) 870 { 871 int i; 872 873 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 874 int ret; 875 876 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i), 877 tmb_list, tdmr_list, max_reserved_per_tdmr); 878 if (ret) 879 return ret; 880 } 881 882 return 0; 883 } 884 885 /* 886 * Construct a list of TDMRs on the preallocated space in @tdmr_list 887 * to cover all TDX memory regions in @tmb_list based on the TDX module 888 * TDMR global information in @sysinfo_tdmr. 
 */
static int construct_tdmrs(struct list_head *tmb_list,
                           struct tdmr_info_list *tdmr_list,
                           struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
        u16 pamt_entry_size[TDX_PS_NR] = {
                sysinfo_tdmr->pamt_4k_entry_size,
                sysinfo_tdmr->pamt_2m_entry_size,
                sysinfo_tdmr->pamt_1g_entry_size,
        };
        int ret;

        ret = fill_out_tdmrs(tmb_list, tdmr_list);
        if (ret)
                return ret;

        ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
        if (ret)
                return ret;

        ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
                        sysinfo_tdmr->max_reserved_per_tdmr);
        if (ret)
                tdmrs_free_pamt_all(tdmr_list);

        /*
         * The tdmr_info_list is read-only from here on out.
         * Ensure that these writes are seen by other CPUs.
         * Pairs with a smp_rmb() in is_pamt_page().
         */
        smp_wmb();

        return ret;
}

static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
        struct tdx_module_args args = {};
        u64 *tdmr_pa_array;
        size_t array_sz;
        int i, ret;

        /*
         * TDMRs are passed to the TDX module via an array of physical
         * addresses of each TDMR.  The array itself also has certain
         * alignment requirement.
         */
        array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
        array_sz = roundup_pow_of_two(array_sz);
        if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
                array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

        tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
        if (!tdmr_pa_array)
                return -ENOMEM;

        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
                tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

        args.rcx = __pa(tdmr_pa_array);
        args.rdx = tdmr_list->nr_consumed_tdmrs;
        args.r8 = global_keyid;
        ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

        /* Free the array as it is not required anymore. */
        kfree(tdmr_pa_array);

        return ret;
}

static int do_global_key_config(void *unused)
{
        struct tdx_module_args args = {};

        return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has no
 * online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
        cpumask_var_t packages;
        int cpu, ret = -EINVAL;

        if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
                return -ENOMEM;

        /*
         * Hardware doesn't guarantee cache coherency across different
         * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
         * (associated with KeyID 0) before the TDX module can use the
         * global KeyID to access the PAMT.  Given PAMTs are potentially
         * large (~1/256th of system RAM), just use WBINVD.
         */
        wbinvd_on_all_cpus();

        for_each_online_cpu(cpu) {
                /*
                 * The key configuration only needs to be done once per
                 * package and will return an error if configured more
                 * than once.  Avoid doing it multiple times per package.
                 */
                if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
                                packages))
                        continue;

                /*
                 * TDH.SYS.KEY.CONFIG cannot run concurrently on
                 * different cpus.  Do it one by one.
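                 * smp_call_on_cpu() runs do_global_key_config()
                 * synchronously on the chosen CPU, so packages are
                 * configured strictly one at a time.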
1007 */ 1008 ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true); 1009 if (ret) 1010 break; 1011 } 1012 1013 free_cpumask_var(packages); 1014 return ret; 1015 } 1016 1017 static int init_tdmr(struct tdmr_info *tdmr) 1018 { 1019 u64 next; 1020 1021 /* 1022 * Initializing a TDMR can be time consuming. To avoid long 1023 * SEAMCALLs, the TDX module may only initialize a part of the 1024 * TDMR in each call. 1025 */ 1026 do { 1027 struct tdx_module_args args = { 1028 .rcx = tdmr->base, 1029 }; 1030 int ret; 1031 1032 ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args); 1033 if (ret) 1034 return ret; 1035 /* 1036 * RDX contains 'next-to-initialize' address if 1037 * TDH.SYS.TDMR.INIT did not fully complete and 1038 * should be retried. 1039 */ 1040 next = args.rdx; 1041 cond_resched(); 1042 /* Keep making SEAMCALLs until the TDMR is done */ 1043 } while (next < tdmr->base + tdmr->size); 1044 1045 return 0; 1046 } 1047 1048 static int init_tdmrs(struct tdmr_info_list *tdmr_list) 1049 { 1050 int i; 1051 1052 /* 1053 * This operation is costly. It can be parallelized, 1054 * but keep it simple for now. 1055 */ 1056 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 1057 int ret; 1058 1059 ret = init_tdmr(tdmr_entry(tdmr_list, i)); 1060 if (ret) 1061 return ret; 1062 } 1063 1064 return 0; 1065 } 1066 1067 static int init_tdx_module(void) 1068 { 1069 int ret; 1070 1071 ret = get_tdx_sys_info(&tdx_sysinfo); 1072 if (ret) 1073 return ret; 1074 1075 /* Check whether the kernel can support this module */ 1076 ret = check_features(&tdx_sysinfo); 1077 if (ret) 1078 return ret; 1079 1080 /* 1081 * To keep things simple, assume that all TDX-protected memory 1082 * will come from the page allocator. Make sure all pages in the 1083 * page allocator are TDX-usable memory. 1084 * 1085 * Build the list of "TDX-usable" memory regions which cover all 1086 * pages in the page allocator to guarantee that. Do it while 1087 * holding mem_hotplug_lock read-lock as the memory hotplug code 1088 * path reads the @tdx_memlist to reject any new memory. 1089 */ 1090 get_online_mems(); 1091 1092 ret = build_tdx_memlist(&tdx_memlist); 1093 if (ret) 1094 goto out_put_tdxmem; 1095 1096 /* Allocate enough space for constructing TDMRs */ 1097 ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr); 1098 if (ret) 1099 goto err_free_tdxmem; 1100 1101 /* Cover all TDX-usable memory regions in TDMRs */ 1102 ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr); 1103 if (ret) 1104 goto err_free_tdmrs; 1105 1106 /* Pass the TDMRs and the global KeyID to the TDX module */ 1107 ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid); 1108 if (ret) 1109 goto err_free_pamts; 1110 1111 /* Config the key of global KeyID on all packages */ 1112 ret = config_global_keyid(); 1113 if (ret) 1114 goto err_reset_pamts; 1115 1116 /* Initialize TDMRs to complete the TDX module initialization */ 1117 ret = init_tdmrs(&tdx_tdmr_list); 1118 if (ret) 1119 goto err_reset_pamts; 1120 1121 pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list)); 1122 1123 out_put_tdxmem: 1124 /* 1125 * @tdx_memlist is written here and read at memory hotplug time. 1126 * Lock out memory hotplug code while building it. 1127 */ 1128 put_online_mems(); 1129 return ret; 1130 1131 err_reset_pamts: 1132 /* 1133 * Part of PAMTs may already have been initialized by the 1134 * TDX module. Flush cache before returning PAMTs back 1135 * to the kernel. 
1136 */ 1137 wbinvd_on_all_cpus(); 1138 /* 1139 * According to the TDX hardware spec, if the platform 1140 * doesn't have the "partial write machine check" 1141 * erratum, any kernel read/write will never cause #MC 1142 * in kernel space, thus it's OK to not convert PAMTs 1143 * back to normal. But do the conversion anyway here 1144 * as suggested by the TDX spec. 1145 */ 1146 tdmrs_reset_pamt_all(&tdx_tdmr_list); 1147 err_free_pamts: 1148 tdmrs_free_pamt_all(&tdx_tdmr_list); 1149 err_free_tdmrs: 1150 free_tdmr_list(&tdx_tdmr_list); 1151 err_free_tdxmem: 1152 free_tdx_memlist(&tdx_memlist); 1153 goto out_put_tdxmem; 1154 } 1155 1156 static int __tdx_enable(void) 1157 { 1158 int ret; 1159 1160 ret = init_tdx_module(); 1161 if (ret) { 1162 pr_err("module initialization failed (%d)\n", ret); 1163 tdx_module_status = TDX_MODULE_ERROR; 1164 return ret; 1165 } 1166 1167 pr_info("module initialized\n"); 1168 tdx_module_status = TDX_MODULE_INITIALIZED; 1169 1170 return 0; 1171 } 1172 1173 /** 1174 * tdx_enable - Enable TDX module to make it ready to run TDX guests 1175 * 1176 * This function assumes the caller has: 1) held read lock of CPU hotplug 1177 * lock to prevent any new cpu from becoming online; 2) done both VMXON 1178 * and tdx_cpu_enable() on all online cpus. 1179 * 1180 * This function requires there's at least one online cpu for each CPU 1181 * package to succeed. 1182 * 1183 * This function can be called in parallel by multiple callers. 1184 * 1185 * Return 0 if TDX is enabled successfully, otherwise error. 1186 */ 1187 int tdx_enable(void) 1188 { 1189 int ret; 1190 1191 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) 1192 return -ENODEV; 1193 1194 lockdep_assert_cpus_held(); 1195 1196 mutex_lock(&tdx_module_lock); 1197 1198 switch (tdx_module_status) { 1199 case TDX_MODULE_UNINITIALIZED: 1200 ret = __tdx_enable(); 1201 break; 1202 case TDX_MODULE_INITIALIZED: 1203 /* Already initialized, great, tell the caller. */ 1204 ret = 0; 1205 break; 1206 default: 1207 /* Failed to initialize in the previous attempts */ 1208 ret = -EINVAL; 1209 break; 1210 } 1211 1212 mutex_unlock(&tdx_module_lock); 1213 1214 return ret; 1215 } 1216 EXPORT_SYMBOL_GPL(tdx_enable); 1217 1218 static bool is_pamt_page(unsigned long phys) 1219 { 1220 struct tdmr_info_list *tdmr_list = &tdx_tdmr_list; 1221 int i; 1222 1223 /* Ensure that all remote 'tdmr_list' writes are visible: */ 1224 smp_rmb(); 1225 1226 /* 1227 * The TDX module is no longer returning TDX_SYS_NOT_READY and 1228 * is initialized. The 'tdmr_list' was initialized long ago 1229 * and is now read-only. 1230 */ 1231 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 1232 unsigned long base, size; 1233 1234 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); 1235 1236 if (phys >= base && phys < (base + size)) 1237 return true; 1238 } 1239 1240 return false; 1241 } 1242 1243 /* 1244 * Return whether the memory page at the given physical address is TDX 1245 * private memory or not. 1246 * 1247 * This can be imprecise for two known reasons: 1248 * 1. PAMTs are private memory and exist before the TDX module is 1249 * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively 1250 * short window that occurs once per boot. 1251 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the 1252 * page. However, the page can still cause #MC until it has been 1253 * fully converted to shared using 64-byte writes like MOVDIR64B. 1254 * Buggy hosts might still leave #MC-causing memory in place which 1255 * this function can not detect. 
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
        struct tdx_module_args args = {
                .rcx = phys & PAGE_MASK,
        };
        u64 sret;

        if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
                return false;

        /* Get page type from the TDX module */
        sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);

        /*
         * The SEAMCALL will not return success unless there is a
         * working, "ready" TDX module.  Assume an absence of TDX
         * private pages until SEAMCALL is working.
         */
        if (sret)
                return false;

        /*
         * SEAMCALL was successful -- read page type (via RCX):
         *
         *  - PT_NDA:   Page is not used by the TDX module
         *  - PT_RSVD:  Reserved for Non-TDX use
         *  - Others:   Page is used by the TDX module
         *
         * Note PAMT pages are marked as PT_RSVD but they are also TDX
         * private memory.
         */
        switch (args.rcx) {
        case PT_NDA:
                return false;
        case PT_RSVD:
                return is_pamt_page(phys);
        default:
                return true;
        }
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from normal hardware ones.
 * Just print an additional message to show that such an #MC may be
 * the result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
        if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
                return NULL;

        if (!paddr_is_tdx_private(m->addr))
                return NULL;

        return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
                                            u32 *nr_tdx_keyids)
{
        u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
        int ret;

        /*
         * IA32_MKTME_KEYID_PARTITIONING:
         *   Bit [31:0]:  Number of MKTME KeyIDs.
         *   Bit [63:32]: Number of TDX private KeyIDs.
         */
        ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
                        &_nr_tdx_keyids);
        if (ret || !_nr_tdx_keyids)
                return -EINVAL;

        /* TDX KeyIDs start after the last MKTME KeyID. */
        _tdx_keyid_start = _nr_mktme_keyids + 1;

        *tdx_keyid_start = _tdx_keyid_start;
        *nr_tdx_keyids = _nr_tdx_keyids;

        return 0;
}

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
        struct tdx_memblock *tmb;

        /*
         * This check assumes that the start_pfn<->end_pfn range does not
         * cross multiple @tdx_memlist entries.  A single memory online
         * event across multiple memblocks (from which @tdx_memlist
         * entries are derived at the time of module initialization) is
         * not possible.  This is because memory offline/online is done
         * on the granularity of 'struct memory_block', and the
         * hotpluggable memory region (one memblock) must be a multiple
         * of memory_block.
         */
        list_for_each_entry(tmb, &tdx_memlist, list) {
                if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
                        return true;
        }
        return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
                               void *v)
{
        struct memory_notify *mn = v;

        if (action != MEM_GOING_ONLINE)
                return NOTIFY_OK;

        /*
         * Empty list means TDX isn't enabled.  Allow any memory
         * to go online.
1374 */ 1375 if (list_empty(&tdx_memlist)) 1376 return NOTIFY_OK; 1377 1378 /* 1379 * The TDX memory configuration is static and can not be 1380 * changed. Reject onlining any memory which is outside of 1381 * the static configuration whether it supports TDX or not. 1382 */ 1383 if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages)) 1384 return NOTIFY_OK; 1385 1386 return NOTIFY_BAD; 1387 } 1388 1389 static struct notifier_block tdx_memory_nb = { 1390 .notifier_call = tdx_memory_notifier, 1391 }; 1392 1393 static void __init check_tdx_erratum(void) 1394 { 1395 /* 1396 * These CPUs have an erratum. A partial write from non-TD 1397 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX 1398 * private memory poisons that memory, and a subsequent read of 1399 * that memory triggers #MC. 1400 */ 1401 switch (boot_cpu_data.x86_vfm) { 1402 case INTEL_SAPPHIRERAPIDS_X: 1403 case INTEL_EMERALDRAPIDS_X: 1404 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE); 1405 } 1406 } 1407 1408 void __init tdx_init(void) 1409 { 1410 u32 tdx_keyid_start, nr_tdx_keyids; 1411 int err; 1412 1413 err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids); 1414 if (err) 1415 return; 1416 1417 pr_info("BIOS enabled: private KeyID range [%u, %u)\n", 1418 tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids); 1419 1420 /* 1421 * The TDX module itself requires one 'global KeyID' to protect 1422 * its metadata. If there's only one TDX KeyID, there won't be 1423 * any left for TDX guests thus there's no point to enable TDX 1424 * at all. 1425 */ 1426 if (nr_tdx_keyids < 2) { 1427 pr_err("initialization failed: too few private KeyIDs available.\n"); 1428 return; 1429 } 1430 1431 /* 1432 * At this point, hibernation_available() indicates whether or 1433 * not hibernation support has been permanently disabled. 1434 */ 1435 if (hibernation_available()) { 1436 pr_err("initialization failed: Hibernation support is enabled\n"); 1437 return; 1438 } 1439 1440 err = register_memory_notifier(&tdx_memory_nb); 1441 if (err) { 1442 pr_err("initialization failed: register_memory_notifier() failed (%d)\n", 1443 err); 1444 return; 1445 } 1446 1447 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) 1448 pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n"); 1449 acpi_suspend_lowlevel = NULL; 1450 #endif 1451 1452 /* 1453 * Just use the first TDX KeyID as the 'global KeyID' and 1454 * leave the rest for TDX guests. 
1455 */ 1456 tdx_global_keyid = tdx_keyid_start; 1457 tdx_guest_keyid_start = tdx_keyid_start + 1; 1458 tdx_nr_guest_keyids = nr_tdx_keyids - 1; 1459 1460 setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM); 1461 1462 check_tdx_erratum(); 1463 } 1464 1465 const struct tdx_sys_info *tdx_get_sysinfo(void) 1466 { 1467 const struct tdx_sys_info *p = NULL; 1468 1469 /* Make sure all fields in @tdx_sysinfo have been populated */ 1470 mutex_lock(&tdx_module_lock); 1471 if (tdx_module_status == TDX_MODULE_INITIALIZED) 1472 p = (const struct tdx_sys_info *)&tdx_sysinfo; 1473 mutex_unlock(&tdx_module_lock); 1474 1475 return p; 1476 } 1477 EXPORT_SYMBOL_GPL(tdx_get_sysinfo); 1478 1479 u32 tdx_get_nr_guest_keyids(void) 1480 { 1481 return tdx_nr_guest_keyids; 1482 } 1483 EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); 1484 1485 int tdx_guest_keyid_alloc(void) 1486 { 1487 return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, 1488 tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, 1489 GFP_KERNEL); 1490 } 1491 EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc); 1492 1493 void tdx_guest_keyid_free(unsigned int keyid) 1494 { 1495 ida_free(&tdx_guest_keyid_pool, keyid); 1496 } 1497 EXPORT_SYMBOL_GPL(tdx_guest_keyid_free); 1498 1499 static inline u64 tdx_tdr_pa(struct tdx_td *td) 1500 { 1501 return page_to_phys(td->tdr_page); 1502 } 1503 1504 static inline u64 tdx_tdvpr_pa(struct tdx_vp *td) 1505 { 1506 return page_to_phys(td->tdvpr_page); 1507 } 1508 1509 /* 1510 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether 1511 * a CLFLUSH of pages is required before handing them to the TDX module. 1512 * Be conservative and make the code simpler by doing the CLFLUSH 1513 * unconditionally. 1514 */ 1515 static void tdx_clflush_page(struct page *page) 1516 { 1517 clflush_cache_range(page_to_virt(page), PAGE_SIZE); 1518 } 1519 1520 noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) 1521 { 1522 args->rcx = tdx_tdvpr_pa(td); 1523 1524 return __seamcall_saved_ret(TDH_VP_ENTER, args); 1525 } 1526 EXPORT_SYMBOL_GPL(tdh_vp_enter); 1527 1528 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) 1529 { 1530 struct tdx_module_args args = { 1531 .rcx = page_to_phys(tdcs_page), 1532 .rdx = tdx_tdr_pa(td), 1533 }; 1534 1535 tdx_clflush_page(tdcs_page); 1536 return seamcall(TDH_MNG_ADDCX, &args); 1537 } 1538 EXPORT_SYMBOL_GPL(tdh_mng_addcx); 1539 1540 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) 1541 { 1542 struct tdx_module_args args = { 1543 .rcx = gpa, 1544 .rdx = tdx_tdr_pa(td), 1545 .r8 = page_to_phys(page), 1546 .r9 = page_to_phys(source), 1547 }; 1548 u64 ret; 1549 1550 tdx_clflush_page(page); 1551 ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); 1552 1553 *ext_err1 = args.rcx; 1554 *ext_err2 = args.rdx; 1555 1556 return ret; 1557 } 1558 EXPORT_SYMBOL_GPL(tdh_mem_page_add); 1559 1560 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) 1561 { 1562 struct tdx_module_args args = { 1563 .rcx = gpa | level, 1564 .rdx = tdx_tdr_pa(td), 1565 .r8 = page_to_phys(page), 1566 }; 1567 u64 ret; 1568 1569 tdx_clflush_page(page); 1570 ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); 1571 1572 *ext_err1 = args.rcx; 1573 *ext_err2 = args.rdx; 1574 1575 return ret; 1576 } 1577 EXPORT_SYMBOL_GPL(tdh_mem_sept_add); 1578 1579 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) 1580 { 1581 struct tdx_module_args args = { 1582 .rcx = page_to_phys(tdcx_page), 
1583 .rdx = tdx_tdvpr_pa(vp), 1584 }; 1585 1586 tdx_clflush_page(tdcx_page); 1587 return seamcall(TDH_VP_ADDCX, &args); 1588 } 1589 EXPORT_SYMBOL_GPL(tdh_vp_addcx); 1590 1591 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) 1592 { 1593 struct tdx_module_args args = { 1594 .rcx = gpa | level, 1595 .rdx = tdx_tdr_pa(td), 1596 .r8 = page_to_phys(page), 1597 }; 1598 u64 ret; 1599 1600 tdx_clflush_page(page); 1601 ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); 1602 1603 *ext_err1 = args.rcx; 1604 *ext_err2 = args.rdx; 1605 1606 return ret; 1607 } 1608 EXPORT_SYMBOL_GPL(tdh_mem_page_aug); 1609 1610 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) 1611 { 1612 struct tdx_module_args args = { 1613 .rcx = gpa | level, 1614 .rdx = tdx_tdr_pa(td), 1615 }; 1616 u64 ret; 1617 1618 ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); 1619 1620 *ext_err1 = args.rcx; 1621 *ext_err2 = args.rdx; 1622 1623 return ret; 1624 } 1625 EXPORT_SYMBOL_GPL(tdh_mem_range_block); 1626 1627 u64 tdh_mng_key_config(struct tdx_td *td) 1628 { 1629 struct tdx_module_args args = { 1630 .rcx = tdx_tdr_pa(td), 1631 }; 1632 1633 return seamcall(TDH_MNG_KEY_CONFIG, &args); 1634 } 1635 EXPORT_SYMBOL_GPL(tdh_mng_key_config); 1636 1637 u64 tdh_mng_create(struct tdx_td *td, u16 hkid) 1638 { 1639 struct tdx_module_args args = { 1640 .rcx = tdx_tdr_pa(td), 1641 .rdx = hkid, 1642 }; 1643 1644 tdx_clflush_page(td->tdr_page); 1645 return seamcall(TDH_MNG_CREATE, &args); 1646 } 1647 EXPORT_SYMBOL_GPL(tdh_mng_create); 1648 1649 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) 1650 { 1651 struct tdx_module_args args = { 1652 .rcx = tdx_tdvpr_pa(vp), 1653 .rdx = tdx_tdr_pa(td), 1654 }; 1655 1656 tdx_clflush_page(vp->tdvpr_page); 1657 return seamcall(TDH_VP_CREATE, &args); 1658 } 1659 EXPORT_SYMBOL_GPL(tdh_vp_create); 1660 1661 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) 1662 { 1663 struct tdx_module_args args = { 1664 .rcx = tdx_tdr_pa(td), 1665 .rdx = field, 1666 }; 1667 u64 ret; 1668 1669 ret = seamcall_ret(TDH_MNG_RD, &args); 1670 1671 /* R8: Content of the field, or 0 in case of error. 
 */
        *data = args.r8;

        return ret;
}
EXPORT_SYMBOL_GPL(tdh_mng_rd);

u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
{
        struct tdx_module_args args = {
                .rcx = gpa,
                .rdx = tdx_tdr_pa(td),
        };
        u64 ret;

        ret = seamcall_ret(TDH_MR_EXTEND, &args);

        *ext_err1 = args.rcx;
        *ext_err2 = args.rdx;

        return ret;
}
EXPORT_SYMBOL_GPL(tdh_mr_extend);

u64 tdh_mr_finalize(struct tdx_td *td)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdr_pa(td),
        };

        return seamcall(TDH_MR_FINALIZE, &args);
}
EXPORT_SYMBOL_GPL(tdh_mr_finalize);

u64 tdh_vp_flush(struct tdx_vp *vp)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdvpr_pa(vp),
        };

        return seamcall(TDH_VP_FLUSH, &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_flush);

u64 tdh_mng_vpflushdone(struct tdx_td *td)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdr_pa(td),
        };

        return seamcall(TDH_MNG_VPFLUSHDONE, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);

u64 tdh_mng_key_freeid(struct tdx_td *td)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdr_pa(td),
        };

        return seamcall(TDH_MNG_KEY_FREEID, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);

u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdr_pa(td),
                .rdx = td_params,
        };
        u64 ret;

        ret = seamcall_ret(TDH_MNG_INIT, &args);

        *extended_err = args.rcx;

        return ret;
}
EXPORT_SYMBOL_GPL(tdh_mng_init);

u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdvpr_pa(vp),
                .rdx = field,
        };
        u64 ret;

        ret = seamcall_ret(TDH_VP_RD, &args);

        /* R8: Content of the field, or 0 in case of error. */
        *data = args.r8;

        return ret;
}
EXPORT_SYMBOL_GPL(tdh_vp_rd);

u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdvpr_pa(vp),
                .rdx = field,
                .r8 = data,
                .r9 = mask,
        };

        return seamcall(TDH_VP_WR, &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_wr);

u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
{
        struct tdx_module_args args = {
                .rcx = tdx_tdvpr_pa(vp),
                .rdx = initial_rcx,
                .r8 = x2apicid,
        };

        /* apicid requires version == 1. */
        return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_init);

/*
 * TDX ABI defines output operands as PT, OWNER and SIZE.  These are
 * TDX-defined formats.  So despite the names, they must be interpreted
 * specially as described by the spec.  Return them only for error
 * reporting purposes.
1798 */ 1799 u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) 1800 { 1801 struct tdx_module_args args = { 1802 .rcx = page_to_phys(page), 1803 }; 1804 u64 ret; 1805 1806 ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); 1807 1808 *tdx_pt = args.rcx; 1809 *tdx_owner = args.rdx; 1810 *tdx_size = args.r8; 1811 1812 return ret; 1813 } 1814 EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); 1815 1816 u64 tdh_mem_track(struct tdx_td *td) 1817 { 1818 struct tdx_module_args args = { 1819 .rcx = tdx_tdr_pa(td), 1820 }; 1821 1822 return seamcall(TDH_MEM_TRACK, &args); 1823 } 1824 EXPORT_SYMBOL_GPL(tdh_mem_track); 1825 1826 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) 1827 { 1828 struct tdx_module_args args = { 1829 .rcx = gpa | level, 1830 .rdx = tdx_tdr_pa(td), 1831 }; 1832 u64 ret; 1833 1834 ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); 1835 1836 *ext_err1 = args.rcx; 1837 *ext_err2 = args.rdx; 1838 1839 return ret; 1840 } 1841 EXPORT_SYMBOL_GPL(tdh_mem_page_remove); 1842 1843 u64 tdh_phymem_cache_wb(bool resume) 1844 { 1845 struct tdx_module_args args = { 1846 .rcx = resume ? 1 : 0, 1847 }; 1848 1849 return seamcall(TDH_PHYMEM_CACHE_WB, &args); 1850 } 1851 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); 1852 1853 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) 1854 { 1855 struct tdx_module_args args = {}; 1856 1857 args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); 1858 1859 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1860 } 1861 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); 1862 1863 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) 1864 { 1865 struct tdx_module_args args = {}; 1866 1867 args.rcx = mk_keyed_paddr(hkid, page); 1868 1869 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1870 } 1871 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); 1872
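/*
 * Illustrative sketch only (not built): the bring-up sequence a caller
 * such as KVM is expected to follow, per the tdx_cpu_enable() and
 * tdx_enable() contracts documented above.  enable_vmx_and_tdx() is a
 * hypothetical helper that does VMXON and then calls tdx_cpu_enable()
 * from the IPI context; error handling is omitted.
 *
 *      cpus_read_lock();
 *      on_each_cpu(enable_vmx_and_tdx, NULL, 1);
 *      ret = tdx_enable();
 *      cpus_read_unlock();
 */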