// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <linux/idr.h>
#include <asm/page_types.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_IDA(tdx_guest_keyid_pool);

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

static struct tdx_sys_info tdx_sysinfo;

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
	       args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
	       args->r9, args->r10, args->r11);
}

static __always_inline int sc_retry_prerr(sc_func_t func,
					  sc_err_func_t err_func,
					  u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args)						\
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args)					\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
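/*
 * Illustrative usage sketch (not ABI documentation): the seamcall_prerr*()
 * wrappers above fold the raw SEAMCALL status into an errno (-ENODEV,
 * -EOPNOTSUPP, -EACCES or -EIO) so callers can use normal kernel error
 * handling:
 *
 *	struct tdx_module_args args = {};
 *	int ret = seamcall_prerr(TDH_SYS_INIT, &args);
 *
 *	if (ret)
 *		return ret;
 *
 * Only unrecognized failures are dumped via the err_func; the recognized
 * cases are mapped to distinct errnos without logging.
 */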
/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail if the TDX module is not loaded.  Dump a
	 * message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Need to do it first (if it hasn't been done)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip them manually to exclude them as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  memblock has
		 * already guaranteed the regions are in address ascending
		 * order and don't overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

#include "tdx_global_metadata.c"

static int check_features(struct tdx_sys_info *sysinfo)
{
	u64 tdx_features0 = sysinfo->features.tdx_features0;

	if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
		pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
		return -EINVAL;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of TDMR to find the target TDMR
	 * at a given index in the TDMR list.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}
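/*
 * Layout sketch (sizes are hypothetical, the real values come from the
 * TDX module's metadata): with max_reserved_per_tdmr == 16,
 * tdmr_size_single() yields ALIGN(sizeof(struct tdmr_info) +
 * 16 * sizeof(struct tdmr_reserved_area), TDMR_INFO_ALIGNMENT) bytes per
 * entry, and the list is one flat physically contiguous array in which
 * entry @idx starts at tdmrs + idx * tdmr_sz, as computed by
 * tdmr_entry() below.
 */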
/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically only consumes less than 10 of those.  This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}
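/*
 * Worked example for fill_out_tdmrs() above (hypothetical addresses): a
 * TDX memory block spanning [0x80200000, 0x140000000) becomes a single
 * TDMR [0x80000000, 0x140000000) after the 1GB align-down/align-up.  The
 * [0x80000000, 0x80200000) head contains no TDX memory and is later
 * carved out as a reserved area by tdmr_populate_rsvd_holes().
 */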
/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
				      u16 pamt_entry_size)
{
	unsigned long pamt_sz, nr_pamt_entries;

	switch (pgsz) {
	case TDX_PS_4K:
		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
		break;
	case TDX_PS_2M:
		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
		break;
	case TDX_PS_1G:
		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);

	return pamt_sz;
}

/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
	struct tdx_memblock *tmb;

	/*
	 * A TDMR must cover at least part of one TMB.  That TMB will end
	 * after the TDMR begins.  But, that TMB may have started before
	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
			return tmb->nid;
	}

	/*
	 * Fall back to allocating the TDMR's metadata from node 0 when
	 * no TDX memory block can be found.  This should never happen
	 * since TDMRs originate from TDX memory blocks.
	 */
	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
			tdmr->base, tdmr_end(tdmr));
	return 0;
}
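/*
 * Rough PAMT sizing example (the 16-byte entry size is illustrative; the
 * real per-page-size entry sizes come from the TDX module's metadata):
 * a 1GB TDMR needs (1GB / 4KB) * 16 = 4MB of 4K PAMT, (1GB / 2MB) * 16 =
 * 8KB of 2M PAMT and one 16-byte 1G PAMT entry rounded up to 4KB, i.e.
 * roughly 0.4% of the covered range, all computed by tdmr_get_pamt_sz()
 * above and allocated in one chunk below.
 */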
/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
			    struct list_head *tmb_list,
			    u16 pamt_entry_size[])
{
	unsigned long pamt_base[TDX_PS_NR];
	unsigned long pamt_size[TDX_PS_NR];
	unsigned long tdmr_pamt_base;
	unsigned long tdmr_pamt_size;
	struct page *pamt;
	int pgsz, nid;

	nid = tdmr_get_nid(tdmr, tmb_list);

	/*
	 * Calculate the PAMT size for each TDX supported page size
	 * and the total PAMT size.
	 */
	tdmr_pamt_size = 0;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
					pamt_entry_size[pgsz]);
		tdmr_pamt_size += pamt_size[pgsz];
	}

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
	 * in overlapped TDMRs.
	 */
	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
			nid, &node_online_map);
	if (!pamt)
		return -ENOMEM;

	/*
	 * Break the contiguous allocation back up into the
	 * individual PAMTs for each page size.
	 */
	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_base[pgsz] = tdmr_pamt_base;
		tdmr_pamt_base += pamt_size[pgsz];
	}

	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

	return 0;
}

static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	unsigned long pamt_bs, pamt_sz;

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
	 * should always point to the beginning of that allocation.
	 */
	pamt_bs = tdmr->pamt_4k_base;
	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;

	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));

	*pamt_base = pamt_bs;
	*pamt_size = pamt_sz;
}

static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	pamt_func(pamt_base, pamt_size);
}

static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
{
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, free_pamt);
}

static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}

/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
				 struct list_head *tmb_list,
				 u16 pamt_entry_size[])
{
	int i, ret = 0;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
				pamt_entry_size);
		if (ret)
			goto err;
	}

	return 0;
err:
	tdmrs_free_pamt_all(tdmr_list);
	return ret;
}

/*
 * Convert TDX private pages back to normal by using MOVDIR64B to clear
 * these pages.  Typically, any write to the page will convert it from
 * TDX private back to normal kernel memory.  Systems with the
 * X86_BUG_TDX_PW_MCE erratum need to do the conversion explicitly via
 * MOVDIR64B.
 */
static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
		return;

	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses WC protocol.  Use memory barrier to
	 * make sure any later user of these pages sees the
	 * updated data.
	 */
	mb();
}

void tdx_quirk_reset_page(struct page *page)
{
	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);

static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
}

static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
}

static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
			      u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}
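/*
 * Worked example for tdmr_populate_rsvd_holes() above (hypothetical
 * layout): with TDX memory blocks [1GB, 2GB) and [3GB, 4GB) covered by a
 * single TDMR [1GB, 4GB), the walk leaves prev_end at 2GB when it reaches
 * the second block and inserts one reserved area covering the [2GB, 3GB)
 * hole; no trailing reserved area is needed since the TDMR ends exactly
 * at 4GB.
 */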
/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	int i, ret;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
		unsigned long pamt_base, pamt_size, pamt_end;

		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
		/* Each TDMR must already have PAMT allocated */
		WARN_ON_ONCE(!pamt_size || !pamt_base);

		pamt_end = pamt_base + pamt_size;
		/* Skip PAMTs outside of the given TDMR */
		if ((pamt_end <= tdmr->base) ||
				(pamt_base >= tdmr_end(tdmr)))
			continue;

		/* Only mark the part within the TDMR as reserved */
		if (pamt_base < tdmr->base)
			pamt_base = tdmr->base;
		if (pamt_end > tdmr_end(tdmr))
			pamt_end = tdmr_end(tdmr);

		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
				pamt_end - pamt_base,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

	if (r1->offset + r1->size <= r2->offset)
		return -1;
	if (r1->offset >= r2->offset + r2->size)
		return 1;

	/* Reserved areas cannot overlap.  The caller must guarantee. */
	WARN_ON_ONCE(1);
	return -1;
}

/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
				    struct list_head *tmb_list,
				    struct tdmr_info_list *tdmr_list,
				    u16 max_reserved_per_tdmr)
{
	int ret, rsvd_idx = 0;

	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	/* TDX requires reserved areas listed in address ascending order */
	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
			rsvd_area_cmp_func, NULL);

	return 0;
}

/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
					 struct list_head *tmb_list,
					 u16 max_reserved_per_tdmr)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
				tmb_list, tdmr_list, max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @sysinfo_tdmr.
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	u16 pamt_entry_size[TDX_PS_NR] = {
		sysinfo_tdmr->pamt_4k_entry_size,
		sysinfo_tdmr->pamt_2m_entry_size,
		sysinfo_tdmr->pamt_1g_entry_size,
	};
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			sysinfo_tdmr->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has certain
	 * alignment requirements.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has no
 * online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

static int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

static int init_tdx_module(void)
{
	int ret;

	ret = get_tdx_sys_info(&tdx_sysinfo);
	if (ret)
		return ret;

	/* Check whether the kernel can support this module */
	ret = check_features(&tdx_sysinfo);
	if (ret)
		return ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Config the key of global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

out_put_tdxmem:
	/*
	 * @tdx_memlist is written here and read at memory hotplug time.
	 * Lock out memory hotplug code while building it.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of PAMTs may already have been initialized by the
	 * TDX module.  Flush cache before returning PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

static int __tdx_enable(void)
{
	int ret;

	ret = init_tdx_module();
	if (ret) {
		pr_err("module initialization failed (%d)\n", ret);
		tdx_module_status = TDX_MODULE_ERROR;
		return ret;
	}

	pr_info("module initialized\n");
	tdx_module_status = TDX_MODULE_INITIALIZED;

	return 0;
}

/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * This function requires there's at least one online cpu for each CPU
 * package to succeed.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_cpus_held();

	mutex_lock(&tdx_module_lock);

	switch (tdx_module_status) {
	case TDX_MODULE_UNINITIALIZED:
		ret = __tdx_enable();
		break;
	case TDX_MODULE_INITIALIZED:
		/* Already initialized, great, tell the caller. */
		ret = 0;
		break;
	default:
		/* Failed to initialize in the previous attempts */
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&tdx_module_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);

static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);

		if (phys >= base && phys < (base + size))
			return true;
	}

	return false;
}

/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from normal hardware ones.
 * Just print an additional message to show that such an #MC may be
 * the result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:	Number of MKTME KeyIDs.
	 *   Bit [63:32]:	Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * on granularity of 'struct memory_block', and the hotpluggable
	 * memory region (one memblock) must be multiple of memory_block.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
	 */
	if (list_empty(&tdx_memlist))
		return NOTIFY_OK;

	/*
	 * The TDX memory configuration is static and can not be
	 * changed.  Reject onlining any memory which is outside of
	 * the static configuration whether it supports TDX or not.
	 */
	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
		return NOTIFY_OK;

	return NOTIFY_BAD;
}

static struct notifier_block tdx_memory_nb = {
	.notifier_call = tdx_memory_notifier,
};

static void __init check_tdx_erratum(void)
{
	/*
	 * These CPUs have an erratum.  A partial write from non-TD
	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
	 * private memory poisons that memory, and a subsequent read of
	 * that memory triggers #MC.
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_SAPPHIRERAPIDS_X:
	case INTEL_EMERALDRAPIDS_X:
		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
	}
}
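/*
 * KeyID partitioning example (illustrative numbers): if the MSR reports
 * 63 MKTME KeyIDs and 64 TDX private KeyIDs, the TDX KeyID range is
 * [64, 128).  tdx_init() below then takes KeyID 64 as the global KeyID
 * for the TDX module's metadata and leaves [65, 128) for TDX guests.
 */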
void __init tdx_init(void)
{
	u32 tdx_keyid_start, nr_tdx_keyids;
	int err;

	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
	if (err)
		return;

	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);

	/*
	 * The TDX module itself requires one 'global KeyID' to protect
	 * its metadata.  If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, thus there's no point in enabling
	 * TDX at all.
	 */
	if (nr_tdx_keyids < 2) {
		pr_err("initialization failed: too few private KeyIDs available.\n");
		return;
	}

	/*
	 * At this point, hibernation_available() indicates whether or
	 * not hibernation support has been permanently disabled.
	 */
	if (hibernation_available()) {
		pr_err("initialization failed: Hibernation support is enabled\n");
		return;
	}

	err = register_memory_notifier(&tdx_memory_nb);
	if (err) {
		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
				err);
		return;
	}

#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
	acpi_suspend_lowlevel = NULL;
#endif

	/*
	 * Just use the first TDX KeyID as the 'global KeyID' and
	 * leave the rest for TDX guests.
	 */
	tdx_global_keyid = tdx_keyid_start;
	tdx_guest_keyid_start = tdx_keyid_start + 1;
	tdx_nr_guest_keyids = nr_tdx_keyids - 1;

	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

	check_tdx_erratum();
}

const struct tdx_sys_info *tdx_get_sysinfo(void)
{
	const struct tdx_sys_info *p = NULL;

	/* Make sure all fields in @tdx_sysinfo have been populated */
	mutex_lock(&tdx_module_lock);
	if (tdx_module_status == TDX_MODULE_INITIALIZED)
		p = (const struct tdx_sys_info *)&tdx_sysinfo;
	mutex_unlock(&tdx_module_lock);

	return p;
}
EXPORT_SYMBOL_GPL(tdx_get_sysinfo);

u32 tdx_get_nr_guest_keyids(void)
{
	return tdx_nr_guest_keyids;
}
EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);

int tdx_guest_keyid_alloc(void)
{
	return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
			tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
			GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);

void tdx_guest_keyid_free(unsigned int keyid)
{
	ida_free(&tdx_guest_keyid_pool, keyid);
}
EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);

static inline u64 tdx_tdr_pa(struct tdx_td *td)
{
	return page_to_phys(td->tdr_page);
}

/*
 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
 * a CLFLUSH of pages is required before handing them to the TDX module.
 * Be conservative and make the code simpler by doing the CLFLUSH
 * unconditionally.
 */
static void tdx_clflush_page(struct page *page)
{
	clflush_cache_range(page_to_virt(page), PAGE_SIZE);
}

noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
{
	args->rcx = td->tdvpr_pa;

	return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
}
EXPORT_SYMBOL_GPL(tdh_vp_enter);

u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
{
	struct tdx_module_args args = {
		.rcx = page_to_phys(tdcs_page),
		.rdx = tdx_tdr_pa(td),
	};

	tdx_clflush_page(tdcs_page);
	return seamcall(TDH_MNG_ADDCX, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_addcx);

u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
{
	struct tdx_module_args args = {
		.rcx = gpa,
		.rdx = tdx_tdr_pa(td),
		.r8 = page_to_phys(page),
		.r9 = page_to_phys(source),
	};
	u64 ret;

	tdx_clflush_page(page);
	ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);

	*ext_err1 = args.rcx;
	*ext_err2 = args.rdx;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mem_page_add);

u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
{
	struct tdx_module_args args = {
		.rcx = gpa | level,
		.rdx = tdx_tdr_pa(td),
		.r8 = page_to_phys(page),
	};
	u64 ret;

	tdx_clflush_page(page);
	ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);

	*ext_err1 = args.rcx;
	*ext_err2 = args.rdx;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mem_sept_add);

u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
{
	struct tdx_module_args args = {
		.rcx = page_to_phys(tdcx_page),
		.rdx = vp->tdvpr_pa,
	};

	tdx_clflush_page(tdcx_page);
	return seamcall(TDH_VP_ADDCX, &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_addcx);

u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
{
	struct tdx_module_args args = {
		.rcx = gpa | level,
		.rdx = tdx_tdr_pa(td),
		.r8 = page_to_phys(page),
	};
	u64 ret;

	tdx_clflush_page(page);
	ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);

	*ext_err1 = args.rcx;
	*ext_err2 = args.rdx;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mem_page_aug);

u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
{
	struct tdx_module_args args = {
		.rcx = gpa | level,
		.rdx = tdx_tdr_pa(td),
	};
	u64 ret;

	ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);

	*ext_err1 = args.rcx;
	*ext_err2 = args.rdx;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mem_range_block);

u64 tdh_mng_key_config(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MNG_KEY_CONFIG, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_key_config);

u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
		.rdx = hkid,
	};

	tdx_clflush_page(td->tdr_page);
	return seamcall(TDH_MNG_CREATE, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_create);

u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = tdx_tdr_pa(td),
	};

	tdx_clflush_page(vp->tdvpr_page);
	return seamcall(TDH_VP_CREATE, &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_create);

u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
		.rdx = field,
	};
	u64 ret;

	ret = seamcall_ret(TDH_MNG_RD, &args);

	/* R8: Content of the field, or 0 in case of error. */
	*data = args.r8;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mng_rd);

u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
{
	struct tdx_module_args args = {
		.rcx = gpa,
		.rdx = tdx_tdr_pa(td),
	};
	u64 ret;

	ret = seamcall_ret(TDH_MR_EXTEND, &args);

	*ext_err1 = args.rcx;
	*ext_err2 = args.rdx;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mr_extend);

u64 tdh_mr_finalize(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MR_FINALIZE, &args);
}
EXPORT_SYMBOL_GPL(tdh_mr_finalize);

u64 tdh_vp_flush(struct tdx_vp *vp)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
	};

	return seamcall(TDH_VP_FLUSH, &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_flush);

u64 tdh_mng_vpflushdone(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);

u64 tdh_mng_key_freeid(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MNG_KEY_FREEID, &args);
}
EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);

u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
		.rdx = td_params,
	};
	u64 ret;

	ret = seamcall_ret(TDH_MNG_INIT, &args);

	*extended_err = args.rcx;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_mng_init);

u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = field,
	};
	u64 ret;

	ret = seamcall_ret(TDH_VP_RD, &args);

	/* R8: Content of the field, or 0 in case of error. */
	*data = args.r8;

	return ret;
}
EXPORT_SYMBOL_GPL(tdh_vp_rd);

u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = field,
		.r8 = data,
		.r9 = mask,
	};

	return seamcall(TDH_VP_WR, &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_wr);

u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = initial_rcx,
		.r8 = x2apicid,
	};

	/* apicid requires version == 1. */
	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
}
EXPORT_SYMBOL_GPL(tdh_vp_init);

/*
 * TDX ABI defines output operands as PT, OWNER and SIZE.  These are
 * TDX-defined formats.  So despite the names, they must be interpreted
 * specially as described by the spec.  Return them only for error
 * reporting purposes.
1796 */ 1797 u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) 1798 { 1799 struct tdx_module_args args = { 1800 .rcx = page_to_phys(page), 1801 }; 1802 u64 ret; 1803 1804 ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); 1805 1806 *tdx_pt = args.rcx; 1807 *tdx_owner = args.rdx; 1808 *tdx_size = args.r8; 1809 1810 return ret; 1811 } 1812 EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); 1813 1814 u64 tdh_mem_track(struct tdx_td *td) 1815 { 1816 struct tdx_module_args args = { 1817 .rcx = tdx_tdr_pa(td), 1818 }; 1819 1820 return seamcall(TDH_MEM_TRACK, &args); 1821 } 1822 EXPORT_SYMBOL_GPL(tdh_mem_track); 1823 1824 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) 1825 { 1826 struct tdx_module_args args = { 1827 .rcx = gpa | level, 1828 .rdx = tdx_tdr_pa(td), 1829 }; 1830 u64 ret; 1831 1832 ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); 1833 1834 *ext_err1 = args.rcx; 1835 *ext_err2 = args.rdx; 1836 1837 return ret; 1838 } 1839 EXPORT_SYMBOL_GPL(tdh_mem_page_remove); 1840 1841 u64 tdh_phymem_cache_wb(bool resume) 1842 { 1843 struct tdx_module_args args = { 1844 .rcx = resume ? 1 : 0, 1845 }; 1846 1847 return seamcall(TDH_PHYMEM_CACHE_WB, &args); 1848 } 1849 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); 1850 1851 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) 1852 { 1853 struct tdx_module_args args = {}; 1854 1855 args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); 1856 1857 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1858 } 1859 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); 1860 1861 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) 1862 { 1863 struct tdx_module_args args = {}; 1864 1865 args.rcx = mk_keyed_paddr(hkid, page); 1866 1867 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1868 } 1869 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); 1870 1871 #ifdef CONFIG_KEXEC_CORE 1872 void tdx_cpu_flush_cache_for_kexec(void) 1873 { 1874 lockdep_assert_preemption_disabled(); 1875 1876 if (!this_cpu_read(cache_state_incoherent)) 1877 return; 1878 1879 /* 1880 * Private memory cachelines need to be clean at the time of 1881 * kexec. Write them back now, as the caller promises that 1882 * there should be no more SEAMCALLs on this CPU. 1883 */ 1884 wbinvd(); 1885 this_cpu_write(cache_state_incoherent, false); 1886 } 1887 EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec); 1888 #endif 1889