// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <linux/idr.h>
#include <linux/kvm_types.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_IDA(tdx_guest_keyid_pool);

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

static struct tdx_sys_info tdx_sysinfo;

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
	       args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
	       args->r9, args->r10, args->r11);
}

static __always_inline int sc_retry_prerr(sc_func_t func,
					  sc_err_func_t err_func,
					  u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args)						\
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args)					\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
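
/*
 * Summary of the error translation done by sc_retry_prerr() above.  The
 * #GP/#UD notes reflect the typical causes of those SEAMCALL faults:
 *
 *   TDX_SUCCESS                -> 0
 *   TDX_SEAMCALL_VMFAILINVALID -> -ENODEV     (TDX module not loaded)
 *   TDX_SEAMCALL_GP            -> -EOPNOTSUPP (TDX not enabled by the BIOS)
 *   TDX_SEAMCALL_UD            -> -EACCES     (CPU not in VMX operation)
 *   any other error            -> -EIO        (details printed via @err_func)
 */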

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail if the TDX module is not loaded.  Print a
	 * message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Need to do it first (if it hasn't been done)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable);
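
/*
 * Illustrative sketch (hypothetical caller code, not built here): how a
 * caller such as KVM might honor the "call via IPI" requirement above by
 * running tdx_cpu_enable() on every online CPU.  It assumes the caller
 * has already done VMXON on each CPU and holds the CPU hotplug read
 * lock; the helper names are made up for illustration.
 *
 *	static void example_enable_tdx_on_cpu(void *failed)
 *	{
 *		// Runs in IPI context with interrupts disabled
 *		if (tdx_cpu_enable())
 *			atomic_inc((atomic_t *)failed);
 *	}
 *
 *	static int example_enable_tdx_all_cpus(void)
 *	{
 *		atomic_t failed = ATOMIC_INIT(0);
 *
 *		on_each_cpu(example_enable_tdx_on_cpu, &failed, 1);
 *		return atomic_read(&failed) ? -EIO : 0;
 *	}
 */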

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip it manually to exclude it as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  The regions in
		 * memblock are already guaranteed to be in address
		 * ascending order and not to overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

#include "tdx_global_metadata.c"

static int check_features(struct tdx_sys_info *sysinfo)
{
	u64 tdx_features0 = sysinfo->features.tdx_features0;

	if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
		pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
		return -EINVAL;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of TDMR to find the target TDMR
	 * at a given index in the TDMR list.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}
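
/*
 * Worked example (illustrative numbers only): if the TDX module reports
 * max_reserved_per_tdmr = 16, and 'struct tdmr_info' is 64 bytes with
 * 16-byte 'struct tdmr_reserved_area' entries, tdmr_size_single() yields
 * 64 + 16 * 16 = 320 bytes, rounded up to TDMR_INFO_ALIGNMENT.  With
 * max_tdmrs = 64 and a 512-byte alignment, alloc_tdmr_list() would then
 * make one 64 * 512 = 32KB physically contiguous allocation.  The real
 * values come from the TDX module's global metadata.
 */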

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT			SZ_1G
#define TDMR_ALIGN_DOWN(_addr)		ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)		ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically only consumes less than 10 of those.  This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
447 */ 448 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, 449 u16 pamt_entry_size) 450 { 451 unsigned long pamt_sz, nr_pamt_entries; 452 453 switch (pgsz) { 454 case TDX_PS_4K: 455 nr_pamt_entries = tdmr->size >> PAGE_SHIFT; 456 break; 457 case TDX_PS_2M: 458 nr_pamt_entries = tdmr->size >> PMD_SHIFT; 459 break; 460 case TDX_PS_1G: 461 nr_pamt_entries = tdmr->size >> PUD_SHIFT; 462 break; 463 default: 464 WARN_ON_ONCE(1); 465 return 0; 466 } 467 468 pamt_sz = nr_pamt_entries * pamt_entry_size; 469 /* TDX requires PAMT size must be 4K aligned */ 470 pamt_sz = ALIGN(pamt_sz, PAGE_SIZE); 471 472 return pamt_sz; 473 } 474 475 /* 476 * Locate a NUMA node which should hold the allocation of the @tdmr 477 * PAMT. This node will have some memory covered by the TDMR. The 478 * relative amount of memory covered is not considered. 479 */ 480 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) 481 { 482 struct tdx_memblock *tmb; 483 484 /* 485 * A TDMR must cover at least part of one TMB. That TMB will end 486 * after the TDMR begins. But, that TMB may have started before 487 * the TDMR. Find the next 'tmb' that _ends_ after this TDMR 488 * begins. Ignore 'tmb' start addresses. They are irrelevant. 489 */ 490 list_for_each_entry(tmb, tmb_list, list) { 491 if (tmb->end_pfn > PHYS_PFN(tdmr->base)) 492 return tmb->nid; 493 } 494 495 /* 496 * Fall back to allocating the TDMR's metadata from node 0 when 497 * no TDX memory block can be found. This should never happen 498 * since TDMRs originate from TDX memory blocks. 499 */ 500 pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n", 501 tdmr->base, tdmr_end(tdmr)); 502 return 0; 503 } 504 505 /* 506 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list 507 * within @tdmr, and set up PAMTs for @tdmr. 508 */ 509 static int tdmr_set_up_pamt(struct tdmr_info *tdmr, 510 struct list_head *tmb_list, 511 u16 pamt_entry_size[]) 512 { 513 unsigned long pamt_base[TDX_PS_NR]; 514 unsigned long pamt_size[TDX_PS_NR]; 515 unsigned long tdmr_pamt_base; 516 unsigned long tdmr_pamt_size; 517 struct page *pamt; 518 int pgsz, nid; 519 520 nid = tdmr_get_nid(tdmr, tmb_list); 521 522 /* 523 * Calculate the PAMT size for each TDX supported page size 524 * and the total PAMT size. 525 */ 526 tdmr_pamt_size = 0; 527 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { 528 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, 529 pamt_entry_size[pgsz]); 530 tdmr_pamt_size += pamt_size[pgsz]; 531 } 532 533 /* 534 * Allocate one chunk of physically contiguous memory for all 535 * PAMTs. This helps minimize the PAMT's use of reserved areas 536 * in overlapped TDMRs. 537 */ 538 pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, 539 nid, &node_online_map); 540 if (!pamt) 541 return -ENOMEM; 542 543 /* 544 * Break the contiguous allocation back up into the 545 * individual PAMTs for each page size. 
546 */ 547 tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; 548 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { 549 pamt_base[pgsz] = tdmr_pamt_base; 550 tdmr_pamt_base += pamt_size[pgsz]; 551 } 552 553 tdmr->pamt_4k_base = pamt_base[TDX_PS_4K]; 554 tdmr->pamt_4k_size = pamt_size[TDX_PS_4K]; 555 tdmr->pamt_2m_base = pamt_base[TDX_PS_2M]; 556 tdmr->pamt_2m_size = pamt_size[TDX_PS_2M]; 557 tdmr->pamt_1g_base = pamt_base[TDX_PS_1G]; 558 tdmr->pamt_1g_size = pamt_size[TDX_PS_1G]; 559 560 return 0; 561 } 562 563 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, 564 unsigned long *pamt_size) 565 { 566 unsigned long pamt_bs, pamt_sz; 567 568 /* 569 * The PAMT was allocated in one contiguous unit. The 4K PAMT 570 * should always point to the beginning of that allocation. 571 */ 572 pamt_bs = tdmr->pamt_4k_base; 573 pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size; 574 575 WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK)); 576 577 *pamt_base = pamt_bs; 578 *pamt_size = pamt_sz; 579 } 580 581 static void tdmr_do_pamt_func(struct tdmr_info *tdmr, 582 void (*pamt_func)(unsigned long base, unsigned long size)) 583 { 584 unsigned long pamt_base, pamt_size; 585 586 tdmr_get_pamt(tdmr, &pamt_base, &pamt_size); 587 588 /* Do nothing if PAMT hasn't been allocated for this TDMR */ 589 if (!pamt_size) 590 return; 591 592 if (WARN_ON_ONCE(!pamt_base)) 593 return; 594 595 pamt_func(pamt_base, pamt_size); 596 } 597 598 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size) 599 { 600 free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); 601 } 602 603 static void tdmr_free_pamt(struct tdmr_info *tdmr) 604 { 605 tdmr_do_pamt_func(tdmr, free_pamt); 606 } 607 608 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) 609 { 610 int i; 611 612 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) 613 tdmr_free_pamt(tdmr_entry(tdmr_list, i)); 614 } 615 616 /* Allocate and set up PAMTs for all TDMRs */ 617 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, 618 struct list_head *tmb_list, 619 u16 pamt_entry_size[]) 620 { 621 int i, ret = 0; 622 623 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 624 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list, 625 pamt_entry_size); 626 if (ret) 627 goto err; 628 } 629 630 return 0; 631 err: 632 tdmrs_free_pamt_all(tdmr_list); 633 return ret; 634 } 635 636 /* 637 * Convert TDX private pages back to normal by using MOVDIR64B to clear these 638 * pages. Typically, any write to the page will convert it from TDX private back 639 * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to 640 * do the conversion explicitly via MOVDIR64B. 641 */ 642 static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size) 643 { 644 const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); 645 unsigned long phys, end; 646 647 if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) 648 return; 649 650 end = base + size; 651 for (phys = base; phys < end; phys += 64) 652 movdir64b(__va(phys), zero_page); 653 654 /* 655 * MOVDIR64B uses WC protocol. Use memory barrier to 656 * make sure any later user of these pages sees the 657 * updated data. 
658 */ 659 mb(); 660 } 661 662 void tdx_quirk_reset_page(struct page *page) 663 { 664 tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); 665 } 666 EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page); 667 668 static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) 669 { 670 tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr); 671 } 672 673 static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) 674 { 675 int i; 676 677 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) 678 tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i)); 679 } 680 681 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) 682 { 683 unsigned long pamt_size = 0; 684 int i; 685 686 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 687 unsigned long base, size; 688 689 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); 690 pamt_size += size; 691 } 692 693 return pamt_size / 1024; 694 } 695 696 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, 697 u64 size, u16 max_reserved_per_tdmr) 698 { 699 struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas; 700 int idx = *p_idx; 701 702 /* Reserved area must be 4K aligned in offset and size */ 703 if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK)) 704 return -EINVAL; 705 706 if (idx >= max_reserved_per_tdmr) { 707 pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n", 708 tdmr->base, tdmr_end(tdmr)); 709 return -ENOSPC; 710 } 711 712 /* 713 * Consume one reserved area per call. Make no effort to 714 * optimize or reduce the number of reserved areas which are 715 * consumed by contiguous reserved areas, for instance. 716 */ 717 rsvd_areas[idx].offset = addr - tdmr->base; 718 rsvd_areas[idx].size = size; 719 720 *p_idx = idx + 1; 721 722 return 0; 723 } 724 725 /* 726 * Go through @tmb_list to find holes between memory areas. If any of 727 * those holes fall within @tdmr, set up a TDMR reserved area to cover 728 * the hole. 729 */ 730 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, 731 struct tdmr_info *tdmr, 732 int *rsvd_idx, 733 u16 max_reserved_per_tdmr) 734 { 735 struct tdx_memblock *tmb; 736 u64 prev_end; 737 int ret; 738 739 /* 740 * Start looking for reserved blocks at the 741 * beginning of the TDMR. 742 */ 743 prev_end = tdmr->base; 744 list_for_each_entry(tmb, tmb_list, list) { 745 u64 start, end; 746 747 start = PFN_PHYS(tmb->start_pfn); 748 end = PFN_PHYS(tmb->end_pfn); 749 750 /* Break if this region is after the TDMR */ 751 if (start >= tdmr_end(tdmr)) 752 break; 753 754 /* Exclude regions before this TDMR */ 755 if (end < tdmr->base) 756 continue; 757 758 /* 759 * Skip over memory areas that 760 * have already been dealt with. 761 */ 762 if (start <= prev_end) { 763 prev_end = end; 764 continue; 765 } 766 767 /* Add the hole before this region */ 768 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, 769 start - prev_end, 770 max_reserved_per_tdmr); 771 if (ret) 772 return ret; 773 774 prev_end = end; 775 } 776 777 /* Add the hole after the last region if it exists. */ 778 if (prev_end < tdmr_end(tdmr)) { 779 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, 780 tdmr_end(tdmr) - prev_end, 781 max_reserved_per_tdmr); 782 if (ret) 783 return ret; 784 } 785 786 return 0; 787 } 788 789 /* 790 * Go through @tdmr_list to find all PAMTs. If any of those PAMTs 791 * overlaps with @tdmr, set up a TDMR reserved area to cover the 792 * overlapping part. 
793 */ 794 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, 795 struct tdmr_info *tdmr, 796 int *rsvd_idx, 797 u16 max_reserved_per_tdmr) 798 { 799 int i, ret; 800 801 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 802 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i); 803 unsigned long pamt_base, pamt_size, pamt_end; 804 805 tdmr_get_pamt(tmp, &pamt_base, &pamt_size); 806 /* Each TDMR must already have PAMT allocated */ 807 WARN_ON_ONCE(!pamt_size || !pamt_base); 808 809 pamt_end = pamt_base + pamt_size; 810 /* Skip PAMTs outside of the given TDMR */ 811 if ((pamt_end <= tdmr->base) || 812 (pamt_base >= tdmr_end(tdmr))) 813 continue; 814 815 /* Only mark the part within the TDMR as reserved */ 816 if (pamt_base < tdmr->base) 817 pamt_base = tdmr->base; 818 if (pamt_end > tdmr_end(tdmr)) 819 pamt_end = tdmr_end(tdmr); 820 821 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base, 822 pamt_end - pamt_base, 823 max_reserved_per_tdmr); 824 if (ret) 825 return ret; 826 } 827 828 return 0; 829 } 830 831 /* Compare function called by sort() for TDMR reserved areas */ 832 static int rsvd_area_cmp_func(const void *a, const void *b) 833 { 834 struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; 835 struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; 836 837 if (r1->offset + r1->size <= r2->offset) 838 return -1; 839 if (r1->offset >= r2->offset + r2->size) 840 return 1; 841 842 /* Reserved areas cannot overlap. The caller must guarantee. */ 843 WARN_ON_ONCE(1); 844 return -1; 845 } 846 847 /* 848 * Populate reserved areas for the given @tdmr, including memory holes 849 * (via @tmb_list) and PAMTs (via @tdmr_list). 850 */ 851 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, 852 struct list_head *tmb_list, 853 struct tdmr_info_list *tdmr_list, 854 u16 max_reserved_per_tdmr) 855 { 856 int ret, rsvd_idx = 0; 857 858 ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx, 859 max_reserved_per_tdmr); 860 if (ret) 861 return ret; 862 863 ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx, 864 max_reserved_per_tdmr); 865 if (ret) 866 return ret; 867 868 /* TDX requires reserved areas listed in address ascending order */ 869 sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area), 870 rsvd_area_cmp_func, NULL); 871 872 return 0; 873 } 874 875 /* 876 * Populate reserved areas for all TDMRs in @tdmr_list, including memory 877 * holes (via @tmb_list) and PAMTs. 878 */ 879 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, 880 struct list_head *tmb_list, 881 u16 max_reserved_per_tdmr) 882 { 883 int i; 884 885 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 886 int ret; 887 888 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i), 889 tmb_list, tdmr_list, max_reserved_per_tdmr); 890 if (ret) 891 return ret; 892 } 893 894 return 0; 895 } 896 897 /* 898 * Construct a list of TDMRs on the preallocated space in @tdmr_list 899 * to cover all TDX memory regions in @tmb_list based on the TDX module 900 * TDMR global information in @sysinfo_tdmr. 
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	u16 pamt_entry_size[TDX_PS_NR] = {
		sysinfo_tdmr->pamt_4k_entry_size,
		sysinfo_tdmr->pamt_2m_entry_size,
		sysinfo_tdmr->pamt_1g_entry_size,
	};
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			sysinfo_tdmr->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has certain
	 * alignment requirement.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * It will fail if any package in the system has no online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
1019 */ 1020 ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true); 1021 if (ret) 1022 break; 1023 } 1024 1025 free_cpumask_var(packages); 1026 return ret; 1027 } 1028 1029 static int init_tdmr(struct tdmr_info *tdmr) 1030 { 1031 u64 next; 1032 1033 /* 1034 * Initializing a TDMR can be time consuming. To avoid long 1035 * SEAMCALLs, the TDX module may only initialize a part of the 1036 * TDMR in each call. 1037 */ 1038 do { 1039 struct tdx_module_args args = { 1040 .rcx = tdmr->base, 1041 }; 1042 int ret; 1043 1044 ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args); 1045 if (ret) 1046 return ret; 1047 /* 1048 * RDX contains 'next-to-initialize' address if 1049 * TDH.SYS.TDMR.INIT did not fully complete and 1050 * should be retried. 1051 */ 1052 next = args.rdx; 1053 cond_resched(); 1054 /* Keep making SEAMCALLs until the TDMR is done */ 1055 } while (next < tdmr->base + tdmr->size); 1056 1057 return 0; 1058 } 1059 1060 static int init_tdmrs(struct tdmr_info_list *tdmr_list) 1061 { 1062 int i; 1063 1064 /* 1065 * This operation is costly. It can be parallelized, 1066 * but keep it simple for now. 1067 */ 1068 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 1069 int ret; 1070 1071 ret = init_tdmr(tdmr_entry(tdmr_list, i)); 1072 if (ret) 1073 return ret; 1074 } 1075 1076 return 0; 1077 } 1078 1079 static int init_tdx_module(void) 1080 { 1081 int ret; 1082 1083 ret = get_tdx_sys_info(&tdx_sysinfo); 1084 if (ret) 1085 return ret; 1086 1087 /* Check whether the kernel can support this module */ 1088 ret = check_features(&tdx_sysinfo); 1089 if (ret) 1090 return ret; 1091 1092 /* 1093 * To keep things simple, assume that all TDX-protected memory 1094 * will come from the page allocator. Make sure all pages in the 1095 * page allocator are TDX-usable memory. 1096 * 1097 * Build the list of "TDX-usable" memory regions which cover all 1098 * pages in the page allocator to guarantee that. Do it while 1099 * holding mem_hotplug_lock read-lock as the memory hotplug code 1100 * path reads the @tdx_memlist to reject any new memory. 1101 */ 1102 get_online_mems(); 1103 1104 ret = build_tdx_memlist(&tdx_memlist); 1105 if (ret) 1106 goto out_put_tdxmem; 1107 1108 /* Allocate enough space for constructing TDMRs */ 1109 ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr); 1110 if (ret) 1111 goto err_free_tdxmem; 1112 1113 /* Cover all TDX-usable memory regions in TDMRs */ 1114 ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr); 1115 if (ret) 1116 goto err_free_tdmrs; 1117 1118 /* Pass the TDMRs and the global KeyID to the TDX module */ 1119 ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid); 1120 if (ret) 1121 goto err_free_pamts; 1122 1123 /* Config the key of global KeyID on all packages */ 1124 ret = config_global_keyid(); 1125 if (ret) 1126 goto err_reset_pamts; 1127 1128 /* Initialize TDMRs to complete the TDX module initialization */ 1129 ret = init_tdmrs(&tdx_tdmr_list); 1130 if (ret) 1131 goto err_reset_pamts; 1132 1133 pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list)); 1134 1135 out_put_tdxmem: 1136 /* 1137 * @tdx_memlist is written here and read at memory hotplug time. 1138 * Lock out memory hotplug code while building it. 1139 */ 1140 put_online_mems(); 1141 return ret; 1142 1143 err_reset_pamts: 1144 /* 1145 * Part of PAMTs may already have been initialized by the 1146 * TDX module. Flush cache before returning PAMTs back 1147 * to the kernel. 
1148 */ 1149 wbinvd_on_all_cpus(); 1150 tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list); 1151 err_free_pamts: 1152 tdmrs_free_pamt_all(&tdx_tdmr_list); 1153 err_free_tdmrs: 1154 free_tdmr_list(&tdx_tdmr_list); 1155 err_free_tdxmem: 1156 free_tdx_memlist(&tdx_memlist); 1157 goto out_put_tdxmem; 1158 } 1159 1160 static int __tdx_enable(void) 1161 { 1162 int ret; 1163 1164 ret = init_tdx_module(); 1165 if (ret) { 1166 pr_err("module initialization failed (%d)\n", ret); 1167 tdx_module_status = TDX_MODULE_ERROR; 1168 return ret; 1169 } 1170 1171 pr_info("module initialized\n"); 1172 tdx_module_status = TDX_MODULE_INITIALIZED; 1173 1174 return 0; 1175 } 1176 1177 /** 1178 * tdx_enable - Enable TDX module to make it ready to run TDX guests 1179 * 1180 * This function assumes the caller has: 1) held read lock of CPU hotplug 1181 * lock to prevent any new cpu from becoming online; 2) done both VMXON 1182 * and tdx_cpu_enable() on all online cpus. 1183 * 1184 * This function requires there's at least one online cpu for each CPU 1185 * package to succeed. 1186 * 1187 * This function can be called in parallel by multiple callers. 1188 * 1189 * Return 0 if TDX is enabled successfully, otherwise error. 1190 */ 1191 int tdx_enable(void) 1192 { 1193 int ret; 1194 1195 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) 1196 return -ENODEV; 1197 1198 lockdep_assert_cpus_held(); 1199 1200 mutex_lock(&tdx_module_lock); 1201 1202 switch (tdx_module_status) { 1203 case TDX_MODULE_UNINITIALIZED: 1204 ret = __tdx_enable(); 1205 break; 1206 case TDX_MODULE_INITIALIZED: 1207 /* Already initialized, great, tell the caller. */ 1208 ret = 0; 1209 break; 1210 default: 1211 /* Failed to initialize in the previous attempts */ 1212 ret = -EINVAL; 1213 break; 1214 } 1215 1216 mutex_unlock(&tdx_module_lock); 1217 1218 return ret; 1219 } 1220 EXPORT_SYMBOL_FOR_KVM(tdx_enable); 1221 1222 static bool is_pamt_page(unsigned long phys) 1223 { 1224 struct tdmr_info_list *tdmr_list = &tdx_tdmr_list; 1225 int i; 1226 1227 /* Ensure that all remote 'tdmr_list' writes are visible: */ 1228 smp_rmb(); 1229 1230 /* 1231 * The TDX module is no longer returning TDX_SYS_NOT_READY and 1232 * is initialized. The 'tdmr_list' was initialized long ago 1233 * and is now read-only. 1234 */ 1235 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 1236 unsigned long base, size; 1237 1238 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); 1239 1240 if (phys >= base && phys < (base + size)) 1241 return true; 1242 } 1243 1244 return false; 1245 } 1246 1247 /* 1248 * Return whether the memory page at the given physical address is TDX 1249 * private memory or not. 1250 * 1251 * This can be imprecise for two known reasons: 1252 * 1. PAMTs are private memory and exist before the TDX module is 1253 * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively 1254 * short window that occurs once per boot. 1255 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the 1256 * page. However, the page can still cause #MC until it has been 1257 * fully converted to shared using 64-byte writes like MOVDIR64B. 1258 * Buggy hosts might still leave #MC-causing memory in place which 1259 * this function can not detect. 
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish an erratum-triggered #MC from a normal hardware one
 * by printing an additional message noting that the #MC may be a result
 * of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:  Number of MKTME KeyIDs.
	 *   Bit [63:32]: Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * on granularity of 'struct memory_block', and the hotpluggable
	 * memory region (one memblock) must be multiple of memory_block.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
1378 */ 1379 if (list_empty(&tdx_memlist)) 1380 return NOTIFY_OK; 1381 1382 /* 1383 * The TDX memory configuration is static and can not be 1384 * changed. Reject onlining any memory which is outside of 1385 * the static configuration whether it supports TDX or not. 1386 */ 1387 if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages)) 1388 return NOTIFY_OK; 1389 1390 return NOTIFY_BAD; 1391 } 1392 1393 static struct notifier_block tdx_memory_nb = { 1394 .notifier_call = tdx_memory_notifier, 1395 }; 1396 1397 static void __init check_tdx_erratum(void) 1398 { 1399 /* 1400 * These CPUs have an erratum. A partial write from non-TD 1401 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX 1402 * private memory poisons that memory, and a subsequent read of 1403 * that memory triggers #MC. 1404 */ 1405 switch (boot_cpu_data.x86_vfm) { 1406 case INTEL_SAPPHIRERAPIDS_X: 1407 case INTEL_EMERALDRAPIDS_X: 1408 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE); 1409 } 1410 } 1411 1412 void __init tdx_init(void) 1413 { 1414 u32 tdx_keyid_start, nr_tdx_keyids; 1415 int err; 1416 1417 err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids); 1418 if (err) 1419 return; 1420 1421 pr_info("BIOS enabled: private KeyID range [%u, %u)\n", 1422 tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids); 1423 1424 /* 1425 * The TDX module itself requires one 'global KeyID' to protect 1426 * its metadata. If there's only one TDX KeyID, there won't be 1427 * any left for TDX guests thus there's no point to enable TDX 1428 * at all. 1429 */ 1430 if (nr_tdx_keyids < 2) { 1431 pr_err("initialization failed: too few private KeyIDs available.\n"); 1432 return; 1433 } 1434 1435 /* 1436 * At this point, hibernation_available() indicates whether or 1437 * not hibernation support has been permanently disabled. 1438 */ 1439 if (hibernation_available()) { 1440 pr_err("initialization failed: Hibernation support is enabled\n"); 1441 return; 1442 } 1443 1444 err = register_memory_notifier(&tdx_memory_nb); 1445 if (err) { 1446 pr_err("initialization failed: register_memory_notifier() failed (%d)\n", 1447 err); 1448 return; 1449 } 1450 1451 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) 1452 pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n"); 1453 acpi_suspend_lowlevel = NULL; 1454 #endif 1455 1456 /* 1457 * Just use the first TDX KeyID as the 'global KeyID' and 1458 * leave the rest for TDX guests. 
1459 */ 1460 tdx_global_keyid = tdx_keyid_start; 1461 tdx_guest_keyid_start = tdx_keyid_start + 1; 1462 tdx_nr_guest_keyids = nr_tdx_keyids - 1; 1463 1464 setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM); 1465 1466 check_tdx_erratum(); 1467 } 1468 1469 const struct tdx_sys_info *tdx_get_sysinfo(void) 1470 { 1471 const struct tdx_sys_info *p = NULL; 1472 1473 /* Make sure all fields in @tdx_sysinfo have been populated */ 1474 mutex_lock(&tdx_module_lock); 1475 if (tdx_module_status == TDX_MODULE_INITIALIZED) 1476 p = (const struct tdx_sys_info *)&tdx_sysinfo; 1477 mutex_unlock(&tdx_module_lock); 1478 1479 return p; 1480 } 1481 EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); 1482 1483 u32 tdx_get_nr_guest_keyids(void) 1484 { 1485 return tdx_nr_guest_keyids; 1486 } 1487 EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids); 1488 1489 int tdx_guest_keyid_alloc(void) 1490 { 1491 return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, 1492 tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, 1493 GFP_KERNEL); 1494 } 1495 EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc); 1496 1497 void tdx_guest_keyid_free(unsigned int keyid) 1498 { 1499 ida_free(&tdx_guest_keyid_pool, keyid); 1500 } 1501 EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free); 1502 1503 static inline u64 tdx_tdr_pa(struct tdx_td *td) 1504 { 1505 return page_to_phys(td->tdr_page); 1506 } 1507 1508 /* 1509 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether 1510 * a CLFLUSH of pages is required before handing them to the TDX module. 1511 * Be conservative and make the code simpler by doing the CLFLUSH 1512 * unconditionally. 1513 */ 1514 static void tdx_clflush_page(struct page *page) 1515 { 1516 clflush_cache_range(page_to_virt(page), PAGE_SIZE); 1517 } 1518 1519 noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) 1520 { 1521 args->rcx = td->tdvpr_pa; 1522 1523 return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args); 1524 } 1525 EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter); 1526 1527 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) 1528 { 1529 struct tdx_module_args args = { 1530 .rcx = page_to_phys(tdcs_page), 1531 .rdx = tdx_tdr_pa(td), 1532 }; 1533 1534 tdx_clflush_page(tdcs_page); 1535 return seamcall(TDH_MNG_ADDCX, &args); 1536 } 1537 EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx); 1538 1539 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) 1540 { 1541 struct tdx_module_args args = { 1542 .rcx = gpa, 1543 .rdx = tdx_tdr_pa(td), 1544 .r8 = page_to_phys(page), 1545 .r9 = page_to_phys(source), 1546 }; 1547 u64 ret; 1548 1549 tdx_clflush_page(page); 1550 ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); 1551 1552 *ext_err1 = args.rcx; 1553 *ext_err2 = args.rdx; 1554 1555 return ret; 1556 } 1557 EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add); 1558 1559 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) 1560 { 1561 struct tdx_module_args args = { 1562 .rcx = gpa | level, 1563 .rdx = tdx_tdr_pa(td), 1564 .r8 = page_to_phys(page), 1565 }; 1566 u64 ret; 1567 1568 tdx_clflush_page(page); 1569 ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); 1570 1571 *ext_err1 = args.rcx; 1572 *ext_err2 = args.rdx; 1573 1574 return ret; 1575 } 1576 EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add); 1577 1578 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) 1579 { 1580 struct tdx_module_args args = { 1581 .rcx = page_to_phys(tdcx_page), 1582 .rdx = vp->tdvpr_pa, 1583 }; 1584 1585 tdx_clflush_page(tdcx_page); 1586 
return seamcall(TDH_VP_ADDCX, &args); 1587 } 1588 EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx); 1589 1590 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) 1591 { 1592 struct tdx_module_args args = { 1593 .rcx = gpa | level, 1594 .rdx = tdx_tdr_pa(td), 1595 .r8 = page_to_phys(page), 1596 }; 1597 u64 ret; 1598 1599 tdx_clflush_page(page); 1600 ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); 1601 1602 *ext_err1 = args.rcx; 1603 *ext_err2 = args.rdx; 1604 1605 return ret; 1606 } 1607 EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug); 1608 1609 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) 1610 { 1611 struct tdx_module_args args = { 1612 .rcx = gpa | level, 1613 .rdx = tdx_tdr_pa(td), 1614 }; 1615 u64 ret; 1616 1617 ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); 1618 1619 *ext_err1 = args.rcx; 1620 *ext_err2 = args.rdx; 1621 1622 return ret; 1623 } 1624 EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block); 1625 1626 u64 tdh_mng_key_config(struct tdx_td *td) 1627 { 1628 struct tdx_module_args args = { 1629 .rcx = tdx_tdr_pa(td), 1630 }; 1631 1632 return seamcall(TDH_MNG_KEY_CONFIG, &args); 1633 } 1634 EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config); 1635 1636 u64 tdh_mng_create(struct tdx_td *td, u16 hkid) 1637 { 1638 struct tdx_module_args args = { 1639 .rcx = tdx_tdr_pa(td), 1640 .rdx = hkid, 1641 }; 1642 1643 tdx_clflush_page(td->tdr_page); 1644 return seamcall(TDH_MNG_CREATE, &args); 1645 } 1646 EXPORT_SYMBOL_FOR_KVM(tdh_mng_create); 1647 1648 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) 1649 { 1650 struct tdx_module_args args = { 1651 .rcx = vp->tdvpr_pa, 1652 .rdx = tdx_tdr_pa(td), 1653 }; 1654 1655 tdx_clflush_page(vp->tdvpr_page); 1656 return seamcall(TDH_VP_CREATE, &args); 1657 } 1658 EXPORT_SYMBOL_FOR_KVM(tdh_vp_create); 1659 1660 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) 1661 { 1662 struct tdx_module_args args = { 1663 .rcx = tdx_tdr_pa(td), 1664 .rdx = field, 1665 }; 1666 u64 ret; 1667 1668 ret = seamcall_ret(TDH_MNG_RD, &args); 1669 1670 /* R8: Content of the field, or 0 in case of error. 
 */
	*data = args.r8;

	return ret;
}
EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd);

u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
{
	struct tdx_module_args args = {
		.rcx = gpa,
		.rdx = tdx_tdr_pa(td),
	};
	u64 ret;

	ret = seamcall_ret(TDH_MR_EXTEND, &args);

	*ext_err1 = args.rcx;
	*ext_err2 = args.rdx;

	return ret;
}
EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend);

u64 tdh_mr_finalize(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MR_FINALIZE, &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize);

u64 tdh_vp_flush(struct tdx_vp *vp)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
	};

	return seamcall(TDH_VP_FLUSH, &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush);

u64 tdh_mng_vpflushdone(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone);

u64 tdh_mng_key_freeid(struct tdx_td *td)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
	};

	return seamcall(TDH_MNG_KEY_FREEID, &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid);

u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
{
	struct tdx_module_args args = {
		.rcx = tdx_tdr_pa(td),
		.rdx = td_params,
	};
	u64 ret;

	ret = seamcall_ret(TDH_MNG_INIT, &args);

	*extended_err = args.rcx;

	return ret;
}
EXPORT_SYMBOL_FOR_KVM(tdh_mng_init);

u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = field,
	};
	u64 ret;

	ret = seamcall_ret(TDH_VP_RD, &args);

	/* R8: Content of the field, or 0 in case of error. */
	*data = args.r8;

	return ret;
}
EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd);

u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = field,
		.r8 = data,
		.r9 = mask,
	};

	return seamcall(TDH_VP_WR, &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr);

u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
{
	struct tdx_module_args args = {
		.rcx = vp->tdvpr_pa,
		.rdx = initial_rcx,
		.r8 = x2apicid,
	};

	/* apicid requires version == 1. */
	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
}
EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);

/*
 * The TDX ABI defines the output operands as PT, OWNER and SIZE.  These
 * are TDX-defined formats, so despite the names they must be interpreted
 * specially as described by the spec.  Return them only for error
 * reporting purposes.
1797 */ 1798 u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) 1799 { 1800 struct tdx_module_args args = { 1801 .rcx = page_to_phys(page), 1802 }; 1803 u64 ret; 1804 1805 ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); 1806 1807 *tdx_pt = args.rcx; 1808 *tdx_owner = args.rdx; 1809 *tdx_size = args.r8; 1810 1811 return ret; 1812 } 1813 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim); 1814 1815 u64 tdh_mem_track(struct tdx_td *td) 1816 { 1817 struct tdx_module_args args = { 1818 .rcx = tdx_tdr_pa(td), 1819 }; 1820 1821 return seamcall(TDH_MEM_TRACK, &args); 1822 } 1823 EXPORT_SYMBOL_FOR_KVM(tdh_mem_track); 1824 1825 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) 1826 { 1827 struct tdx_module_args args = { 1828 .rcx = gpa | level, 1829 .rdx = tdx_tdr_pa(td), 1830 }; 1831 u64 ret; 1832 1833 ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); 1834 1835 *ext_err1 = args.rcx; 1836 *ext_err2 = args.rdx; 1837 1838 return ret; 1839 } 1840 EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove); 1841 1842 u64 tdh_phymem_cache_wb(bool resume) 1843 { 1844 struct tdx_module_args args = { 1845 .rcx = resume ? 1 : 0, 1846 }; 1847 1848 return seamcall(TDH_PHYMEM_CACHE_WB, &args); 1849 } 1850 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb); 1851 1852 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) 1853 { 1854 struct tdx_module_args args = {}; 1855 1856 args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); 1857 1858 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1859 } 1860 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr); 1861 1862 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) 1863 { 1864 struct tdx_module_args args = {}; 1865 1866 args.rcx = mk_keyed_paddr(hkid, page); 1867 1868 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1869 } 1870 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); 1871 1872 #ifdef CONFIG_KEXEC_CORE 1873 void tdx_cpu_flush_cache_for_kexec(void) 1874 { 1875 lockdep_assert_preemption_disabled(); 1876 1877 if (!this_cpu_read(cache_state_incoherent)) 1878 return; 1879 1880 /* 1881 * Private memory cachelines need to be clean at the time of 1882 * kexec. Write them back now, as the caller promises that 1883 * there should be no more SEAMCALLs on this CPU. 1884 */ 1885 wbinvd(); 1886 this_cpu_write(cache_state_incoherent, false); 1887 } 1888 EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); 1889 #endif 1890
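
/*
 * Illustrative sketch (hypothetical caller code, not built here): the
 * overall enabling sequence a user of this file such as KVM is expected
 * to follow, per the requirements documented at tdx_enable():
 *
 *	cpus_read_lock();
 *	// 1) Do VMXON and tdx_cpu_enable() on all online CPUs, e.g. via
 *	//    on_each_cpu() callbacks (interrupts disabled in IPI context).
 *	// 2) With at least one CPU online in every package, initialize
 *	//    the module:
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 *
 * Per-guest KeyIDs are then allocated with tdx_guest_keyid_alloc() and
 * released with tdx_guest_keyid_free().
 */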