1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright(c) 2023 Intel Corporation. 4 * 5 * Intel Trusted Domain Extensions (TDX) support 6 */ 7 8 #include "asm/page_types.h" 9 #define pr_fmt(fmt) "virt/tdx: " fmt 10 11 #include <linux/types.h> 12 #include <linux/cache.h> 13 #include <linux/init.h> 14 #include <linux/errno.h> 15 #include <linux/printk.h> 16 #include <linux/cpu.h> 17 #include <linux/spinlock.h> 18 #include <linux/percpu-defs.h> 19 #include <linux/mutex.h> 20 #include <linux/list.h> 21 #include <linux/memblock.h> 22 #include <linux/memory.h> 23 #include <linux/minmax.h> 24 #include <linux/sizes.h> 25 #include <linux/pfn.h> 26 #include <linux/align.h> 27 #include <linux/sort.h> 28 #include <linux/log2.h> 29 #include <linux/acpi.h> 30 #include <linux/suspend.h> 31 #include <linux/syscore_ops.h> 32 #include <linux/idr.h> 33 #include <linux/kvm_types.h> 34 #include <asm/page.h> 35 #include <asm/special_insns.h> 36 #include <asm/msr-index.h> 37 #include <asm/msr.h> 38 #include <asm/cpufeature.h> 39 #include <asm/tdx.h> 40 #include <asm/cpu_device_id.h> 41 #include <asm/processor.h> 42 #include <asm/mce.h> 43 #include <asm/virt.h> 44 #include "tdx.h" 45 46 static u32 tdx_global_keyid __ro_after_init; 47 static u32 tdx_guest_keyid_start __ro_after_init; 48 static u32 tdx_nr_guest_keyids __ro_after_init; 49 50 static DEFINE_IDA(tdx_guest_keyid_pool); 51 52 static DEFINE_PER_CPU(bool, tdx_lp_initialized); 53 54 static struct tdmr_info_list tdx_tdmr_list; 55 56 /* All TDX-usable memory regions. Protected by mem_hotplug_lock. 
*/ 57 static LIST_HEAD(tdx_memlist); 58 59 static struct tdx_sys_info tdx_sysinfo __ro_after_init; 60 static bool tdx_module_initialized __ro_after_init; 61 62 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); 63 64 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) 65 { 66 pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err); 67 } 68 69 static inline void seamcall_err_ret(u64 fn, u64 err, 70 struct tdx_module_args *args) 71 { 72 seamcall_err(fn, err, args); 73 pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n", 74 args->rcx, args->rdx, args->r8); 75 pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n", 76 args->r9, args->r10, args->r11); 77 } 78 79 static __always_inline int sc_retry_prerr(sc_func_t func, 80 sc_err_func_t err_func, 81 u64 fn, struct tdx_module_args *args) 82 { 83 u64 sret = sc_retry(func, fn, args); 84 85 if (sret == TDX_SUCCESS) 86 return 0; 87 88 if (sret == TDX_SEAMCALL_VMFAILINVALID) 89 return -ENODEV; 90 91 if (sret == TDX_SEAMCALL_GP) 92 return -EOPNOTSUPP; 93 94 if (sret == TDX_SEAMCALL_UD) 95 return -EACCES; 96 97 err_func(fn, sret, args); 98 return -EIO; 99 } 100 101 #define seamcall_prerr(__fn, __args) \ 102 sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args)) 103 104 #define seamcall_prerr_ret(__fn, __args) \ 105 sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args)) 106 107 /* 108 * Do the module global initialization once and return its result. 109 * It can be done on any cpu, and from task or IRQ context. 
*/
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	/* Serialize callers so that TDH.SYS.INIT is issued at most once. */
	raw_spin_lock(&sysinit_lock);

	/* Already attempted: hand back the cached result. */
	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, so it can
	 * fail because the TDX module is not loaded.  Print a message
	 * to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	/* Set even on failure, so the SEAMCALL is never issued a second time. */
	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/*
 * Do the one-time TDX module per-cpu initialization SEAMCALL (and the
 * TDX module global initialization SEAMCALL if not done yet) on the
 * local cpu to make this cpu ready to run any other SEAMCALLs.
 *
 * This function does not take the X86_FEATURE_VMX reference itself;
 * its caller in this file, tdx_online_cpu(), does so beforehand.
 *
 * Returns 0 on success or if this cpu was already initialized,
 * otherwise the error from the failing initialization step.
 */
static int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Need to do it first (if hasn't been done)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}

/*
 * CPU hotplug online callback: take an X86_FEATURE_VMX reference and
 * then initialize TDX on this cpu.  The reference is dropped again if
 * the TDX initialization fails.
 */
static int tdx_online_cpu(unsigned int cpu)
{
	int ret;

	ret = x86_virt_get_ref(X86_FEATURE_VMX);
	if (ret)
		return ret;

	ret = tdx_cpu_enable();
	if (ret)
		x86_virt_put_ref(X86_FEATURE_VMX);

	return ret;
}

/*
 * CPU hotplug offline callback.  Returns -EBUSY to refuse the offline
 * when guest KeyIDs are in use and @cpu is the last online cpu of its
 * package; otherwise drops this cpu's X86_FEATURE_VMX reference and
 * returns 0.
 */
static int tdx_offline_cpu(unsigned int cpu)
{
	int i;

	/* No TD is running.  Allow any cpu to be offline. */
	if (ida_is_empty(&tdx_guest_keyid_pool))
		goto done;

	/*
	 * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
	 * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
	 * controller with pconfig.  If we have active TDX HKID, refuse to
	 * offline the last online cpu.
	 */
	for_each_online_cpu(i) {
		/*
		 * Found another online cpu on the same package.
		 * Allow to offline.
		 */
		if (i != cpu && topology_physical_package_id(i) ==
				topology_physical_package_id(cpu))
			goto done;
	}

	/*
	 * This is the last cpu of this package.  Don't offline it.
	 *
	 * Because it's hard for human operator to understand the
	 * reason, warn it.
	 */
#define MSG_ALLPKG_ONLINE \
	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
	return -EBUSY;

done:
	x86_virt_put_ref(X86_FEATURE_VMX);
	return 0;
}

/* Per-cpu half of tdx_shutdown(): drop this cpu's X86_FEATURE_VMX reference. */
static void tdx_shutdown_cpu(void *ign)
{
	x86_virt_put_ref(X86_FEATURE_VMX);
}

/* Syscore shutdown hook: run tdx_shutdown_cpu() on each cpu and wait for it. */
static void tdx_shutdown(void *ign)
{
	on_each_cpu(tdx_shutdown_cpu, NULL, 1);
}

/* Syscore suspend hook: drop the X86_FEATURE_VMX reference; always succeeds. */
static int tdx_suspend(void *ign)
{
	x86_virt_put_ref(X86_FEATURE_VMX);
	return 0;
}

/* Syscore resume hook: re-take the reference dropped by tdx_suspend(). */
static void tdx_resume(void *ign)
{
	WARN_ON_ONCE(x86_virt_get_ref(X86_FEATURE_VMX));
}

static const struct syscore_ops tdx_syscore_ops = {
	.suspend = tdx_suspend,
	.resume = tdx_resume,
	.shutdown = tdx_shutdown,
};

static struct syscore tdx_syscore = {
	.ops = &tdx_syscore_ops,
};

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
*/
static __init int add_tdx_memblock(struct list_head *tmb_list,
				   unsigned long start_pfn,
				   unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc_obj(*tmb);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

/* Unlink and free every entry on @tmb_list, leaving the list empty. */
static __init void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 *
 * On failure every entry already added to @tmb_list is freed again.
 */
static __init int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * to the page allocator, it is still in memblock's memory
		 * regions.  Skip them manually to exclude them as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  The regions in
		 * memblock has already guaranteed they are in address
		 * ascending order and don't overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

/*
 * Read the global metadata field @field_id from the TDX module into
 * @data.  @data is only written on success.
 */
static __init int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

#include "tdx_global_metadata.c"

/*
 * Reject TDX modules lacking TDX_FEATURES0_NO_RBP_MOD: returns -EINVAL
 * (after printing why) when the bit is clear, 0 otherwise.
 */
static __init int check_features(struct tdx_sys_info *sysinfo)
{
	u64 tdx_features0 = sysinfo->features.tdx_features0;

	if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
		pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
		return -EINVAL;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static __init int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

/*
 * Allocate one zeroed buffer large enough for @sysinfo_tdmr->max_tdmrs
 * TDMR entries and record its geometry in @tdmr_list.  Returns -ENOMEM
 * if the allocation fails.
 */
static __init int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
				  struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of TDMR to find the target TDMR
	 * at a given index in the TDMR list.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

/* Free the buffer allocated by alloc_tdmr_list(), using the recorded geometry. */
static __init void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	/* Entries are tdmr_sz bytes apart, not sizeof(struct tdmr_info). */
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)

/* Exclusive end address of @tdmr. */
static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 *
 * Returns -ENOSPC if more than @tdmr_list->max_tdmrs TDMRs would be
 * needed.
 */
static __init int fill_out_tdmrs(struct list_head *tmb_list,
				 struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically only consumes less than 10 of those.  This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
519 */ 520 static __init unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, 521 u16 pamt_entry_size) 522 { 523 unsigned long pamt_sz, nr_pamt_entries; 524 525 switch (pgsz) { 526 case TDX_PS_4K: 527 nr_pamt_entries = tdmr->size >> PAGE_SHIFT; 528 break; 529 case TDX_PS_2M: 530 nr_pamt_entries = tdmr->size >> PMD_SHIFT; 531 break; 532 case TDX_PS_1G: 533 nr_pamt_entries = tdmr->size >> PUD_SHIFT; 534 break; 535 default: 536 WARN_ON_ONCE(1); 537 return 0; 538 } 539 540 pamt_sz = nr_pamt_entries * pamt_entry_size; 541 /* TDX requires PAMT size must be 4K aligned */ 542 pamt_sz = ALIGN(pamt_sz, PAGE_SIZE); 543 544 return pamt_sz; 545 } 546 547 /* 548 * Locate a NUMA node which should hold the allocation of the @tdmr 549 * PAMT. This node will have some memory covered by the TDMR. The 550 * relative amount of memory covered is not considered. 551 */ 552 static __init int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) 553 { 554 struct tdx_memblock *tmb; 555 556 /* 557 * A TDMR must cover at least part of one TMB. That TMB will end 558 * after the TDMR begins. But, that TMB may have started before 559 * the TDMR. Find the next 'tmb' that _ends_ after this TDMR 560 * begins. Ignore 'tmb' start addresses. They are irrelevant. 561 */ 562 list_for_each_entry(tmb, tmb_list, list) { 563 if (tmb->end_pfn > PHYS_PFN(tdmr->base)) 564 return tmb->nid; 565 } 566 567 /* 568 * Fall back to allocating the TDMR's metadata from node 0 when 569 * no TDX memory block can be found. This should never happen 570 * since TDMRs originate from TDX memory blocks. 571 */ 572 pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n", 573 tdmr->base, tdmr_end(tdmr)); 574 return 0; 575 } 576 577 /* 578 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list 579 * within @tdmr, and set up PAMTs for @tdmr. 
580 */ 581 static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr, 582 struct list_head *tmb_list, 583 u16 pamt_entry_size[]) 584 { 585 unsigned long pamt_base[TDX_PS_NR]; 586 unsigned long pamt_size[TDX_PS_NR]; 587 unsigned long tdmr_pamt_base; 588 unsigned long tdmr_pamt_size; 589 struct page *pamt; 590 int pgsz, nid; 591 592 nid = tdmr_get_nid(tdmr, tmb_list); 593 594 /* 595 * Calculate the PAMT size for each TDX supported page size 596 * and the total PAMT size. 597 */ 598 tdmr_pamt_size = 0; 599 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { 600 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, 601 pamt_entry_size[pgsz]); 602 tdmr_pamt_size += pamt_size[pgsz]; 603 } 604 605 /* 606 * Allocate one chunk of physically contiguous memory for all 607 * PAMTs. This helps minimize the PAMT's use of reserved areas 608 * in overlapped TDMRs. 609 */ 610 pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, 611 nid, &node_online_map); 612 if (!pamt) 613 return -ENOMEM; 614 615 /* 616 * Break the contiguous allocation back up into the 617 * individual PAMTs for each page size. 618 */ 619 tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; 620 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { 621 pamt_base[pgsz] = tdmr_pamt_base; 622 tdmr_pamt_base += pamt_size[pgsz]; 623 } 624 625 tdmr->pamt_4k_base = pamt_base[TDX_PS_4K]; 626 tdmr->pamt_4k_size = pamt_size[TDX_PS_4K]; 627 tdmr->pamt_2m_base = pamt_base[TDX_PS_2M]; 628 tdmr->pamt_2m_size = pamt_size[TDX_PS_2M]; 629 tdmr->pamt_1g_base = pamt_base[TDX_PS_1G]; 630 tdmr->pamt_1g_size = pamt_size[TDX_PS_1G]; 631 632 return 0; 633 } 634 635 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, 636 unsigned long *pamt_size) 637 { 638 unsigned long pamt_bs, pamt_sz; 639 640 /* 641 * The PAMT was allocated in one contiguous unit. The 4K PAMT 642 * should always point to the beginning of that allocation. 
643 */ 644 pamt_bs = tdmr->pamt_4k_base; 645 pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size; 646 647 WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK)); 648 649 *pamt_base = pamt_bs; 650 *pamt_size = pamt_sz; 651 } 652 653 static __init void tdmr_do_pamt_func(struct tdmr_info *tdmr, 654 void (*pamt_func)(unsigned long base, unsigned long size)) 655 { 656 unsigned long pamt_base, pamt_size; 657 658 tdmr_get_pamt(tdmr, &pamt_base, &pamt_size); 659 660 /* Do nothing if PAMT hasn't been allocated for this TDMR */ 661 if (!pamt_size) 662 return; 663 664 if (WARN_ON_ONCE(!pamt_base)) 665 return; 666 667 pamt_func(pamt_base, pamt_size); 668 } 669 670 static __init void free_pamt(unsigned long pamt_base, unsigned long pamt_size) 671 { 672 free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); 673 } 674 675 static __init void tdmr_free_pamt(struct tdmr_info *tdmr) 676 { 677 tdmr_do_pamt_func(tdmr, free_pamt); 678 } 679 680 static __init void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) 681 { 682 int i; 683 684 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) 685 tdmr_free_pamt(tdmr_entry(tdmr_list, i)); 686 } 687 688 /* Allocate and set up PAMTs for all TDMRs */ 689 static __init int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, 690 struct list_head *tmb_list, 691 u16 pamt_entry_size[]) 692 { 693 int i, ret = 0; 694 695 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 696 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list, 697 pamt_entry_size); 698 if (ret) 699 goto err; 700 } 701 702 return 0; 703 err: 704 tdmrs_free_pamt_all(tdmr_list); 705 return ret; 706 } 707 708 /* 709 * Convert TDX private pages back to normal by using MOVDIR64B to clear these 710 * pages. Typically, any write to the page will convert it from TDX private back 711 * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to 712 * do the conversion explicitly via MOVDIR64B. 
*/
static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	/* Nothing to do on systems without the erratum. */
	if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
		return;

	/* Overwrite the range 64 bytes at a time from the zero page. */
	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses WC protocol.  Use memory barrier to
	 * make sure any later user of these pages sees the
	 * updated data.
	 */
	mb();
}

/* Apply tdx_quirk_reset_paddr() to the single page @page. */
void tdx_quirk_reset_page(struct page *page)
{
	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
}
EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);

/* Apply tdx_quirk_reset_paddr() to the PAMT of @tdmr, if it has one. */
static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
}

/* Apply tdmr_quirk_reset_pamt() to all consumed TDMRs in @tdmr_list. */
static __init void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
}

/* Total PAMT size of all consumed TDMRs in @tdmr_list, in KB. */
static __init unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

/*
 * Record [@addr, @addr + @size) as reserved area number *@p_idx of
 * @tdmr and advance *@p_idx.
 *
 * Returns -EINVAL if @addr or @size is not 4K aligned and -ENOSPC if
 * all @max_reserved_per_tdmr slots are already used.
 */
static __init int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx,
				     u64 addr, u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	/* The offset is stored relative to the start of the TDMR. */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static __init int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
					   struct tdmr_info *tdmr,
					   int *rsvd_idx,
					   u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
866 */ 867 static __init int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, 868 struct tdmr_info *tdmr, 869 int *rsvd_idx, 870 u16 max_reserved_per_tdmr) 871 { 872 int i, ret; 873 874 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 875 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i); 876 unsigned long pamt_base, pamt_size, pamt_end; 877 878 tdmr_get_pamt(tmp, &pamt_base, &pamt_size); 879 /* Each TDMR must already have PAMT allocated */ 880 WARN_ON_ONCE(!pamt_size || !pamt_base); 881 882 pamt_end = pamt_base + pamt_size; 883 /* Skip PAMTs outside of the given TDMR */ 884 if ((pamt_end <= tdmr->base) || 885 (pamt_base >= tdmr_end(tdmr))) 886 continue; 887 888 /* Only mark the part within the TDMR as reserved */ 889 if (pamt_base < tdmr->base) 890 pamt_base = tdmr->base; 891 if (pamt_end > tdmr_end(tdmr)) 892 pamt_end = tdmr_end(tdmr); 893 894 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base, 895 pamt_end - pamt_base, 896 max_reserved_per_tdmr); 897 if (ret) 898 return ret; 899 } 900 901 return 0; 902 } 903 904 /* Compare function called by sort() for TDMR reserved areas */ 905 static __init int rsvd_area_cmp_func(const void *a, const void *b) 906 { 907 struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; 908 struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; 909 910 if (r1->offset + r1->size <= r2->offset) 911 return -1; 912 if (r1->offset >= r2->offset + r2->size) 913 return 1; 914 915 /* Reserved areas cannot overlap. The caller must guarantee. */ 916 WARN_ON_ONCE(1); 917 return -1; 918 } 919 920 /* 921 * Populate reserved areas for the given @tdmr, including memory holes 922 * (via @tmb_list) and PAMTs (via @tdmr_list). 
923 */ 924 static __init int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, 925 struct list_head *tmb_list, 926 struct tdmr_info_list *tdmr_list, 927 u16 max_reserved_per_tdmr) 928 { 929 int ret, rsvd_idx = 0; 930 931 ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx, 932 max_reserved_per_tdmr); 933 if (ret) 934 return ret; 935 936 ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx, 937 max_reserved_per_tdmr); 938 if (ret) 939 return ret; 940 941 /* TDX requires reserved areas listed in address ascending order */ 942 sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area), 943 rsvd_area_cmp_func, NULL); 944 945 return 0; 946 } 947 948 /* 949 * Populate reserved areas for all TDMRs in @tdmr_list, including memory 950 * holes (via @tmb_list) and PAMTs. 951 */ 952 static __init int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, 953 struct list_head *tmb_list, 954 u16 max_reserved_per_tdmr) 955 { 956 int i; 957 958 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 959 int ret; 960 961 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i), 962 tmb_list, tdmr_list, max_reserved_per_tdmr); 963 if (ret) 964 return ret; 965 } 966 967 return 0; 968 } 969 970 /* 971 * Construct a list of TDMRs on the preallocated space in @tdmr_list 972 * to cover all TDX memory regions in @tmb_list based on the TDX module 973 * TDMR global information in @sysinfo_tdmr. 
*/
static __init int construct_tdmrs(struct list_head *tmb_list,
				  struct tdmr_info_list *tdmr_list,
				  struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	/* Per page size PAMT entry sizes; the initializer order is 4K, 2M, 1G. */
	u16 pamt_entry_size[TDX_PS_NR] = {
		sysinfo_tdmr->pamt_4k_entry_size,
		sysinfo_tdmr->pamt_2m_entry_size,
		sysinfo_tdmr->pamt_1g_entry_size,
	};
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
	if (ret)
		return ret;

	/* On failure here the PAMTs allocated just above are freed again. */
	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			sysinfo_tdmr->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

/*
 * Hand the consumed TDMRs in @tdmr_list and @global_keyid to the TDX
 * module via TDH.SYS.CONFIG.
 */
static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
				    u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has certain
	 * alignment requirement.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	/* RCX: array address, RDX: number of entries, R8: global KeyID. */
	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

/* smp_call_on_cpu() callback: issue TDH.SYS.KEY.CONFIG on the current cpu. */
static __init int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has
 * no online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static __init int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		/* @packages is indexed by package id here, not by cpu number. */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

/* Initialize one TDMR by calling TDH.SYS.TDMR.INIT until it is fully done. */
static __init int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

/* Initialize all consumed TDMRs in @tdmr_list, stopping at the first failure. */
static __init int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Run the TDX module initialization sequence: read and check the
 * module's system information, build the TDX memory list and the
 * TDMRs covering it, then configure the module, the global KeyID and
 * the TDMRs.  On failure everything set up so far is torn down in
 * reverse order through the err_* labels.
 */
static __init int init_tdx_module(void)
{
	int ret;

	ret = get_tdx_sys_info(&tdx_sysinfo);
	if (ret)
		return ret;

	/* Check whether the kernel can support this module */
	ret = check_features(&tdx_sysinfo);
	if (ret)
		return ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Config the key of global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

	/* Shared exit: both the success path and the error paths end here. */
out_put_tdxmem:
	/*
	 * @tdx_memlist is written here and read at memory hotplug time.
	 * Lock out memory hotplug code while building it.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of PAMTs may already have been initialized by the
	 * TDX module.  Flush cache before returning PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

/*
 * Initcall: check the CPU features TDX depends on, install the CPU
 * hotplug callbacks, initialize the TDX module and register the
 * syscore hooks.
 */
static __init int tdx_enable(void)
{
	enum cpuhp_state state;
	int ret;

	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
		pr_err("TDX not supported by the host platform\n");
		return -ENODEV;
	}

	if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
		pr_err("XSAVE is required for TDX\n");
		return -EINVAL;
	}

	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
		pr_err("MOVDIR64B is required for TDX\n");
		return -EINVAL;
	}

	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
		pr_err("Self-snoop is required for TDX\n");
		return -ENODEV;
	}

	/* A negative value is an error; otherwise it is kept for removal below. */
	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "virt/tdx:online",
				  tdx_online_cpu, tdx_offline_cpu);
	if (state < 0)
		return state;

	ret = init_tdx_module();
	if (ret) {
		pr_err("TDX-Module initialization failed (%d)\n", ret);
		/* Undo the hotplug registration made above. */
		cpuhp_remove_state(state);
		return ret;
	}

	register_syscore(&tdx_syscore);

	tdx_module_initialized = true;
	pr_info("TDX-Module initialized\n");
	return 0;
}
subsys_initcall(tdx_enable);

static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
1291 */ 1292 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { 1293 unsigned long base, size; 1294 1295 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); 1296 1297 if (phys >= base && phys < (base + size)) 1298 return true; 1299 } 1300 1301 return false; 1302 } 1303 1304 /* 1305 * Return whether the memory page at the given physical address is TDX 1306 * private memory or not. 1307 * 1308 * This can be imprecise for two known reasons: 1309 * 1. PAMTs are private memory and exist before the TDX module is 1310 * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively 1311 * short window that occurs once per boot. 1312 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the 1313 * page. However, the page can still cause #MC until it has been 1314 * fully converted to shared using 64-byte writes like MOVDIR64B. 1315 * Buggy hosts might still leave #MC-causing memory in place which 1316 * this function can not detect. 1317 */ 1318 static bool paddr_is_tdx_private(unsigned long phys) 1319 { 1320 struct tdx_module_args args = { 1321 .rcx = phys & PAGE_MASK, 1322 }; 1323 u64 sret; 1324 1325 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) 1326 return false; 1327 1328 /* Get page type from the TDX module */ 1329 sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args); 1330 1331 /* 1332 * The SEAMCALL will not return success unless there is a 1333 * working, "ready" TDX module. Assume an absence of TDX 1334 * private pages until SEAMCALL is working. 1335 */ 1336 if (sret) 1337 return false; 1338 1339 /* 1340 * SEAMCALL was successful -- read page type (via RCX): 1341 * 1342 * - PT_NDA: Page is not used by the TDX module 1343 * - PT_RSVD: Reserved for Non-TDX use 1344 * - Others: Page is used by the TDX module 1345 * 1346 * Note PAMT pages are marked as PT_RSVD but they are also TDX 1347 * private memory. 
1348 */ 1349 switch (args.rcx) { 1350 case PT_NDA: 1351 return false; 1352 case PT_RSVD: 1353 return is_pamt_page(phys); 1354 default: 1355 return true; 1356 } 1357 } 1358 1359 /* 1360 * Some TDX-capable CPUs have an erratum. A write to TDX private 1361 * memory poisons that memory, and a subsequent read of that memory 1362 * triggers #MC. 1363 * 1364 * Help distinguish erratum-triggered #MCs from a normal hardware one. 1365 * Just print additional message to show such #MC may be result of the 1366 * erratum. 1367 */ 1368 const char *tdx_dump_mce_info(struct mce *m) 1369 { 1370 if (!m || !mce_is_memory_error(m) || !mce_usable_address(m)) 1371 return NULL; 1372 1373 if (!paddr_is_tdx_private(m->addr)) 1374 return NULL; 1375 1376 return "TDX private memory error. Possible kernel bug."; 1377 } 1378 1379 static __init int record_keyid_partitioning(u32 *tdx_keyid_start, 1380 u32 *nr_tdx_keyids) 1381 { 1382 u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids; 1383 int ret; 1384 1385 /* 1386 * IA32_MKTME_KEYID_PARTIONING: 1387 * Bit [31:0]: Number of MKTME KeyIDs. 1388 * Bit [63:32]: Number of TDX private KeyIDs. 1389 */ 1390 ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids, 1391 &_nr_tdx_keyids); 1392 if (ret || !_nr_tdx_keyids) 1393 return -EINVAL; 1394 1395 /* TDX KeyIDs start after the last MKTME KeyID. */ 1396 _tdx_keyid_start = _nr_mktme_keyids + 1; 1397 1398 *tdx_keyid_start = _tdx_keyid_start; 1399 *nr_tdx_keyids = _nr_tdx_keyids; 1400 1401 return 0; 1402 } 1403 1404 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn) 1405 { 1406 struct tdx_memblock *tmb; 1407 1408 /* 1409 * This check assumes that the start_pfn<->end_pfn range does not 1410 * cross multiple @tdx_memlist entries. A single memory online 1411 * event across multiple memblocks (from which @tdx_memlist 1412 * entries are derived at the time of module initialization) is 1413 * not possible. 
This is because memory offline/online is done 1414 * on granularity of 'struct memory_block', and the hotpluggable 1415 * memory region (one memblock) must be multiple of memory_block. 1416 */ 1417 list_for_each_entry(tmb, &tdx_memlist, list) { 1418 if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn) 1419 return true; 1420 } 1421 return false; 1422 } 1423 1424 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action, 1425 void *v) 1426 { 1427 struct memory_notify *mn = v; 1428 1429 if (action != MEM_GOING_ONLINE) 1430 return NOTIFY_OK; 1431 1432 /* 1433 * Empty list means TDX isn't enabled. Allow any memory 1434 * to go online. 1435 */ 1436 if (list_empty(&tdx_memlist)) 1437 return NOTIFY_OK; 1438 1439 /* 1440 * The TDX memory configuration is static and can not be 1441 * changed. Reject onlining any memory which is outside of 1442 * the static configuration whether it supports TDX or not. 1443 */ 1444 if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages)) 1445 return NOTIFY_OK; 1446 1447 return NOTIFY_BAD; 1448 } 1449 1450 static struct notifier_block tdx_memory_nb = { 1451 .notifier_call = tdx_memory_notifier, 1452 }; 1453 1454 static void __init check_tdx_erratum(void) 1455 { 1456 /* 1457 * These CPUs have an erratum. A partial write from non-TD 1458 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX 1459 * private memory poisons that memory, and a subsequent read of 1460 * that memory triggers #MC. 
1461 */ 1462 switch (boot_cpu_data.x86_vfm) { 1463 case INTEL_SAPPHIRERAPIDS_X: 1464 case INTEL_EMERALDRAPIDS_X: 1465 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE); 1466 } 1467 } 1468 1469 void __init tdx_init(void) 1470 { 1471 u32 tdx_keyid_start, nr_tdx_keyids; 1472 int err; 1473 1474 err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids); 1475 if (err) 1476 return; 1477 1478 pr_info("BIOS enabled: private KeyID range [%u, %u)\n", 1479 tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids); 1480 1481 /* 1482 * The TDX module itself requires one 'global KeyID' to protect 1483 * its metadata. If there's only one TDX KeyID, there won't be 1484 * any left for TDX guests thus there's no point to enable TDX 1485 * at all. 1486 */ 1487 if (nr_tdx_keyids < 2) { 1488 pr_err("initialization failed: too few private KeyIDs available.\n"); 1489 return; 1490 } 1491 1492 /* 1493 * At this point, hibernation_available() indicates whether or 1494 * not hibernation support has been permanently disabled. 1495 */ 1496 if (hibernation_available()) { 1497 pr_err("initialization failed: Hibernation support is enabled\n"); 1498 return; 1499 } 1500 1501 err = register_memory_notifier(&tdx_memory_nb); 1502 if (err) { 1503 pr_err("initialization failed: register_memory_notifier() failed (%d)\n", 1504 err); 1505 return; 1506 } 1507 1508 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) 1509 pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n"); 1510 acpi_suspend_lowlevel = NULL; 1511 #endif 1512 1513 /* 1514 * Just use the first TDX KeyID as the 'global KeyID' and 1515 * leave the rest for TDX guests. 
1516 */ 1517 tdx_global_keyid = tdx_keyid_start; 1518 tdx_guest_keyid_start = tdx_keyid_start + 1; 1519 tdx_nr_guest_keyids = nr_tdx_keyids - 1; 1520 1521 setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM); 1522 1523 check_tdx_erratum(); 1524 } 1525 1526 const struct tdx_sys_info *tdx_get_sysinfo(void) 1527 { 1528 if (!tdx_module_initialized) 1529 return NULL; 1530 1531 return (const struct tdx_sys_info *)&tdx_sysinfo; 1532 } 1533 EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); 1534 1535 u32 tdx_get_nr_guest_keyids(void) 1536 { 1537 return tdx_nr_guest_keyids; 1538 } 1539 EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids); 1540 1541 int tdx_guest_keyid_alloc(void) 1542 { 1543 return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, 1544 tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, 1545 GFP_KERNEL); 1546 } 1547 EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc); 1548 1549 void tdx_guest_keyid_free(unsigned int keyid) 1550 { 1551 ida_free(&tdx_guest_keyid_pool, keyid); 1552 } 1553 EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free); 1554 1555 static inline u64 tdx_tdr_pa(struct tdx_td *td) 1556 { 1557 return page_to_phys(td->tdr_page); 1558 } 1559 1560 /* 1561 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether 1562 * a CLFLUSH of pages is required before handing them to the TDX module. 1563 * Be conservative and make the code simpler by doing the CLFLUSH 1564 * unconditionally. 
1565 */ 1566 static void tdx_clflush_page(struct page *page) 1567 { 1568 clflush_cache_range(page_to_virt(page), PAGE_SIZE); 1569 } 1570 1571 noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) 1572 { 1573 args->rcx = td->tdvpr_pa; 1574 1575 return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args); 1576 } 1577 EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter); 1578 1579 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) 1580 { 1581 struct tdx_module_args args = { 1582 .rcx = page_to_phys(tdcs_page), 1583 .rdx = tdx_tdr_pa(td), 1584 }; 1585 1586 tdx_clflush_page(tdcs_page); 1587 return seamcall(TDH_MNG_ADDCX, &args); 1588 } 1589 EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx); 1590 1591 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) 1592 { 1593 struct tdx_module_args args = { 1594 .rcx = gpa, 1595 .rdx = tdx_tdr_pa(td), 1596 .r8 = page_to_phys(page), 1597 .r9 = page_to_phys(source), 1598 }; 1599 u64 ret; 1600 1601 tdx_clflush_page(page); 1602 ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); 1603 1604 *ext_err1 = args.rcx; 1605 *ext_err2 = args.rdx; 1606 1607 return ret; 1608 } 1609 EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add); 1610 1611 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) 1612 { 1613 struct tdx_module_args args = { 1614 .rcx = gpa | level, 1615 .rdx = tdx_tdr_pa(td), 1616 .r8 = page_to_phys(page), 1617 }; 1618 u64 ret; 1619 1620 tdx_clflush_page(page); 1621 ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); 1622 1623 *ext_err1 = args.rcx; 1624 *ext_err2 = args.rdx; 1625 1626 return ret; 1627 } 1628 EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add); 1629 1630 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) 1631 { 1632 struct tdx_module_args args = { 1633 .rcx = page_to_phys(tdcx_page), 1634 .rdx = vp->tdvpr_pa, 1635 }; 1636 1637 tdx_clflush_page(tdcx_page); 1638 return seamcall(TDH_VP_ADDCX, &args); 1639 } 1640 
EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx); 1641 1642 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) 1643 { 1644 struct tdx_module_args args = { 1645 .rcx = gpa | level, 1646 .rdx = tdx_tdr_pa(td), 1647 .r8 = page_to_phys(page), 1648 }; 1649 u64 ret; 1650 1651 tdx_clflush_page(page); 1652 ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); 1653 1654 *ext_err1 = args.rcx; 1655 *ext_err2 = args.rdx; 1656 1657 return ret; 1658 } 1659 EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug); 1660 1661 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) 1662 { 1663 struct tdx_module_args args = { 1664 .rcx = gpa | level, 1665 .rdx = tdx_tdr_pa(td), 1666 }; 1667 u64 ret; 1668 1669 ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); 1670 1671 *ext_err1 = args.rcx; 1672 *ext_err2 = args.rdx; 1673 1674 return ret; 1675 } 1676 EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block); 1677 1678 u64 tdh_mng_key_config(struct tdx_td *td) 1679 { 1680 struct tdx_module_args args = { 1681 .rcx = tdx_tdr_pa(td), 1682 }; 1683 1684 return seamcall(TDH_MNG_KEY_CONFIG, &args); 1685 } 1686 EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config); 1687 1688 u64 tdh_mng_create(struct tdx_td *td, u16 hkid) 1689 { 1690 struct tdx_module_args args = { 1691 .rcx = tdx_tdr_pa(td), 1692 .rdx = hkid, 1693 }; 1694 1695 tdx_clflush_page(td->tdr_page); 1696 return seamcall(TDH_MNG_CREATE, &args); 1697 } 1698 EXPORT_SYMBOL_FOR_KVM(tdh_mng_create); 1699 1700 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) 1701 { 1702 struct tdx_module_args args = { 1703 .rcx = vp->tdvpr_pa, 1704 .rdx = tdx_tdr_pa(td), 1705 }; 1706 1707 tdx_clflush_page(vp->tdvpr_page); 1708 return seamcall(TDH_VP_CREATE, &args); 1709 } 1710 EXPORT_SYMBOL_FOR_KVM(tdh_vp_create); 1711 1712 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) 1713 { 1714 struct tdx_module_args args = { 1715 .rcx = tdx_tdr_pa(td), 1716 .rdx = field, 1717 }; 1718 u64 ret; 1719 1720 ret = 
seamcall_ret(TDH_MNG_RD, &args); 1721 1722 /* R8: Content of the field, or 0 in case of error. */ 1723 *data = args.r8; 1724 1725 return ret; 1726 } 1727 EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd); 1728 1729 u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) 1730 { 1731 struct tdx_module_args args = { 1732 .rcx = gpa, 1733 .rdx = tdx_tdr_pa(td), 1734 }; 1735 u64 ret; 1736 1737 ret = seamcall_ret(TDH_MR_EXTEND, &args); 1738 1739 *ext_err1 = args.rcx; 1740 *ext_err2 = args.rdx; 1741 1742 return ret; 1743 } 1744 EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend); 1745 1746 u64 tdh_mr_finalize(struct tdx_td *td) 1747 { 1748 struct tdx_module_args args = { 1749 .rcx = tdx_tdr_pa(td), 1750 }; 1751 1752 return seamcall(TDH_MR_FINALIZE, &args); 1753 } 1754 EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize); 1755 1756 u64 tdh_vp_flush(struct tdx_vp *vp) 1757 { 1758 struct tdx_module_args args = { 1759 .rcx = vp->tdvpr_pa, 1760 }; 1761 1762 return seamcall(TDH_VP_FLUSH, &args); 1763 } 1764 EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush); 1765 1766 u64 tdh_mng_vpflushdone(struct tdx_td *td) 1767 { 1768 struct tdx_module_args args = { 1769 .rcx = tdx_tdr_pa(td), 1770 }; 1771 1772 return seamcall(TDH_MNG_VPFLUSHDONE, &args); 1773 } 1774 EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone); 1775 1776 u64 tdh_mng_key_freeid(struct tdx_td *td) 1777 { 1778 struct tdx_module_args args = { 1779 .rcx = tdx_tdr_pa(td), 1780 }; 1781 1782 return seamcall(TDH_MNG_KEY_FREEID, &args); 1783 } 1784 EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid); 1785 1786 u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) 1787 { 1788 struct tdx_module_args args = { 1789 .rcx = tdx_tdr_pa(td), 1790 .rdx = td_params, 1791 }; 1792 u64 ret; 1793 1794 ret = seamcall_ret(TDH_MNG_INIT, &args); 1795 1796 *extended_err = args.rcx; 1797 1798 return ret; 1799 } 1800 EXPORT_SYMBOL_FOR_KVM(tdh_mng_init); 1801 1802 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) 1803 { 1804 struct tdx_module_args args = { 1805 .rcx = vp->tdvpr_pa, 
1806 .rdx = field, 1807 }; 1808 u64 ret; 1809 1810 ret = seamcall_ret(TDH_VP_RD, &args); 1811 1812 /* R8: Content of the field, or 0 in case of error. */ 1813 *data = args.r8; 1814 1815 return ret; 1816 } 1817 EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd); 1818 1819 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) 1820 { 1821 struct tdx_module_args args = { 1822 .rcx = vp->tdvpr_pa, 1823 .rdx = field, 1824 .r8 = data, 1825 .r9 = mask, 1826 }; 1827 1828 return seamcall(TDH_VP_WR, &args); 1829 } 1830 EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr); 1831 1832 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) 1833 { 1834 struct tdx_module_args args = { 1835 .rcx = vp->tdvpr_pa, 1836 .rdx = initial_rcx, 1837 .r8 = x2apicid, 1838 }; 1839 1840 /* apicid requires version == 1. */ 1841 return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args); 1842 } 1843 EXPORT_SYMBOL_FOR_KVM(tdh_vp_init); 1844 1845 /* 1846 * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats. 1847 * So despite the names, they must be interpted specially as described by the spec. Return 1848 * them only for error reporting purposes. 
1849 */ 1850 u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) 1851 { 1852 struct tdx_module_args args = { 1853 .rcx = page_to_phys(page), 1854 }; 1855 u64 ret; 1856 1857 ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); 1858 1859 *tdx_pt = args.rcx; 1860 *tdx_owner = args.rdx; 1861 *tdx_size = args.r8; 1862 1863 return ret; 1864 } 1865 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim); 1866 1867 u64 tdh_mem_track(struct tdx_td *td) 1868 { 1869 struct tdx_module_args args = { 1870 .rcx = tdx_tdr_pa(td), 1871 }; 1872 1873 return seamcall(TDH_MEM_TRACK, &args); 1874 } 1875 EXPORT_SYMBOL_FOR_KVM(tdh_mem_track); 1876 1877 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) 1878 { 1879 struct tdx_module_args args = { 1880 .rcx = gpa | level, 1881 .rdx = tdx_tdr_pa(td), 1882 }; 1883 u64 ret; 1884 1885 ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); 1886 1887 *ext_err1 = args.rcx; 1888 *ext_err2 = args.rdx; 1889 1890 return ret; 1891 } 1892 EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove); 1893 1894 u64 tdh_phymem_cache_wb(bool resume) 1895 { 1896 struct tdx_module_args args = { 1897 .rcx = resume ? 
1 : 0, 1898 }; 1899 1900 return seamcall(TDH_PHYMEM_CACHE_WB, &args); 1901 } 1902 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb); 1903 1904 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) 1905 { 1906 struct tdx_module_args args = {}; 1907 1908 args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); 1909 1910 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1911 } 1912 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr); 1913 1914 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) 1915 { 1916 struct tdx_module_args args = {}; 1917 1918 args.rcx = mk_keyed_paddr(hkid, page); 1919 1920 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1921 } 1922 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); 1923 1924 #ifdef CONFIG_KEXEC_CORE 1925 void tdx_cpu_flush_cache_for_kexec(void) 1926 { 1927 lockdep_assert_preemption_disabled(); 1928 1929 if (!this_cpu_read(cache_state_incoherent)) 1930 return; 1931 1932 /* 1933 * Private memory cachelines need to be clean at the time of 1934 * kexec. Write them back now, as the caller promises that 1935 * there should be no more SEAMCALLs on this CPU. 1936 */ 1937 wbinvd(); 1938 this_cpu_write(cache_state_incoherent, false); 1939 } 1940 EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); 1941 #endif 1942