// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
			args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
			args->r9, args->r10, args->r11);
}

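/*
 * Convert a SEAMCALL completion status into a kernel errno.  The
 * special statuses below are synthesized by the low-level SEAMCALL
 * code: VMfailInvalid means no TDX module is loaded, #GP is what
 * SEAMCALL raises when TDX has not been enabled by the BIOS, and #UD
 * is raised when the CPU is not in VMX operation (VMXON has not been
 * done).  Anything else is a genuine module error and is dumped for
 * debugging.
 */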
static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
				 u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args)						\
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args)					\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail if the TDX module is not loaded.  Dump a
	 * message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu be ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Need to do it first (if it hasn't been done)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip them manually to exclude them as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  Memblock has
		 * already guaranteed they are in address ascending order
		 * and don't overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

static int read_sys_metadata_field16(u64 field_id,
				     int offset,
				     struct tdx_tdmr_sysinfo *ts)
{
	u16 *ts_member = ((void *)ts) + offset;
	u64 tmp;
	int ret;

	if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
			MD_FIELD_ID_ELE_SIZE_16BIT))
		return -EINVAL;

	ret = read_sys_metadata_field(field_id, &tmp);
	if (ret)
		return ret;

	*ts_member = tmp;

	return 0;
}

struct field_mapping {
	u64 field_id;
	int offset;
};

#define TD_SYSINFO_MAP(_field_id, _offset) \
	{ .field_id = MD_FIELD_ID_##_field_id,	\
	  .offset = offsetof(struct tdx_tdmr_sysinfo, _offset) }

/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
static const struct field_mapping fields[] = {
	TD_SYSINFO_MAP(MAX_TDMRS,		max_tdmrs),
	TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR,	max_reserved_per_tdmr),
	TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE,	pamt_entry_size[TDX_PS_4K]),
	TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE,	pamt_entry_size[TDX_PS_2M]),
	TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE,	pamt_entry_size[TDX_PS_1G]),
};

static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;
	int i;

	/* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
	for (i = 0; i < ARRAY_SIZE(fields); i++) {
		ret = read_sys_metadata_field16(fields[i].field_id,
						fields[i].offset,
						tdmr_sysinfo);
		if (ret)
			return ret;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of TDMR to find the target TDMR
	 * at a given index in the TDMR list.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

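/*
 * Example: two TDX memory blocks,
 *
 *	[ 1MB, 3GB )  and  [ 4GB, 6GB )
 *
 * get covered by two 1GB-aligned TDMRs:
 *
 *	TDMR 0: [ 0GB, 3GB )
 *	TDMR 1: [ 4GB, 6GB )
 *
 * The [ 0MB, 1MB ) gap inside TDMR 0 is not TDX memory and is later
 * carved out as a reserved area by tdmr_populate_rsvd_holes().
 */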
/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically only consumes less than 10 of those.  This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}

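/*
 * Each TDMR needs a Physical Address Metadata Table (PAMT) in which
 * the TDX module tracks the state and ownership of every physical
 * page the TDMR covers, with one PAMT per supported page size.
 *
 * Rough cost example (the real per-entry size is read from the TDX
 * module above; 16 bytes per 4K page is only an illustration): a 1GB
 * TDMR needs 1GB / 4KB * 16B = 4MB of 4K PAMT, i.e. roughly 1/256th
 * of the memory it covers, plus much smaller 2M and 1G PAMTs.
 */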
/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
				      u16 pamt_entry_size)
{
	unsigned long pamt_sz, nr_pamt_entries;

	switch (pgsz) {
	case TDX_PS_4K:
		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
		break;
	case TDX_PS_2M:
		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
		break;
	case TDX_PS_1G:
		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);

	return pamt_sz;
}

/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
	struct tdx_memblock *tmb;

	/*
	 * A TDMR must cover at least part of one TMB.  That TMB will end
	 * after the TDMR begins.  But, that TMB may have started before
	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
			return tmb->nid;
	}

	/*
	 * Fall back to allocating the TDMR's metadata from node 0 when
	 * no TDX memory block can be found.  This should never happen
	 * since TDMRs originate from TDX memory blocks.
	 */
	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
			tdmr->base, tdmr_end(tdmr));
	return 0;
}

/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
			    struct list_head *tmb_list,
			    u16 pamt_entry_size[])
{
	unsigned long pamt_base[TDX_PS_NR];
	unsigned long pamt_size[TDX_PS_NR];
	unsigned long tdmr_pamt_base;
	unsigned long tdmr_pamt_size;
	struct page *pamt;
	int pgsz, nid;

	nid = tdmr_get_nid(tdmr, tmb_list);

	/*
	 * Calculate the PAMT size for each TDX supported page size
	 * and the total PAMT size.
	 */
	tdmr_pamt_size = 0;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
					pamt_entry_size[pgsz]);
		tdmr_pamt_size += pamt_size[pgsz];
	}

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
	 * in overlapped TDMRs.
	 */
	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
			nid, &node_online_map);
	if (!pamt)
		return -ENOMEM;

	/*
	 * Break the contiguous allocation back up into the
	 * individual PAMTs for each page size.
	 */
	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_base[pgsz] = tdmr_pamt_base;
		tdmr_pamt_base += pamt_size[pgsz];
	}

	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

	return 0;
}

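/*
 * Layout of the single contiguous PAMT allocation made above
 * (a sketch):
 *
 *	pamt_4k_base
 *	     |
 *	     v
 *	     +-------------+-------------+-------------+
 *	     |   4K PAMT   |   2M PAMT   |   1G PAMT   |
 *	     +-------------+-------------+-------------+
 *
 * tdmr_get_pamt() below relies on this layout: the 4K PAMT base is
 * the base of the whole allocation, and the three sizes add up to
 * the total allocation size.
 */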
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	unsigned long pamt_bs, pamt_sz;

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
	 * should always point to the beginning of that allocation.
	 */
	pamt_bs = tdmr->pamt_4k_base;
	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;

	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));

	*pamt_base = pamt_bs;
	*pamt_size = pamt_sz;
}

static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	pamt_func(pamt_base, pamt_size);
}

static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
{
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, free_pamt);
}

static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}

/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
				 struct list_head *tmb_list,
				 u16 pamt_entry_size[])
{
	int i, ret = 0;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
				pamt_entry_size);
		if (ret)
			goto err;
	}

	return 0;
err:
	tdmrs_free_pamt_all(tdmr_list);
	return ret;
}

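/*
 * Some TDX private pages need to be handed back to the kernel, e.g.
 * PAMT pages when module initialization fails half-way.  A full
 * 64-byte direct store (MOVDIR64B) never triggers the partial write
 * erratum described in check_tdx_erratum() and completely overwrites
 * the private contents, so it is a safe way to convert such pages
 * back to normal KeyID-0 memory.
 */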
/*
 * Convert TDX private pages back to normal by using MOVDIR64B to
 * clear these pages.  Note this function doesn't flush cache of
 * these TDX private pages.  The caller should make sure of that.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses WC protocol.  Use memory barrier to
	 * make sure any later user of these pages sees the
	 * updated data.
	 */
	mb();
}

static void tdmr_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
}

static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
}

static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
			      u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

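/*
 * PAMT pages live inside TDMRs (they are allocated from regular
 * memory covered by the TDMRs) but hold TDX metadata and can never
 * be handed out as TD guest memory.  They therefore have to be
 * carved out of the TDMRs as reserved areas as well.
 */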
/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	int i, ret;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
		unsigned long pamt_base, pamt_size, pamt_end;

		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
		/* Each TDMR must already have PAMT allocated */
		WARN_ON_ONCE(!pamt_size || !pamt_base);

		pamt_end = pamt_base + pamt_size;
		/* Skip PAMTs outside of the given TDMR */
		if ((pamt_end <= tdmr->base) ||
				(pamt_base >= tdmr_end(tdmr)))
			continue;

		/* Only mark the part within the TDMR as reserved */
		if (pamt_base < tdmr->base)
			pamt_base = tdmr->base;
		if (pamt_end > tdmr_end(tdmr))
			pamt_end = tdmr_end(tdmr);

		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
				pamt_end - pamt_base,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

	if (r1->offset + r1->size <= r2->offset)
		return -1;
	if (r1->offset >= r2->offset + r2->size)
		return 1;

	/* Reserved areas cannot overlap.  The caller must guarantee that. */
	WARN_ON_ONCE(1);
	return -1;
}

/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
				    struct list_head *tmb_list,
				    struct tdmr_info_list *tdmr_list,
				    u16 max_reserved_per_tdmr)
{
	int ret, rsvd_idx = 0;

	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	/* TDX requires reserved areas listed in address ascending order */
	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
			rsvd_area_cmp_func, NULL);

	return 0;
}

/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
					 struct list_head *tmb_list,
					 u16 max_reserved_per_tdmr)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
				tmb_list, tdmr_list, max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @tdmr_sysinfo.
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
			tdmr_sysinfo->pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			tdmr_sysinfo->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has an
	 * alignment requirement.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8  = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

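/*
 * TDH.SYS.KEY.CONFIG programs the global KeyID into the memory
 * encryption hardware of the package it runs on, which is why it has
 * to be invoked once on each physical package.
 */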
/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has no
 * online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

static int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

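/*
 * Bring up the TDX module: build the list of TDX-usable memory,
 * construct TDMRs covering it, hand the TDMRs and the global KeyID
 * to the module (TDH.SYS.CONFIG), program the global KeyID on each
 * package (TDH.SYS.KEY.CONFIG), and finally initialize each TDMR
 * (TDH.SYS.TDMR.INIT).
 */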
static int init_tdx_module(void)
{
	struct tdx_tdmr_sysinfo tdmr_sysinfo;
	int ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Config the key of global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

out_put_tdxmem:
	/*
	 * @tdx_memlist is written here and read at memory hotplug time.
	 * Lock out memory hotplug code while building it.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of PAMTs may already have been initialized by the
	 * TDX module.  Flush cache before returning PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	/*
	 * According to the TDX hardware spec, if the platform
	 * doesn't have the "partial write machine check"
	 * erratum, any kernel read/write will never cause #MC
	 * in kernel space, thus it's OK to not convert PAMTs
	 * back to normal.  But do the conversion anyway here
	 * as suggested by the TDX spec.
	 */
	tdmrs_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

static int __tdx_enable(void)
{
	int ret;

	ret = init_tdx_module();
	if (ret) {
		pr_err("module initialization failed (%d)\n", ret);
		tdx_module_status = TDX_MODULE_ERROR;
		return ret;
	}

	pr_info("module initialized\n");
	tdx_module_status = TDX_MODULE_INITIALIZED;

	return 0;
}

/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * This function requires there's at least one online cpu for each CPU
 * package to succeed.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_cpus_held();

	mutex_lock(&tdx_module_lock);

	switch (tdx_module_status) {
	case TDX_MODULE_UNINITIALIZED:
		ret = __tdx_enable();
		break;
	case TDX_MODULE_INITIALIZED:
		/* Already initialized, great, tell the caller. */
		ret = 0;
		break;
	default:
		/* Failed to initialize in the previous attempts */
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&tdx_module_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);

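/*
 * Expected usage of the two exported functions above, a rough sketch.
 * The VMXON helper is the caller's (typically KVM's) responsibility
 * and is only named here for illustration:
 *
 *	cpus_read_lock();
 *	on_each_cpu(enable_vmx_and_tdx_cpu, NULL, 1);	// VMXON + tdx_cpu_enable()
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 */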
static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);

		if (phys >= base && phys < (base + size))
			return true;
	}

	return false;
}

/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from normal hardware ones.
 * Just print an additional message to show that such a #MC may be a
 * result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:	Number of MKTME KeyIDs.
	 *   Bit [63:32]:	Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * on granularity of 'struct memory_block', and the hotpluggable
	 * memory region (one memblock) must be a multiple of memory_block.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
	 */
	if (list_empty(&tdx_memlist))
		return NOTIFY_OK;

	/*
	 * The TDX memory configuration is static and can not be
	 * changed.  Reject onlining any memory which is outside of
	 * the static configuration whether it supports TDX or not.
	 */
	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
		return NOTIFY_OK;

	return NOTIFY_BAD;
}

static struct notifier_block tdx_memory_nb = {
	.notifier_call = tdx_memory_notifier,
};

static void __init check_tdx_erratum(void)
{
	/*
	 * These CPUs have an erratum.  A partial write from non-TD
	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
	 * private memory poisons that memory, and a subsequent read of
	 * that memory triggers #MC.
	 */
	switch (boot_cpu_data.x86_model) {
	case INTEL_FAM6_SAPPHIRERAPIDS_X:
	case INTEL_FAM6_EMERALDRAPIDS_X:
		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
	}
}

void __init tdx_init(void)
{
	u32 tdx_keyid_start, nr_tdx_keyids;
	int err;

	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
	if (err)
		return;

	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);

	/*
	 * The TDX module itself requires one 'global KeyID' to protect
	 * its metadata.  If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, so there's no point in enabling TDX
	 * at all.
	 */
	if (nr_tdx_keyids < 2) {
		pr_err("initialization failed: too few private KeyIDs available.\n");
		return;
	}

	/*
	 * At this point, hibernation_available() indicates whether or
	 * not hibernation support has been permanently disabled.
	 */
	if (hibernation_available()) {
		pr_err("initialization failed: Hibernation support is enabled\n");
		return;
	}

	err = register_memory_notifier(&tdx_memory_nb);
	if (err) {
		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
				err);
		return;
	}

#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
	acpi_suspend_lowlevel = NULL;
#endif

	/*
	 * Just use the first TDX KeyID as the 'global KeyID' and
	 * leave the rest for TDX guests.
	 */
	tdx_global_keyid = tdx_keyid_start;
	tdx_guest_keyid_start = tdx_keyid_start + 1;
	tdx_nr_guest_keyids = nr_tdx_keyids - 1;

	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

	check_tdx_erratum();
}