// SPDX-License-Identifier: GPL-2.0-only
/*
 * AMD SVM-SEV Host Support.
 *
 * Copyright (C) 2023 Advanced Micro Devices, Inc.
 *
 * Author: Ashish Kalra <ashish.kalra@amd.com>
 *
 */

#include <linux/cc_platform.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/iommu.h>
#include <linux/amd-iommu.h>
#include <linux/nospec.h>

#include <asm/sev.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/svm.h>
#include <asm/smp.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/cpuid.h>
#include <asm/cmdline.h>
#include <asm/iommu.h>

/*
 * The RMP entry information as returned by the RMPREAD instruction.
 *
 * On CPUs without RMPREAD, get_rmpentry() fills this structure in software
 * from a struct rmpentry_raw. In that case only gpa, asid, assigned, pagesize
 * and immutable are populated; every other field (including
 * hpage_region_status) is left zero.
 */
struct rmpentry {
	u64 gpa;			/* address form: raw gpa field << PAGE_SHIFT */
	u8  assigned		:1,
	    rsvd1		:7;
	u8  pagesize		:1,	/* converted with RMP_TO_PG_LEVEL() by users */
	    hpage_region_status	:1,
	    rsvd2		:6;
	u8  immutable		:1,
	    rsvd3		:7;
	u8  rsvd4;
	u32 asid;
} __packed;

/*
 * The raw RMP entry format is not architectural. The format is defined in PPR
 * Family 19h Model 01h, Rev B1 processor. This format represents the actual
 * entry in the RMP table memory. The bitfield definitions are used for machines
 * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
 * fields are only used for dumping the raw data.
 *
 * The anonymous bitfield struct and "lo" share storage through the union, so
 * "lo" is the first 64-bit word of the entry and "hi" is the second; an entry
 * is 16 bytes in total.
 */
struct rmpentry_raw {
	union {
		struct {
			u64 assigned	: 1,
			    pagesize	: 1,
			    immutable	: 1,
			    rsvd1	: 9,
			    gpa		: 39,	/* shifted by PAGE_SHIFT when copied to struct rmpentry */
			    asid	: 10,
			    vmsa	: 1,
			    validated	: 1,
			    rsvd2	: 1;
		};
		u64 lo;
	};
	u64 hi;
} __packed;

/*
 * The first 16KB from the RMP_BASE is used by the processor for the
 * bookkeeping, the range needs to be added during the RMP entry lookup.
78 */ 79 #define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 80 81 /* 82 * For a non-segmented RMP table, use the maximum physical addressing as the 83 * segment size in order to always arrive at index 0 in the table. 84 */ 85 #define RMPTABLE_NON_SEGMENTED_SHIFT 52 86 87 struct rmp_segment_desc { 88 struct rmpentry_raw *rmp_entry; 89 u64 max_index; 90 u64 size; 91 }; 92 93 /* 94 * Segmented RMP Table support. 95 * - The segment size is used for two purposes: 96 * - Identify the amount of memory covered by an RMP segment 97 * - Quickly locate an RMP segment table entry for a physical address 98 * 99 * - The RMP segment table contains pointers to an RMP table that covers 100 * a specific portion of memory. There can be up to 512 8-byte entries, 101 * one pages worth. 102 */ 103 #define RST_ENTRY_MAPPED_SIZE(x) ((x) & GENMASK_ULL(19, 0)) 104 #define RST_ENTRY_SEGMENT_BASE(x) ((x) & GENMASK_ULL(51, 20)) 105 106 #define RST_SIZE SZ_4K 107 static struct rmp_segment_desc **rmp_segment_table __ro_after_init; 108 static unsigned int rst_max_index __ro_after_init = 512; 109 110 static unsigned int rmp_segment_shift; 111 static u64 rmp_segment_size; 112 static u64 rmp_segment_mask; 113 114 #define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift) 115 #define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask))) 116 117 static u64 rmp_cfg; 118 119 /* Mask to apply to a PFN to get the first PFN of a 2MB page */ 120 #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) 121 122 static u64 probed_rmp_base, probed_rmp_size; 123 124 static LIST_HEAD(snp_leaked_pages_list); 125 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); 126 127 static unsigned long snp_nr_leaked_pages; 128 129 #undef pr_fmt 130 #define pr_fmt(fmt) "SEV-SNP: " fmt 131 132 static int __mfd_enable(unsigned int cpu) 133 { 134 u64 val; 135 136 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 137 return 0; 138 139 rdmsrl(MSR_AMD64_SYSCFG, val); 140 141 val |= MSR_AMD64_SYSCFG_MFDM; 142 143 wrmsrl(MSR_AMD64_SYSCFG, 
val); 144 145 return 0; 146 } 147 148 static __init void mfd_enable(void *arg) 149 { 150 __mfd_enable(smp_processor_id()); 151 } 152 153 static int __snp_enable(unsigned int cpu) 154 { 155 u64 val; 156 157 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 158 return 0; 159 160 rdmsrl(MSR_AMD64_SYSCFG, val); 161 162 val |= MSR_AMD64_SYSCFG_SNP_EN; 163 val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; 164 165 wrmsrl(MSR_AMD64_SYSCFG, val); 166 167 return 0; 168 } 169 170 static __init void snp_enable(void *arg) 171 { 172 __snp_enable(smp_processor_id()); 173 } 174 175 static void __init __snp_fixup_e820_tables(u64 pa) 176 { 177 if (IS_ALIGNED(pa, PMD_SIZE)) 178 return; 179 180 /* 181 * Handle cases where the RMP table placement by the BIOS is not 182 * 2M aligned and the kexec kernel could try to allocate 183 * from within that chunk which then causes a fatal RMP fault. 184 * 185 * The e820_table needs to be updated as it is converted to 186 * kernel memory resources and used by KEXEC_FILE_LOAD syscall 187 * to load kexec segments. 188 * 189 * The e820_table_firmware needs to be updated as it is exposed 190 * to sysfs and used by the KEXEC_LOAD syscall to load kexec 191 * segments. 192 * 193 * The e820_table_kexec needs to be updated as it passed to 194 * the kexec-ed kernel. 
195 */ 196 pa = ALIGN_DOWN(pa, PMD_SIZE); 197 if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { 198 pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); 199 e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 200 e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 201 e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 202 if (!memblock_is_region_reserved(pa, PMD_SIZE)) 203 memblock_reserve(pa, PMD_SIZE); 204 } 205 } 206 207 static void __init fixup_e820_tables_for_segmented_rmp(void) 208 { 209 u64 pa, *rst, size, mapped_size; 210 unsigned int i; 211 212 __snp_fixup_e820_tables(probed_rmp_base); 213 214 pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; 215 216 __snp_fixup_e820_tables(pa + RST_SIZE); 217 218 rst = early_memremap(pa, RST_SIZE); 219 if (!rst) 220 return; 221 222 for (i = 0; i < rst_max_index; i++) { 223 pa = RST_ENTRY_SEGMENT_BASE(rst[i]); 224 mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); 225 if (!mapped_size) 226 continue; 227 228 __snp_fixup_e820_tables(pa); 229 230 /* 231 * Mapped size in GB. Mapped size is allowed to exceed 232 * the segment coverage size, but gets reduced to the 233 * segment coverage size. 
234 */ 235 mapped_size <<= 30; 236 if (mapped_size > rmp_segment_size) 237 mapped_size = rmp_segment_size; 238 239 /* Calculate the RMP segment size (16 bytes/page mapped) */ 240 size = PHYS_PFN(mapped_size) << 4; 241 242 __snp_fixup_e820_tables(pa + size); 243 } 244 245 early_memunmap(rst, RST_SIZE); 246 } 247 248 static void __init fixup_e820_tables_for_contiguous_rmp(void) 249 { 250 __snp_fixup_e820_tables(probed_rmp_base); 251 __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); 252 } 253 254 void __init snp_fixup_e820_tables(void) 255 { 256 if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { 257 fixup_e820_tables_for_segmented_rmp(); 258 } else { 259 fixup_e820_tables_for_contiguous_rmp(); 260 } 261 } 262 263 static bool __init clear_rmptable_bookkeeping(void) 264 { 265 void *bk; 266 267 bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB); 268 if (!bk) { 269 pr_err("Failed to map RMP bookkeeping area\n"); 270 return false; 271 } 272 273 memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ); 274 275 memunmap(bk); 276 277 return true; 278 } 279 280 static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa) 281 { 282 u64 rst_index, rmp_segment_size_max; 283 struct rmp_segment_desc *desc; 284 void *rmp_segment; 285 286 /* Calculate the maximum size an RMP can be (16 bytes/page mapped) */ 287 rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4; 288 289 /* Validate the RMP segment size */ 290 if (segment_size > rmp_segment_size_max) { 291 pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n", 292 segment_size, rmp_segment_size_max); 293 return false; 294 } 295 296 /* Validate the RMP segment table index */ 297 rst_index = RST_ENTRY_INDEX(pa); 298 if (rst_index >= rst_max_index) { 299 pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n", 300 pa, rmp_segment_size); 301 return false; 302 } 303 304 if (rmp_segment_table[rst_index]) { 305 pr_err("RMP segment descriptor already exists at 
index %llu\n", rst_index); 306 return false; 307 } 308 309 rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB); 310 if (!rmp_segment) { 311 pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n", 312 segment_pa, segment_size); 313 return false; 314 } 315 316 desc = kzalloc(sizeof(*desc), GFP_KERNEL); 317 if (!desc) { 318 memunmap(rmp_segment); 319 return false; 320 } 321 322 desc->rmp_entry = rmp_segment; 323 desc->max_index = segment_size / sizeof(*desc->rmp_entry); 324 desc->size = segment_size; 325 326 rmp_segment_table[rst_index] = desc; 327 328 return true; 329 } 330 331 static void __init free_rmp_segment_table(void) 332 { 333 unsigned int i; 334 335 for (i = 0; i < rst_max_index; i++) { 336 struct rmp_segment_desc *desc; 337 338 desc = rmp_segment_table[i]; 339 if (!desc) 340 continue; 341 342 memunmap(desc->rmp_entry); 343 344 kfree(desc); 345 } 346 347 free_page((unsigned long)rmp_segment_table); 348 349 rmp_segment_table = NULL; 350 } 351 352 /* Allocate the table used to index into the RMP segments */ 353 static bool __init alloc_rmp_segment_table(void) 354 { 355 struct page *page; 356 357 page = alloc_page(__GFP_ZERO); 358 if (!page) 359 return false; 360 361 rmp_segment_table = page_address(page); 362 363 return true; 364 } 365 366 static bool __init setup_contiguous_rmptable(void) 367 { 368 u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end; 369 370 if (!probed_rmp_size) 371 return false; 372 373 rmp_end = probed_rmp_base + probed_rmp_size - 1; 374 375 /* 376 * Calculate the amount of memory that must be reserved by the BIOS to 377 * address the whole RAM, including the bookkeeping area. The RMP itself 378 * must also be covered. 
379 */ 380 max_rmp_pfn = max_pfn; 381 if (PFN_UP(rmp_end) > max_pfn) 382 max_rmp_pfn = PFN_UP(rmp_end); 383 384 calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; 385 if (calc_rmp_sz > probed_rmp_size) { 386 pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", 387 calc_rmp_sz, probed_rmp_size); 388 return false; 389 } 390 391 if (!alloc_rmp_segment_table()) 392 return false; 393 394 /* Map only the RMP entries */ 395 rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; 396 rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; 397 398 if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) { 399 free_rmp_segment_table(); 400 return false; 401 } 402 403 return true; 404 } 405 406 static bool __init setup_segmented_rmptable(void) 407 { 408 u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max; 409 unsigned int i, max_index; 410 411 if (!probed_rmp_base) 412 return false; 413 414 if (!alloc_rmp_segment_table()) 415 return false; 416 417 rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; 418 rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB); 419 if (!rst) { 420 pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa); 421 goto e_free; 422 } 423 424 pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30); 425 426 ram_pa_max = max_pfn << PAGE_SHIFT; 427 428 max_index = 0; 429 ram_pa_end = 0; 430 for (i = 0; i < rst_max_index; i++) { 431 u64 rmp_segment, rmp_size, mapped_size; 432 433 mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); 434 if (!mapped_size) 435 continue; 436 437 max_index = i; 438 439 /* 440 * Mapped size in GB. Mapped size is allowed to exceed the 441 * segment coverage size, but gets reduced to the segment 442 * coverage size. 
443 */ 444 mapped_size <<= 30; 445 if (mapped_size > rmp_segment_size) { 446 pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n", 447 i, mapped_size, rmp_segment_size); 448 mapped_size = rmp_segment_size; 449 } 450 451 rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]); 452 453 /* Calculate the RMP segment size (16 bytes/page mapped) */ 454 rmp_size = PHYS_PFN(mapped_size) << 4; 455 456 pa = (u64)i << rmp_segment_shift; 457 458 /* 459 * Some segments may be for MMIO mapped above system RAM. These 460 * segments are used for Trusted I/O. 461 */ 462 if (pa < ram_pa_max) 463 ram_pa_end = pa + mapped_size; 464 465 if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa)) 466 goto e_unmap; 467 468 pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n", 469 i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1); 470 } 471 472 if (ram_pa_max > ram_pa_end) { 473 pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n", 474 ram_pa_max, ram_pa_end); 475 goto e_unmap; 476 } 477 478 /* Adjust the maximum index based on the found segments */ 479 rst_max_index = max_index + 1; 480 481 memunmap(rst); 482 483 return true; 484 485 e_unmap: 486 memunmap(rst); 487 488 e_free: 489 free_rmp_segment_table(); 490 491 return false; 492 } 493 494 static bool __init setup_rmptable(void) 495 { 496 if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { 497 return setup_segmented_rmptable(); 498 } else { 499 return setup_contiguous_rmptable(); 500 } 501 } 502 503 /* 504 * Do the necessary preparations which are verified by the firmware as 505 * described in the SNP_INIT_EX firmware command description in the SNP 506 * firmware ABI spec. 
507 */ 508 static int __init snp_rmptable_init(void) 509 { 510 unsigned int i; 511 u64 val; 512 513 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 514 return 0; 515 516 if (!amd_iommu_snp_en) 517 goto nosnp; 518 519 if (!setup_rmptable()) 520 goto nosnp; 521 522 /* 523 * Check if SEV-SNP is already enabled, this can happen in case of 524 * kexec boot. 525 */ 526 rdmsrl(MSR_AMD64_SYSCFG, val); 527 if (val & MSR_AMD64_SYSCFG_SNP_EN) 528 goto skip_enable; 529 530 /* Zero out the RMP bookkeeping area */ 531 if (!clear_rmptable_bookkeeping()) { 532 free_rmp_segment_table(); 533 goto nosnp; 534 } 535 536 /* Zero out the RMP entries */ 537 for (i = 0; i < rst_max_index; i++) { 538 struct rmp_segment_desc *desc; 539 540 desc = rmp_segment_table[i]; 541 if (!desc) 542 continue; 543 544 memset(desc->rmp_entry, 0, desc->size); 545 } 546 547 /* Flush the caches to ensure that data is written before SNP is enabled. */ 548 wbinvd_on_all_cpus(); 549 550 /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */ 551 on_each_cpu(mfd_enable, NULL, 1); 552 553 on_each_cpu(snp_enable, NULL, 1); 554 555 skip_enable: 556 cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); 557 558 /* 559 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic 560 * notifier is invoked to do SNP IOMMU shutdown before kdump. 561 */ 562 crash_kexec_post_notifiers = true; 563 564 return 0; 565 566 nosnp: 567 cc_platform_clear(CC_ATTR_HOST_SEV_SNP); 568 return -ENOSYS; 569 } 570 571 /* 572 * This must be called after the IOMMU has been initialized. 
573 */ 574 device_initcall(snp_rmptable_init); 575 576 static void set_rmp_segment_info(unsigned int segment_shift) 577 { 578 rmp_segment_shift = segment_shift; 579 rmp_segment_size = 1ULL << rmp_segment_shift; 580 rmp_segment_mask = rmp_segment_size - 1; 581 } 582 583 #define RMP_ADDR_MASK GENMASK_ULL(51, 13) 584 585 static bool probe_contiguous_rmptable_info(void) 586 { 587 u64 rmp_sz, rmp_base, rmp_end; 588 589 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); 590 rdmsrl(MSR_AMD64_RMP_END, rmp_end); 591 592 if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { 593 pr_err("Memory for the RMP table has not been reserved by BIOS\n"); 594 return false; 595 } 596 597 if (rmp_base > rmp_end) { 598 pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); 599 return false; 600 } 601 602 rmp_sz = rmp_end - rmp_base + 1; 603 604 /* Treat the contiguous RMP table as a single segment */ 605 rst_max_index = 1; 606 607 set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT); 608 609 probed_rmp_base = rmp_base; 610 probed_rmp_size = rmp_sz; 611 612 pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", 613 rmp_base, rmp_end); 614 615 return true; 616 } 617 618 static bool probe_segmented_rmptable_info(void) 619 { 620 unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max; 621 u64 rmp_base, rmp_end; 622 623 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); 624 if (!(rmp_base & RMP_ADDR_MASK)) { 625 pr_err("Memory for the RMP table has not been reserved by BIOS\n"); 626 return false; 627 } 628 629 rdmsrl(MSR_AMD64_RMP_END, rmp_end); 630 WARN_ONCE(rmp_end & RMP_ADDR_MASK, 631 "Segmented RMP enabled but RMP_END MSR is non-zero\n"); 632 633 /* Obtain the min and max supported RMP segment size */ 634 eax = cpuid_eax(0x80000025); 635 segment_shift_min = eax & GENMASK(5, 0); 636 segment_shift_max = (eax & GENMASK(11, 6)) >> 6; 637 638 /* Verify the segment size is within the supported limits */ 639 segment_shift = 
MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg); 640 if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) { 641 pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n", 642 segment_shift, segment_shift_min, segment_shift_max); 643 return false; 644 } 645 646 /* Override the max supported RST index if a hardware limit exists */ 647 ebx = cpuid_ebx(0x80000025); 648 if (ebx & BIT(10)) 649 rst_max_index = ebx & GENMASK(9, 0); 650 651 set_rmp_segment_info(segment_shift); 652 653 probed_rmp_base = rmp_base; 654 probed_rmp_size = 0; 655 656 pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n", 657 rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE); 658 659 return true; 660 } 661 662 bool snp_probe_rmptable_info(void) 663 { 664 if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP)) 665 rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg); 666 667 if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) 668 return probe_segmented_rmptable_info(); 669 else 670 return probe_contiguous_rmptable_info(); 671 } 672 673 /* 674 * About the array_index_nospec() usage below: 675 * 676 * This function can get called by exported functions like 677 * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among 678 * others, and since the @pfn passed in cannot always be trusted, 679 * speculation should be stopped as a protective measure. 
680 */ 681 static struct rmpentry_raw *get_raw_rmpentry(u64 pfn) 682 { 683 u64 paddr, rst_index, segment_index; 684 struct rmp_segment_desc *desc; 685 686 if (!rmp_segment_table) 687 return ERR_PTR(-ENODEV); 688 689 paddr = pfn << PAGE_SHIFT; 690 691 rst_index = RST_ENTRY_INDEX(paddr); 692 if (unlikely(rst_index >= rst_max_index)) 693 return ERR_PTR(-EFAULT); 694 695 rst_index = array_index_nospec(rst_index, rst_max_index); 696 697 desc = rmp_segment_table[rst_index]; 698 if (unlikely(!desc)) 699 return ERR_PTR(-EFAULT); 700 701 segment_index = RMP_ENTRY_INDEX(paddr); 702 if (unlikely(segment_index >= desc->max_index)) 703 return ERR_PTR(-EFAULT); 704 705 segment_index = array_index_nospec(segment_index, desc->max_index); 706 707 return desc->rmp_entry + segment_index; 708 } 709 710 static int get_rmpentry(u64 pfn, struct rmpentry *e) 711 { 712 struct rmpentry_raw *e_raw; 713 714 if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) { 715 int ret; 716 717 /* Binutils version 2.44 supports the RMPREAD mnemonic. */ 718 asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd" 719 : "=a" (ret) 720 : "a" (pfn << PAGE_SHIFT), "c" (e) 721 : "memory", "cc"); 722 723 return ret; 724 } 725 726 e_raw = get_raw_rmpentry(pfn); 727 if (IS_ERR(e_raw)) 728 return PTR_ERR(e_raw); 729 730 /* 731 * Map the raw RMP table entry onto the RMPREAD output format. 732 * The 2MB region status indicator (hpage_region_status field) is not 733 * calculated, since the overhead could be significant and the field 734 * is not used. 
735 */ 736 memset(e, 0, sizeof(*e)); 737 e->gpa = e_raw->gpa << PAGE_SHIFT; 738 e->asid = e_raw->asid; 739 e->assigned = e_raw->assigned; 740 e->pagesize = e_raw->pagesize; 741 e->immutable = e_raw->immutable; 742 743 return 0; 744 } 745 746 static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level) 747 { 748 struct rmpentry e_large; 749 int ret; 750 751 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 752 return -ENODEV; 753 754 ret = get_rmpentry(pfn, e); 755 if (ret) 756 return ret; 757 758 /* 759 * Find the authoritative RMP entry for a PFN. This can be either a 4K 760 * RMP entry or a special large RMP entry that is authoritative for a 761 * whole 2M area. 762 */ 763 ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large); 764 if (ret) 765 return ret; 766 767 *level = RMP_TO_PG_LEVEL(e_large.pagesize); 768 769 return 0; 770 } 771 772 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) 773 { 774 struct rmpentry e; 775 int ret; 776 777 ret = __snp_lookup_rmpentry(pfn, &e, level); 778 if (ret) 779 return ret; 780 781 *assigned = !!e.assigned; 782 return 0; 783 } 784 EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); 785 786 /* 787 * Dump the raw RMP entry for a particular PFN. These bits are documented in the 788 * PPR for a particular CPU model and provide useful information about how a 789 * particular PFN is being utilized by the kernel/firmware at the time certain 790 * unexpected events occur, such as RMP faults. 
791 */ 792 static void dump_rmpentry(u64 pfn) 793 { 794 struct rmpentry_raw *e_raw; 795 u64 pfn_i, pfn_end; 796 struct rmpentry e; 797 int level, ret; 798 799 ret = __snp_lookup_rmpentry(pfn, &e, &level); 800 if (ret) { 801 pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n", 802 pfn, ret); 803 return; 804 } 805 806 if (e.assigned) { 807 e_raw = get_raw_rmpentry(pfn); 808 if (IS_ERR(e_raw)) { 809 pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n", 810 pfn, PTR_ERR(e_raw)); 811 return; 812 } 813 814 pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n", 815 pfn, e_raw->lo, e_raw->hi); 816 return; 817 } 818 819 /* 820 * If the RMP entry for a particular PFN is not in an assigned state, 821 * then it is sometimes useful to get an idea of whether or not any RMP 822 * entries for other PFNs within the same 2MB region are assigned, since 823 * those too can affect the ability to access a particular PFN in 824 * certain situations, such as when the PFN is being accessed via a 2MB 825 * mapping in the host page table. 
826 */ 827 pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD); 828 pfn_end = pfn_i + PTRS_PER_PMD; 829 830 pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n", 831 pfn, pfn_i, pfn_end); 832 833 while (pfn_i < pfn_end) { 834 e_raw = get_raw_rmpentry(pfn_i); 835 if (IS_ERR(e_raw)) { 836 pr_err("Error %ld reading RMP contents for PFN 0x%llx\n", 837 PTR_ERR(e_raw), pfn_i); 838 pfn_i++; 839 continue; 840 } 841 842 if (e_raw->lo || e_raw->hi) 843 pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi); 844 pfn_i++; 845 } 846 } 847 848 void snp_dump_hva_rmpentry(unsigned long hva) 849 { 850 unsigned long paddr; 851 unsigned int level; 852 pgd_t *pgd; 853 pte_t *pte; 854 855 pgd = __va(read_cr3_pa()); 856 pgd += pgd_index(hva); 857 pte = lookup_address_in_pgd(pgd, hva, &level); 858 859 if (!pte) { 860 pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva); 861 return; 862 } 863 864 paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level)); 865 dump_rmpentry(PHYS_PFN(paddr)); 866 } 867 868 /* 869 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the 870 * Validated bit. 871 */ 872 int psmash(u64 pfn) 873 { 874 unsigned long paddr = pfn << PAGE_SHIFT; 875 int ret; 876 877 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 878 return -ENODEV; 879 880 if (!pfn_valid(pfn)) 881 return -EINVAL; 882 883 /* Binutils version 2.36 supports the PSMASH mnemonic. */ 884 asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" 885 : "=a" (ret) 886 : "a" (paddr) 887 : "memory", "cc"); 888 889 return ret; 890 } 891 EXPORT_SYMBOL_GPL(psmash); 892 893 /* 894 * If the kernel uses a 2MB or larger directmap mapping to write to an address, 895 * and that mapping contains any 4KB pages that are set to private in the RMP 896 * table, an RMP #PF will trigger and cause a host crash. 
Hypervisor code that
 * owns the PFNs being transitioned will never attempt such a write, but other
 * kernel tasks writing to other PFNs in the range may trigger these checks
 * inadvertently due to a large directmap mapping that happens to overlap such a
 * PFN.
 *
 * Prevent this by splitting any 2MB+ mappings that might end up containing a
 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
 * PFN/rmp_level passed in.
 *
 * Note that there is no attempt here to scan all the RMP entries for the 2MB
 * physical range, since it would only be worthwhile in determining if a
 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
 * the same shared/private state, thus avoiding the need to split the mapping.
 * But that would mean the entries are currently in a mixed state, and so the
 * mapping would have already been split as a result of prior transitions.
 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
 * currently a mechanism in place to restore 2MB+ mappings, such a check would
 * not provide any usable benefit.
 *
 * More specifics on how these checks are carried out can be found in APM
 * Volume 2, "RMP and VMPL Access Checks".
 *
 * Returns 0 when no split is needed or the split succeeded, -EINVAL for an
 * unsupported level, an invalid PFN or a misaligned/partially invalid 2MB
 * range, and otherwise the error from set_memory_4k().
 */
static int adjust_direct_map(u64 pfn, int rmp_level)
{
	unsigned long vaddr;
	unsigned int level;
	int npages, ret;
	pte_t *pte;

	/*
	 * pfn_to_kaddr() will return a vaddr only within the direct
	 * map range.
	 */
	vaddr = (unsigned long)pfn_to_kaddr(pfn);

	/* Only 4KB/2MB RMP entries are supported by current hardware. */
	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
		return -EINVAL;

	if (!pfn_valid(pfn))
		return -EINVAL;

	/* A 2MB entry must start on a 2MB boundary and end on a valid PFN */
	if (rmp_level == PG_LEVEL_2M &&
	    (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
		return -EINVAL;

	/*
	 * If an entire 2MB physical range is being transitioned, then there is
	 * no risk of RMP #PFs due to write accesses from overlapping mappings,
	 * since even accesses from 1GB mappings will be treated as 2MB accesses
	 * as far as RMP table checks are concerned.
	 */
	if (rmp_level == PG_LEVEL_2M)
		return 0;

	/* Nothing to split when the address is not mapped in the direct map */
	pte = lookup_address(vaddr, &level);
	if (!pte || pte_none(*pte))
		return 0;

	/* Already mapped at 4K granularity */
	if (level == PG_LEVEL_4K)
		return 0;

	npages = page_level_size(rmp_level) / PAGE_SIZE;
	ret = set_memory_4k(vaddr, npages);
	if (ret)
		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
			pfn, ret);

	return ret;
}

/*
 * It is expected that those operations are seldom enough so that no mutual
 * exclusion of updaters is needed and thus the overlap error condition below
 * should happen very rarely and would get resolved relatively quickly by
 * the firmware.
 *
 * If not, one could consider introducing a mutex or so here to sync concurrent
 * RMP updates and thus diminish the amount of cases where firmware needs to
 * lock 2M ranges to protect against concurrent updates.
 *
 * The optimal solution would be range locking to avoid locking disjoint
 * regions unnecessarily but there's no support for that yet.
 *
 * Returns 0 on success, -ENODEV when the platform is not an SNP host, and
 * -EFAULT when the direct map could not be adjusted or RMPUPDATE reported a
 * failure other than RMPUPDATE_FAIL_OVERLAP.
 */
static int rmpupdate(u64 pfn, struct rmp_state *state)
{
	unsigned long paddr = pfn << PAGE_SHIFT;
	int ret, level;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	level = RMP_TO_PG_LEVEL(state->pagesize);

	/* Any adjust_direct_map() failure is reported to the caller as -EFAULT */
	if (adjust_direct_map(pfn, level))
		return -EFAULT;

	/* Re-issue the instruction for as long as it reports an overlap */
	do {
		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
			     : "=a" (ret)
			     : "a" (paddr), "c" ((unsigned long)state)
			     : "memory", "cc");
	} while (ret == RMPUPDATE_FAIL_OVERLAP);

	if (ret) {
		pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
		       pfn, level, ret);
		dump_rmpentry(pfn);
		dump_stack();
		return -EFAULT;
	}

	return 0;
}

/*
 * Transition a page to guest-owned/private state in the RMP table.
 *
 * Builds a zero-initialized rmp_state with assigned set and the given @asid,
 * @immutable, @gpa and page size, and hands it to rmpupdate(). Returns what
 * rmpupdate() returns.
 */
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
{
	struct rmp_state state;

	memset(&state, 0, sizeof(state));
	state.assigned = 1;
	state.asid = asid;
	state.immutable = immutable;
	state.gpa = gpa;
	state.pagesize = PG_LEVEL_TO_RMP(level);

	return rmpupdate(pfn, &state);
}
EXPORT_SYMBOL_GPL(rmp_make_private);

/*
 * Transition a page to hypervisor-owned/shared state in the RMP table.
 *
 * Only the page size is set in the otherwise all-zero rmp_state, so assigned
 * stays 0. Returns what rmpupdate() returns.
 */
int rmp_make_shared(u64 pfn, enum pg_level level)
{
	struct rmp_state state;

	memset(&state, 0, sizeof(state));
	state.pagesize = PG_LEVEL_TO_RMP(level);

	return rmpupdate(pfn, &state);
}
EXPORT_SYMBOL_GPL(rmp_make_shared);

/*
 * Record @npages pages starting at @pfn as leaked: chain them onto
 * snp_leaked_pages_list where possible, dump each page's RMP entry and bump
 * the leaked-page counter. The list and counter are updated under
 * snp_leaked_pages_list_lock.
 *
 * The end value in the log message is @pfn + @npages, i.e. one past the last
 * leaked PFN.
 */
void snp_leak_pages(u64 pfn, unsigned int npages)
{
	struct page *page = pfn_to_page(pfn);

	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);

	spin_lock(&snp_leaked_pages_list_lock);
	/* npages is decremented by the loop test, before the body runs */
	while (npages--) {

		/*
		 * Reuse the page's buddy list for chaining into the leaked
		 * pages list. This page should not be on a free list currently
		 * and is also unsafe to be added to a free list.
		 */
		if (likely(!PageCompound(page)) ||

			/*
			 * Skip inserting tail pages of compound page as
			 * page->buddy_list of tail pages is not usable.
			 */
		    (PageHead(page) && compound_nr(page) <= npages))
			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);

		/* Dumped and counted whether or not the page was chained above */
		dump_rmpentry(pfn);
		snp_nr_leaked_pages++;
		pfn++;
		page++;
	}
	spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(snp_leak_pages);

/*
 * Flush the calling CPU's caches with wbinvd() when the platform is an SNP
 * host; otherwise do nothing.
 *
 * NOTE(review): the comment below implies this runs on each remote CPU during
 * kdump; the caller is not visible here, so confirm that against the crash
 * path.
 */
void kdump_sev_callback(void)
{
	/*
	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
	 * safely do SNP_SHUTDOWN on the local CPU.
	 */
	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		wbinvd();
}
