// SPDX-License-Identifier: GPL-2.0-only
/*
 * AMD SVM-SEV Host Support.
 *
 * Copyright (C) 2023 Advanced Micro Devices, Inc.
 *
 * Author: Ashish Kalra <ashish.kalra@amd.com>
 *
 */

#include <linux/cc_platform.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/iommu.h>
#include <linux/amd-iommu.h>
#include <linux/nospec.h>

#include <asm/sev.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/svm.h>
#include <asm/smp.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/cpuid/api.h>
#include <asm/cmdline.h>
#include <asm/iommu.h>
#include <asm/msr.h>

/*
 * The RMP entry information as returned by the RMPREAD instruction.
 */
struct rmpentry {
	u64 gpa;
	u8 assigned		:1,
	   rsvd1		:7;
	u8 pagesize		:1,
	   hpage_region_status	:1,
	   rsvd2		:6;
	u8 immutable		:1,
	   rsvd3		:7;
	u8 rsvd4;
	u32 asid;
} __packed;

/*
 * The raw RMP entry format is not architectural. The format is defined in PPR
 * Family 19h Model 01h, Rev B1 processor. This format represents the actual
 * entry in the RMP table memory. The bitfield definitions are used for machines
 * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
 * fields are only used for dumping the raw data.
 */
struct rmpentry_raw {
	union {
		struct {
			u64 assigned	: 1,
			    pagesize	: 1,
			    immutable	: 1,
			    rsvd1	: 9,
			    gpa		: 39,
			    asid	: 10,
			    vmsa	: 1,
			    validated	: 1,
			    rsvd2	: 1;
		};
		u64 lo;
	};
	u64 hi;
} __packed;

/*
 * The first 16KB from RMP_BASE is used by the processor for bookkeeping;
 * that range must be accounted for during the RMP entry lookup.
 */
#define RMPTABLE_CPU_BOOKKEEPING_SZ	0x4000

/*
 * For a non-segmented RMP table, use the maximum physical addressing as the
 * segment size in order to always arrive at index 0 in the table.
 */
#define RMPTABLE_NON_SEGMENTED_SHIFT	52

struct rmp_segment_desc {
	struct rmpentry_raw *rmp_entry;
	u64 max_index;
	u64 size;
};

/*
 * Segmented RMP Table support.
 *   - The segment size is used for two purposes:
 *     - Identify the amount of memory covered by an RMP segment
 *     - Quickly locate an RMP segment table entry for a physical address
 *
 *   - The RMP segment table contains pointers to an RMP table that covers
 *     a specific portion of memory. There can be up to 512 8-byte entries,
 *     one page's worth.
 */
#define RST_ENTRY_MAPPED_SIZE(x)	((x) & GENMASK_ULL(19, 0))
#define RST_ENTRY_SEGMENT_BASE(x)	((x) & GENMASK_ULL(51, 20))

#define RST_SIZE			SZ_4K
static struct rmp_segment_desc **rmp_segment_table __ro_after_init;
static unsigned int rst_max_index __ro_after_init = 512;

static unsigned int rmp_segment_shift;
static u64 rmp_segment_size;
static u64 rmp_segment_mask;

#define RST_ENTRY_INDEX(x)	((x) >> rmp_segment_shift)
#define RMP_ENTRY_INDEX(x)	((u64)(PHYS_PFN((x) & rmp_segment_mask)))
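
/*
 * Illustrative index math (a sketch with hypothetical values, not taken
 * from the PPR): with 64GB segments (rmp_segment_shift == 36), the
 * physical address 0x1234567000 yields:
 *
 *	RST_ENTRY_INDEX(0x1234567000) == 0x1234567000 >> 36    == 1
 *	RMP_ENTRY_INDEX(0x1234567000) == PHYS_PFN(0x234567000) == 0x234567
 *
 * i.e. RMP segment 1, entry 0x234567 within that segment. Likewise, an
 * RMP segment table entry of 0x0000001000000040 would decode as:
 *
 *	RST_ENTRY_MAPPED_SIZE()  == 0x40 (64GB mapped)
 *	RST_ENTRY_SEGMENT_BASE() == 0x1000000000 (RMP segment at 64GB)
 */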

static u64 rmp_cfg;

static void *rmp_bookkeeping __ro_after_init;

/* Mask to apply to a PFN to get the first PFN of a 2MB page */
#define PFN_PMD_MASK	GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)

static u64 probed_rmp_base, probed_rmp_size;

static LIST_HEAD(snp_leaked_pages_list);
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);

static unsigned long snp_nr_leaked_pages;

#undef pr_fmt
#define pr_fmt(fmt)	"SEV-SNP: " fmt

static void mfd_reconfigure(void *arg)
{
	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return;

	if (arg)
		msr_set_bit(MSR_AMD64_SYSCFG, MSR_AMD64_SYSCFG_MFDM_BIT);
	else
		msr_clear_bit(MSR_AMD64_SYSCFG, MSR_AMD64_SYSCFG_MFDM_BIT);
}

static void snp_enable(void *arg)
{
	u64 val;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return;

	rdmsrq(MSR_AMD64_SYSCFG, val);

	val |= MSR_AMD64_SYSCFG_SNP_EN;
	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;

	wrmsrq(MSR_AMD64_SYSCFG, val);
}

static void __init __snp_fixup_e820_tables(u64 pa)
{
	if (IS_ALIGNED(pa, PMD_SIZE))
		return;

	/*
	 * Handle cases where the RMP table placement by the BIOS is not
	 * 2M aligned and the kexec kernel could try to allocate
	 * from within that chunk which then causes a fatal RMP fault.
	 *
	 * The e820_table needs to be updated as it is converted to
	 * kernel memory resources and used by the KEXEC_FILE_LOAD syscall
	 * to load kexec segments.
	 *
	 * The e820_table_firmware needs to be updated as it is exposed
	 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
	 * segments.
	 *
	 * The e820_table_kexec needs to be updated as it is passed to
	 * the kexec-ed kernel.
	 */
	pa = ALIGN_DOWN(pa, PMD_SIZE);
	if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
		pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
		e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
		e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
		if (!memblock_is_region_reserved(pa, PMD_SIZE))
			memblock_reserve(pa, PMD_SIZE);
	}
}

static void __init fixup_e820_tables_for_segmented_rmp(void)
{
	u64 pa, *rst, size, mapped_size;
	unsigned int i;

	__snp_fixup_e820_tables(probed_rmp_base);

	pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;

	__snp_fixup_e820_tables(pa + RST_SIZE);

	rst = early_memremap(pa, RST_SIZE);
	if (!rst)
		return;

	for (i = 0; i < rst_max_index; i++) {
		pa = RST_ENTRY_SEGMENT_BASE(rst[i]);
		mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
		if (!mapped_size)
			continue;

		__snp_fixup_e820_tables(pa);

		/*
		 * Mapped size in GB. Mapped size is allowed to exceed
		 * the segment coverage size, but gets reduced to the
		 * segment coverage size.
		 */
		mapped_size <<= 30;
		if (mapped_size > rmp_segment_size)
			mapped_size = rmp_segment_size;

		/* Calculate the RMP segment size (16 bytes/page mapped) */
		size = PHYS_PFN(mapped_size) << 4;

		__snp_fixup_e820_tables(pa + size);
	}

	early_memunmap(rst, RST_SIZE);
}

static void __init fixup_e820_tables_for_contiguous_rmp(void)
{
	__snp_fixup_e820_tables(probed_rmp_base);
	__snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
}

void __init snp_fixup_e820_tables(void)
{
	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
		fixup_e820_tables_for_segmented_rmp();
	else
		fixup_e820_tables_for_contiguous_rmp();
}
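
/*
 * Illustrative fixup (a sketch with a hypothetical address): if the BIOS
 * places the RMP table at 0x6bf900000, that address is not 2MB aligned,
 * so __snp_fixup_e820_tables() reserves the 2MB chunk at
 * ALIGN_DOWN(0x6bf900000, PMD_SIZE) == 0x6bf800000, keeping kexec
 * allocations away from the start of the table.
 */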

static void clear_rmp(void)
{
	unsigned int i;
	u64 val;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return;

	/* Clearing the RMP while SNP is enabled will cause an exception */
	rdmsrq(MSR_AMD64_SYSCFG, val);
	if (WARN_ON_ONCE(val & MSR_AMD64_SYSCFG_SNP_EN))
		return;

	memset(rmp_bookkeeping, 0, RMPTABLE_CPU_BOOKKEEPING_SZ);

	for (i = 0; i < rst_max_index; i++) {
		struct rmp_segment_desc *desc;

		desc = rmp_segment_table[i];
		if (!desc)
			continue;

		memset(desc->rmp_entry, 0, desc->size);
	}
}

static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa)
{
	u64 rst_index, rmp_segment_size_max;
	struct rmp_segment_desc *desc;
	void *rmp_segment;

	/* Calculate the maximum size an RMP can be (16 bytes/page mapped) */
	rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4;

	/* Validate the RMP segment size */
	if (segment_size > rmp_segment_size_max) {
		pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n",
		       segment_size, rmp_segment_size_max);
		return false;
	}

	/* Validate the RMP segment table index */
	rst_index = RST_ENTRY_INDEX(pa);
	if (rst_index >= rst_max_index) {
		pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n",
		       pa, rmp_segment_size);
		return false;
	}

	if (rmp_segment_table[rst_index]) {
		pr_err("RMP segment descriptor already exists at index %llu\n", rst_index);
		return false;
	}

	rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB);
	if (!rmp_segment) {
		pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n",
		       segment_pa, segment_size);
		return false;
	}

	desc = kzalloc_obj(*desc);
	if (!desc) {
		memunmap(rmp_segment);
		return false;
	}

	desc->rmp_entry = rmp_segment;
	desc->max_index = segment_size / sizeof(*desc->rmp_entry);
	desc->size = segment_size;

	rmp_segment_table[rst_index] = desc;

	return true;
}

static void __init free_rmp_segment_table(void)
{
	unsigned int i;

	for (i = 0; i < rst_max_index; i++) {
		struct rmp_segment_desc *desc;

		desc = rmp_segment_table[i];
		if (!desc)
			continue;

		memunmap(desc->rmp_entry);

		kfree(desc);
	}

	free_page((unsigned long)rmp_segment_table);

	rmp_segment_table = NULL;
}

/* Allocate the table used to index into the RMP segments */
static bool __init alloc_rmp_segment_table(void)
{
	struct page *page;

	page = alloc_page(__GFP_ZERO);
	if (!page)
		return false;

	rmp_segment_table = page_address(page);

	return true;
}
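
/*
 * Illustrative sizing (a sketch for a hypothetical machine): with 1TB of
 * RAM and 4KB pages, max_pfn == 0x10000000, so a contiguous RMP needs at
 * least (0x10000000 << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ bytes, i.e. 4GB of
 * RMP entries plus the 16KB bookkeeping area. This is the check that
 * setup_contiguous_rmptable() below performs against probed_rmp_size.
 */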

static bool __init setup_contiguous_rmptable(void)
{
	u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end;

	if (!probed_rmp_size)
		return false;

	rmp_end = probed_rmp_base + probed_rmp_size - 1;

	/*
	 * Calculate the amount of memory that must be reserved by the BIOS to
	 * address the whole RAM, including the bookkeeping area. The RMP itself
	 * must also be covered.
	 */
	max_rmp_pfn = max_pfn;
	if (PFN_UP(rmp_end) > max_pfn)
		max_rmp_pfn = PFN_UP(rmp_end);

	calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
	if (calc_rmp_sz > probed_rmp_size) {
		pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
		       calc_rmp_sz, probed_rmp_size);
		return false;
	}

	if (!alloc_rmp_segment_table())
		return false;

	/* Map only the RMP entries */
	rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
	rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;

	if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) {
		free_rmp_segment_table();
		return false;
	}

	return true;
}

static bool __init setup_segmented_rmptable(void)
{
	u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max;
	unsigned int i, max_index;

	if (!probed_rmp_base)
		return false;

	if (!alloc_rmp_segment_table())
		return false;

	rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
	rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB);
	if (!rst) {
		pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa);
		goto e_free;
	}

	pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30);

	ram_pa_max = max_pfn << PAGE_SHIFT;

	max_index = 0;
	ram_pa_end = 0;
	for (i = 0; i < rst_max_index; i++) {
		u64 rmp_segment, rmp_size, mapped_size;

		mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
		if (!mapped_size)
			continue;

		max_index = i;

		/*
		 * Mapped size in GB. Mapped size is allowed to exceed the
		 * segment coverage size, but gets reduced to the segment
		 * coverage size.
		 */
		mapped_size <<= 30;
		if (mapped_size > rmp_segment_size) {
			pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n",
				i, mapped_size, rmp_segment_size);
			mapped_size = rmp_segment_size;
		}

		rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]);

		/* Calculate the RMP segment size (16 bytes/page mapped) */
		rmp_size = PHYS_PFN(mapped_size) << 4;

		pa = (u64)i << rmp_segment_shift;

		/*
		 * Some segments may cover MMIO that is mapped above system
		 * RAM. These segments are used for Trusted I/O.
		 */
		if (pa < ram_pa_max)
			ram_pa_end = pa + mapped_size;

		if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa))
			goto e_unmap;

		pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n",
			i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1);
	}

	if (ram_pa_max > ram_pa_end) {
		pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
		       ram_pa_max, ram_pa_end);
		goto e_unmap;
	}

	/* Adjust the maximum index based on the found segments */
	rst_max_index = max_index + 1;

	memunmap(rst);

	return true;

e_unmap:
	memunmap(rst);

e_free:
	free_rmp_segment_table();

	return false;
}
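
/*
 * Illustrative segment sizing (a sketch with a hypothetical configuration):
 * with 16GB segments (rmp_segment_shift == 34), a fully mapped segment
 * covers 16GB / 4KB == 0x400000 pages, so at 16 bytes per page its RMP is
 * 0x400000 << 4 == 64MB, and RST entry i covers physical addresses
 * starting at (u64)i << 34.
 */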

static bool __init setup_rmptable(void)
{
	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
		if (!setup_segmented_rmptable())
			return false;
	} else {
		if (!setup_contiguous_rmptable())
			return false;
	}

	rmp_bookkeeping = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB);
	if (!rmp_bookkeeping) {
		pr_err("Failed to map RMP bookkeeping area\n");
		free_rmp_segment_table();

		return false;
	}

	return true;
}

static void clear_hsave_pa(void *arg)
{
	wrmsrq(MSR_VM_HSAVE_PA, 0);
}

void snp_prepare(void)
{
	u64 val;

	/*
	 * Check if SEV-SNP is already enabled; this can happen in the case
	 * of a kexec boot.
	 */
	rdmsrq(MSR_AMD64_SYSCFG, val);
	if (val & MSR_AMD64_SYSCFG_SNP_EN)
		return;

	clear_rmp();

	cpus_read_lock();

	/*
	 * MtrrFixDramModEn is not shared between threads on a core,
	 * therefore it must be set on all CPUs prior to enabling SNP.
	 */
	on_each_cpu(mfd_reconfigure, (void *)1, 1);
	on_each_cpu(snp_enable, NULL, 1);

	/* SNP_INIT requires MSR_VM_HSAVE_PA to be cleared on all CPUs. */
	on_each_cpu(clear_hsave_pa, NULL, 1);

	cpus_read_unlock();
}
EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");

void snp_shutdown(void)
{
	u64 syscfg;

	rdmsrq(MSR_AMD64_SYSCFG, syscfg);
	if (syscfg & MSR_AMD64_SYSCFG_SNP_EN)
		return;

	clear_rmp();
	on_each_cpu(mfd_reconfigure, NULL, 1);
}
EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
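
/*
 * snp_prepare() and snp_shutdown() are exported for the "ccp" module. A
 * plausible sequence (a sketch, not a definitive contract): call
 * snp_prepare() before issuing SNP_INIT_EX to the firmware, since SNP_INIT
 * requires MSR_VM_HSAVE_PA to be cleared and SNP to be enabled on all
 * CPUs, and call snp_shutdown() once the firmware has been shut down and
 * SNP has been disabled, so that the RMP contents and the MFDM setting
 * are reset.
 */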

/*
 * Do the necessary preparations which are verified by the firmware as
 * described in the SNP_INIT_EX firmware command description in the SNP
 * firmware ABI spec.
 */
int __init snp_rmptable_init(void)
{
	if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP)))
		return -ENOSYS;

	if (WARN_ON_ONCE(!amd_iommu_snp_en))
		return -ENOSYS;

	if (!setup_rmptable())
		return -ENOSYS;

	/*
	 * Set crash_kexec_post_notifiers to 'true' to ensure that the SNP
	 * panic notifier is invoked to do SNP IOMMU shutdown before kdump.
	 */
	crash_kexec_post_notifiers = true;

	return 0;
}

static void set_rmp_segment_info(unsigned int segment_shift)
{
	rmp_segment_shift = segment_shift;
	rmp_segment_size  = 1ULL << rmp_segment_shift;
	rmp_segment_mask  = rmp_segment_size - 1;
}

#define RMP_ADDR_MASK GENMASK_ULL(51, 13)

static bool probe_contiguous_rmptable_info(void)
{
	u64 rmp_sz, rmp_base, rmp_end;

	rdmsrq(MSR_AMD64_RMP_BASE, rmp_base);
	rdmsrq(MSR_AMD64_RMP_END, rmp_end);

	if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
		return false;
	}

	if (rmp_base > rmp_end) {
		pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
		return false;
	}

	rmp_sz = rmp_end - rmp_base + 1;

	/* Treat the contiguous RMP table as a single segment */
	rst_max_index = 1;

	set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT);

	probed_rmp_base = rmp_base;
	probed_rmp_size = rmp_sz;

	pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
		rmp_base, rmp_end);

	return true;
}

static bool probe_segmented_rmptable_info(void)
{
	unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max;
	u64 rmp_base, rmp_end;

	rdmsrq(MSR_AMD64_RMP_BASE, rmp_base);
	if (!(rmp_base & RMP_ADDR_MASK)) {
		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
		return false;
	}

	rdmsrq(MSR_AMD64_RMP_END, rmp_end);
	WARN_ONCE(rmp_end & RMP_ADDR_MASK,
		  "Segmented RMP enabled but RMP_END MSR is non-zero\n");

	/* Obtain the min and max supported RMP segment size */
	eax = cpuid_eax(0x80000025);
	segment_shift_min = eax & GENMASK(5, 0);
	segment_shift_max = (eax & GENMASK(11, 6)) >> 6;

	/* Verify the segment size is within the supported limits */
	segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg);
	if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) {
		pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n",
		       segment_shift, segment_shift_min, segment_shift_max);
		return false;
	}

	/* Override the max supported RST index if a hardware limit exists */
	ebx = cpuid_ebx(0x80000025);
	if (ebx & BIT(10))
		rst_max_index = ebx & GENMASK(9, 0);

	set_rmp_segment_info(segment_shift);

	probed_rmp_base = rmp_base;
	probed_rmp_size = 0;

	pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n",
		rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE);

	return true;
}

bool snp_probe_rmptable_info(void)
{
	if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP))
		rdmsrq(MSR_AMD64_RMP_CFG, rmp_cfg);

	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
		return probe_segmented_rmptable_info();
	else
		return probe_contiguous_rmptable_info();
}
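
/*
 * Illustrative CPUID 0x80000025 decode (hypothetical values): if
 * EAX == 0xa22, then segment_shift_min == 34 (16GB) and
 * segment_shift_max == 40 (1TB), so an RMP_CFG segment shift of 36
 * (64GB segments) would pass the bounds check above.
 */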

/*
 * About the array_index_nospec() usage below:
 *
 * This function can get called by exported functions like
 * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among
 * others, and since the @pfn passed in cannot always be trusted,
 * speculation should be stopped as a protective measure.
 */
static struct rmpentry_raw *get_raw_rmpentry(u64 pfn)
{
	u64 paddr, rst_index, segment_index;
	struct rmp_segment_desc *desc;

	if (!rmp_segment_table)
		return ERR_PTR(-ENODEV);

	paddr = pfn << PAGE_SHIFT;

	rst_index = RST_ENTRY_INDEX(paddr);
	if (unlikely(rst_index >= rst_max_index))
		return ERR_PTR(-EFAULT);

	rst_index = array_index_nospec(rst_index, rst_max_index);

	desc = rmp_segment_table[rst_index];
	if (unlikely(!desc))
		return ERR_PTR(-EFAULT);

	segment_index = RMP_ENTRY_INDEX(paddr);
	if (unlikely(segment_index >= desc->max_index))
		return ERR_PTR(-EFAULT);

	segment_index = array_index_nospec(segment_index, desc->max_index);

	return desc->rmp_entry + segment_index;
}

static int get_rmpentry(u64 pfn, struct rmpentry *e)
{
	struct rmpentry_raw *e_raw;

	if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) {
		int ret;

		/* Binutils version 2.44 supports the RMPREAD mnemonic. */
		asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd"
			     : "=a" (ret)
			     : "a" (pfn << PAGE_SHIFT), "c" (e)
			     : "memory", "cc");

		return ret;
	}

	e_raw = get_raw_rmpentry(pfn);
	if (IS_ERR(e_raw))
		return PTR_ERR(e_raw);

	/*
	 * Map the raw RMP table entry onto the RMPREAD output format.
	 * The 2MB region status indicator (hpage_region_status field) is not
	 * calculated, since the overhead could be significant and the field
	 * is not used.
	 */
	memset(e, 0, sizeof(*e));
	e->gpa       = e_raw->gpa << PAGE_SHIFT;
	e->asid      = e_raw->asid;
	e->assigned  = e_raw->assigned;
	e->pagesize  = e_raw->pagesize;
	e->immutable = e_raw->immutable;

	return 0;
}

static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level)
{
	struct rmpentry e_large;
	int ret;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	ret = get_rmpentry(pfn, e);
	if (ret)
		return ret;

	/*
	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
	 * RMP entry or a special large RMP entry that is authoritative for a
	 * whole 2M area.
	 */
	ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large);
	if (ret)
		return ret;

	*level = RMP_TO_PG_LEVEL(e_large.pagesize);

	return 0;
}

int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
{
	struct rmpentry e;
	int ret;

	ret = __snp_lookup_rmpentry(pfn, &e, level);
	if (ret)
		return ret;

	*assigned = !!e.assigned;
	return 0;
}
EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
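
/*
 * Illustrative caller of snp_lookup_rmpentry() (a sketch, hypothetical
 * fault-handling context):
 *
 *	bool assigned;
 *	int level;
 *
 *	if (!snp_lookup_rmpentry(pfn, &assigned, &level) &&
 *	    assigned && level == PG_LEVEL_2M) {
 *		// the whole 2MB region containing @pfn is guest-owned
 *	}
 */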

/*
 * Dump the raw RMP entry for a particular PFN. These bits are documented in the
 * PPR for a particular CPU model and provide useful information about how a
 * particular PFN is being utilized by the kernel/firmware at the time certain
 * unexpected events occur, such as RMP faults.
 */
static void dump_rmpentry(u64 pfn)
{
	struct rmpentry_raw *e_raw;
	u64 pfn_i, pfn_end;
	struct rmpentry e;
	int level, ret;

	ret = __snp_lookup_rmpentry(pfn, &e, &level);
	if (ret) {
		pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n",
		       pfn, ret);
		return;
	}

	if (e.assigned) {
		e_raw = get_raw_rmpentry(pfn);
		if (IS_ERR(e_raw)) {
			pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n",
			       pfn, PTR_ERR(e_raw));
			return;
		}

		pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
			pfn, e_raw->lo, e_raw->hi);
		return;
	}

	/*
	 * If the RMP entry for a particular PFN is not in an assigned state,
	 * then it is sometimes useful to get an idea of whether or not any RMP
	 * entries for other PFNs within the same 2MB region are assigned, since
	 * those too can affect the ability to access a particular PFN in
	 * certain situations, such as when the PFN is being accessed via a 2MB
	 * mapping in the host page table.
	 */
	pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
	pfn_end = pfn_i + PTRS_PER_PMD;

	pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
		pfn, pfn_i, pfn_end);

	while (pfn_i < pfn_end) {
		e_raw = get_raw_rmpentry(pfn_i);
		if (IS_ERR(e_raw)) {
			pr_err("Error %ld reading RMP contents for PFN 0x%llx\n",
			       PTR_ERR(e_raw), pfn_i);
			pfn_i++;
			continue;
		}

		if (e_raw->lo || e_raw->hi)
			pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi);
		pfn_i++;
	}
}

void snp_dump_hva_rmpentry(unsigned long hva)
{
	unsigned long paddr;
	unsigned int level;
	pgd_t *pgd;
	pte_t *pte;

	pgd = __va(read_cr3_pa());
	pgd += pgd_index(hva);
	pte = lookup_address_in_pgd(pgd, hva, &level);

	if (!pte) {
		pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
		return;
	}

	paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
	dump_rmpentry(PHYS_PFN(paddr));
}

/*
 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
 * Validated bit.
 */
int psmash(u64 pfn)
{
	unsigned long paddr = pfn << PAGE_SHIFT;
	int ret;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	if (!pfn_valid(pfn))
		return -EINVAL;

	/* Binutils version 2.36 supports the PSMASH mnemonic. */
	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
		     : "=a" (ret)
		     : "a" (paddr)
		     : "memory", "cc");

	return ret;
}
EXPORT_SYMBOL_GPL(psmash);
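
/*
 * Illustrative use of psmash() (a sketch, hypothetical context): before
 * transitioning a single 4KB page that currently sits under a private 2MB
 * RMP entry, the 2MB entry must first be smashed into 4KB entries:
 *
 *	if (level == PG_LEVEL_2M)
 *		ret = psmash(pfn & PFN_PMD_MASK);
 */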

/*
 * If the kernel uses a 2MB or larger directmap mapping to write to an address,
 * and that mapping contains any 4KB pages that are set to private in the RMP
 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
 * owns the PFNs being transitioned will never attempt such a write, but other
 * kernel tasks writing to other PFNs in the range may trigger these checks
 * inadvertently due to a large directmap mapping that happens to overlap such
 * a PFN.
 *
 * Prevent this by splitting any 2MB+ mappings that might end up containing a
 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
 * PFN/rmp_level passed in.
 *
 * Note that there is no attempt here to scan all the RMP entries for the 2MB
 * physical range, since it would only be worthwhile in determining if a
 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
 * the same shared/private state, thus avoiding the need to split the mapping.
 * But that would mean the entries are currently in a mixed state, and so the
 * mapping would have already been split as a result of prior transitions.
 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
 * currently a mechanism in place to restore 2MB+ mappings, such a check would
 * not provide any usable benefit.
 *
 * More specifics on how these checks are carried out can be found in APM
 * Volume 2, "RMP and VMPL Access Checks".
 */
static int adjust_direct_map(u64 pfn, int rmp_level)
{
	unsigned long vaddr;
	unsigned int level;
	int npages, ret;
	pte_t *pte;

	/*
	 * pfn_to_kaddr() will return a vaddr only within the direct
	 * map range.
	 */
	vaddr = (unsigned long)pfn_to_kaddr(pfn);

	/* Only 4KB/2MB RMP entries are supported by current hardware. */
	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
		return -EINVAL;

	if (!pfn_valid(pfn))
		return -EINVAL;

	if (rmp_level == PG_LEVEL_2M &&
	    (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
		return -EINVAL;

	/*
	 * If an entire 2MB physical range is being transitioned, then there is
	 * no risk of RMP #PFs due to write accesses from overlapping mappings,
	 * since even accesses from 1GB mappings will be treated as 2MB accesses
	 * as far as RMP table checks are concerned.
	 */
	if (rmp_level == PG_LEVEL_2M)
		return 0;

	pte = lookup_address(vaddr, &level);
	if (!pte || pte_none(*pte))
		return 0;

	if (level == PG_LEVEL_4K)
		return 0;

	npages = page_level_size(rmp_level) / PAGE_SIZE;
	ret = set_memory_4k(vaddr, npages);
	if (ret)
		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
			pfn, ret);

	return ret;
}

/*
 * It is expected that these operations are infrequent enough that no mutual
 * exclusion of updaters is needed; thus the overlap error condition below
 * should happen very rarely and would get resolved relatively quickly by
 * the firmware.
 *
 * If not, one could consider introducing a mutex or so here to sync concurrent
 * RMP updates and thus diminish the amount of cases where firmware needs to
 * lock 2M ranges to protect against concurrent updates.
 *
 * The optimal solution would be range locking to avoid locking disjoint
 * regions unnecessarily but there's no support for that yet.
 */
static int rmpupdate(u64 pfn, struct rmp_state *state)
{
	unsigned long paddr = pfn << PAGE_SHIFT;
	int ret, level;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	level = RMP_TO_PG_LEVEL(state->pagesize);

	if (adjust_direct_map(pfn, level))
		return -EFAULT;

	do {
		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
			     : "=a" (ret)
			     : "a" (paddr), "c" ((unsigned long)state)
			     : "memory", "cc");
	} while (ret == RMPUPDATE_FAIL_OVERLAP);

	if (ret) {
		pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
		       pfn, level, ret);
		dump_rmpentry(pfn);
		dump_stack();
		return -EFAULT;
	}

	return 0;
}

/* Transition a page to guest-owned/private state in the RMP table. */
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
{
	struct rmp_state state;

	memset(&state, 0, sizeof(state));
	state.assigned = 1;
	state.asid = asid;
	state.immutable = immutable;
	state.gpa = gpa;
	state.pagesize = PG_LEVEL_TO_RMP(level);

	return rmpupdate(pfn, &state);
}
EXPORT_SYMBOL_GPL(rmp_make_private);

/* Transition a page to hypervisor-owned/shared state in the RMP table. */
int rmp_make_shared(u64 pfn, enum pg_level level)
{
	struct rmp_state state;

	memset(&state, 0, sizeof(state));
	state.pagesize = PG_LEVEL_TO_RMP(level);

	return rmpupdate(pfn, &state);
}
EXPORT_SYMBOL_GPL(rmp_make_shared);
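
/*
 * Illustrative conversion flow (a sketch, hypothetical caller): assigning
 * a 4KB page to a guest and later reclaiming it might look like:
 *
 *	ret = rmp_make_private(pfn, gpa, PG_LEVEL_4K, asid, false);
 *	...
 *	ret = rmp_make_shared(pfn, PG_LEVEL_4K);
 *	if (ret)
 *		__snp_leak_pages(pfn, 1, true);	// unsafe to reuse the page
 */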

void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp)
{
	struct page *page = pfn_to_page(pfn);

	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);

	spin_lock(&snp_leaked_pages_list_lock);
	while (npages--) {

		/*
		 * Reuse the page's buddy list for chaining into the leaked
		 * pages list. This page should not be on a free list currently
		 * and is also unsafe to be added to a free list.
		 */
		if (likely(!PageCompound(page)) ||

			/*
			 * Skip inserting tail pages of a compound page as
			 * page->buddy_list of tail pages is not usable.
			 */
		    (PageHead(page) && compound_nr(page) <= npages))
			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);

		if (dump_rmp)
			dump_rmpentry(pfn);
		snp_nr_leaked_pages++;
		pfn++;
		page++;
	}
	spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(__snp_leak_pages);

void kdump_sev_callback(void)
{
	/*
	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
	 * safely do SNP_SHUTDOWN on the local CPU.
	 */
	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		wbinvd();
}