1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * AMD SVM-SEV Host Support. 4 * 5 * Copyright (C) 2023 Advanced Micro Devices, Inc. 6 * 7 * Author: Ashish Kalra <ashish.kalra@amd.com> 8 * 9 */ 10 11 #include <linux/cc_platform.h> 12 #include <linux/printk.h> 13 #include <linux/mm_types.h> 14 #include <linux/set_memory.h> 15 #include <linux/memblock.h> 16 #include <linux/kernel.h> 17 #include <linux/mm.h> 18 #include <linux/cpumask.h> 19 #include <linux/iommu.h> 20 #include <linux/amd-iommu.h> 21 22 #include <asm/sev.h> 23 #include <asm/processor.h> 24 #include <asm/setup.h> 25 #include <asm/svm.h> 26 #include <asm/smp.h> 27 #include <asm/cpu.h> 28 #include <asm/apic.h> 29 #include <asm/cpuid.h> 30 #include <asm/cmdline.h> 31 #include <asm/iommu.h> 32 33 /* 34 * The RMP entry format is not architectural. The format is defined in PPR 35 * Family 19h Model 01h, Rev B1 processor. 36 */ 37 struct rmpentry { 38 union { 39 struct { 40 u64 assigned : 1, 41 pagesize : 1, 42 immutable : 1, 43 rsvd1 : 9, 44 gpa : 39, 45 asid : 10, 46 vmsa : 1, 47 validated : 1, 48 rsvd2 : 1; 49 }; 50 u64 lo; 51 }; 52 u64 hi; 53 } __packed; 54 55 /* 56 * The first 16KB from the RMP_BASE is used by the processor for the 57 * bookkeeping, the range needs to be added during the RMP entry lookup. 58 */ 59 #define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 60 61 /* Mask to apply to a PFN to get the first PFN of a 2MB page */ 62 #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) 63 64 static u64 probed_rmp_base, probed_rmp_size; 65 static struct rmpentry *rmptable __ro_after_init; 66 static u64 rmptable_max_pfn __ro_after_init; 67 68 static LIST_HEAD(snp_leaked_pages_list); 69 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); 70 71 static unsigned long snp_nr_leaked_pages; 72 73 #undef pr_fmt 74 #define pr_fmt(fmt) "SEV-SNP: " fmt 75 76 static int __mfd_enable(unsigned int cpu) 77 { 78 u64 val; 79 80 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 81 return 0; 82 83 rdmsrl(MSR_AMD64_SYSCFG, val); 84 85 val |= MSR_AMD64_SYSCFG_MFDM; 86 87 wrmsrl(MSR_AMD64_SYSCFG, val); 88 89 return 0; 90 } 91 92 static __init void mfd_enable(void *arg) 93 { 94 __mfd_enable(smp_processor_id()); 95 } 96 97 static int __snp_enable(unsigned int cpu) 98 { 99 u64 val; 100 101 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 102 return 0; 103 104 rdmsrl(MSR_AMD64_SYSCFG, val); 105 106 val |= MSR_AMD64_SYSCFG_SNP_EN; 107 val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; 108 109 wrmsrl(MSR_AMD64_SYSCFG, val); 110 111 return 0; 112 } 113 114 static __init void snp_enable(void *arg) 115 { 116 __snp_enable(smp_processor_id()); 117 } 118 119 #define RMP_ADDR_MASK GENMASK_ULL(51, 13) 120 121 bool snp_probe_rmptable_info(void) 122 { 123 u64 rmp_sz, rmp_base, rmp_end; 124 125 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); 126 rdmsrl(MSR_AMD64_RMP_END, rmp_end); 127 128 if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { 129 pr_err("Memory for the RMP table has not been reserved by BIOS\n"); 130 return false; 131 } 132 133 if (rmp_base > rmp_end) { 134 pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); 135 return false; 136 } 137 138 rmp_sz = rmp_end - rmp_base + 1; 139 140 probed_rmp_base = rmp_base; 141 probed_rmp_size = rmp_sz; 142 143 pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", 144 rmp_base, rmp_end); 145 146 return true; 147 } 148 149 static void __init __snp_fixup_e820_tables(u64 pa) 150 { 151 if (IS_ALIGNED(pa, PMD_SIZE)) 152 return; 153 154 /* 155 * Handle cases where the RMP table placement by the BIOS is not 156 * 2M aligned and the kexec kernel could try to allocate 157 * from within that chunk which then causes a fatal RMP fault. 158 * 159 * The e820_table needs to be updated as it is converted to 160 * kernel memory resources and used by KEXEC_FILE_LOAD syscall 161 * to load kexec segments. 162 * 163 * The e820_table_firmware needs to be updated as it is exposed 164 * to sysfs and used by the KEXEC_LOAD syscall to load kexec 165 * segments. 166 * 167 * The e820_table_kexec needs to be updated as it passed to 168 * the kexec-ed kernel. 169 */ 170 pa = ALIGN_DOWN(pa, PMD_SIZE); 171 if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { 172 pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); 173 e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 174 e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 175 e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 176 if (!memblock_is_region_reserved(pa, PMD_SIZE)) 177 memblock_reserve(pa, PMD_SIZE); 178 } 179 } 180 181 void __init snp_fixup_e820_tables(void) 182 { 183 __snp_fixup_e820_tables(probed_rmp_base); 184 __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); 185 } 186 187 /* 188 * Do the necessary preparations which are verified by the firmware as 189 * described in the SNP_INIT_EX firmware command description in the SNP 190 * firmware ABI spec. 191 */ 192 static int __init snp_rmptable_init(void) 193 { 194 u64 max_rmp_pfn, calc_rmp_sz, rmptable_size, rmp_end, val; 195 void *rmptable_start; 196 197 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 198 return 0; 199 200 if (!amd_iommu_snp_en) 201 goto nosnp; 202 203 if (!probed_rmp_size) 204 goto nosnp; 205 206 rmp_end = probed_rmp_base + probed_rmp_size - 1; 207 208 /* 209 * Calculate the amount the memory that must be reserved by the BIOS to 210 * address the whole RAM, including the bookkeeping area. The RMP itself 211 * must also be covered. 212 */ 213 max_rmp_pfn = max_pfn; 214 if (PFN_UP(rmp_end) > max_pfn) 215 max_rmp_pfn = PFN_UP(rmp_end); 216 217 calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; 218 if (calc_rmp_sz > probed_rmp_size) { 219 pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", 220 calc_rmp_sz, probed_rmp_size); 221 goto nosnp; 222 } 223 224 rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB); 225 if (!rmptable_start) { 226 pr_err("Failed to map RMP table\n"); 227 goto nosnp; 228 } 229 230 /* 231 * Check if SEV-SNP is already enabled, this can happen in case of 232 * kexec boot. 233 */ 234 rdmsrl(MSR_AMD64_SYSCFG, val); 235 if (val & MSR_AMD64_SYSCFG_SNP_EN) 236 goto skip_enable; 237 238 memset(rmptable_start, 0, probed_rmp_size); 239 240 /* Flush the caches to ensure that data is written before SNP is enabled. */ 241 wbinvd_on_all_cpus(); 242 243 /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */ 244 on_each_cpu(mfd_enable, NULL, 1); 245 246 on_each_cpu(snp_enable, NULL, 1); 247 248 skip_enable: 249 rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ; 250 rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; 251 252 rmptable = (struct rmpentry *)rmptable_start; 253 rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1; 254 255 cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); 256 257 /* 258 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic 259 * notifier is invoked to do SNP IOMMU shutdown before kdump. 260 */ 261 crash_kexec_post_notifiers = true; 262 263 return 0; 264 265 nosnp: 266 cc_platform_clear(CC_ATTR_HOST_SEV_SNP); 267 return -ENOSYS; 268 } 269 270 /* 271 * This must be called after the IOMMU has been initialized. 272 */ 273 device_initcall(snp_rmptable_init); 274 275 static struct rmpentry *get_rmpentry(u64 pfn) 276 { 277 if (WARN_ON_ONCE(pfn > rmptable_max_pfn)) 278 return ERR_PTR(-EFAULT); 279 280 return &rmptable[pfn]; 281 } 282 283 static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) 284 { 285 struct rmpentry *large_entry, *entry; 286 287 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 288 return ERR_PTR(-ENODEV); 289 290 entry = get_rmpentry(pfn); 291 if (IS_ERR(entry)) 292 return entry; 293 294 /* 295 * Find the authoritative RMP entry for a PFN. This can be either a 4K 296 * RMP entry or a special large RMP entry that is authoritative for a 297 * whole 2M area. 298 */ 299 large_entry = get_rmpentry(pfn & PFN_PMD_MASK); 300 if (IS_ERR(large_entry)) 301 return large_entry; 302 303 *level = RMP_TO_PG_LEVEL(large_entry->pagesize); 304 305 return entry; 306 } 307 308 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) 309 { 310 struct rmpentry *e; 311 312 e = __snp_lookup_rmpentry(pfn, level); 313 if (IS_ERR(e)) 314 return PTR_ERR(e); 315 316 *assigned = !!e->assigned; 317 return 0; 318 } 319 EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); 320 321 /* 322 * Dump the raw RMP entry for a particular PFN. These bits are documented in the 323 * PPR for a particular CPU model and provide useful information about how a 324 * particular PFN is being utilized by the kernel/firmware at the time certain 325 * unexpected events occur, such as RMP faults. 326 */ 327 static void dump_rmpentry(u64 pfn) 328 { 329 u64 pfn_i, pfn_end; 330 struct rmpentry *e; 331 int level; 332 333 e = __snp_lookup_rmpentry(pfn, &level); 334 if (IS_ERR(e)) { 335 pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n", 336 pfn, PTR_ERR(e)); 337 return; 338 } 339 340 if (e->assigned) { 341 pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n", 342 pfn, e->lo, e->hi); 343 return; 344 } 345 346 /* 347 * If the RMP entry for a particular PFN is not in an assigned state, 348 * then it is sometimes useful to get an idea of whether or not any RMP 349 * entries for other PFNs within the same 2MB region are assigned, since 350 * those too can affect the ability to access a particular PFN in 351 * certain situations, such as when the PFN is being accessed via a 2MB 352 * mapping in the host page table. 353 */ 354 pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD); 355 pfn_end = pfn_i + PTRS_PER_PMD; 356 357 pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n", 358 pfn, pfn_i, pfn_end); 359 360 while (pfn_i < pfn_end) { 361 e = __snp_lookup_rmpentry(pfn_i, &level); 362 if (IS_ERR(e)) { 363 pr_err("Error %ld reading RMP entry for PFN 0x%llx\n", 364 PTR_ERR(e), pfn_i); 365 pfn_i++; 366 continue; 367 } 368 369 if (e->lo || e->hi) 370 pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi); 371 pfn_i++; 372 } 373 } 374 375 void snp_dump_hva_rmpentry(unsigned long hva) 376 { 377 unsigned long paddr; 378 unsigned int level; 379 pgd_t *pgd; 380 pte_t *pte; 381 382 pgd = __va(read_cr3_pa()); 383 pgd += pgd_index(hva); 384 pte = lookup_address_in_pgd(pgd, hva, &level); 385 386 if (!pte) { 387 pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva); 388 return; 389 } 390 391 paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level)); 392 dump_rmpentry(PHYS_PFN(paddr)); 393 } 394 395 /* 396 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the 397 * Validated bit. 398 */ 399 int psmash(u64 pfn) 400 { 401 unsigned long paddr = pfn << PAGE_SHIFT; 402 int ret; 403 404 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 405 return -ENODEV; 406 407 if (!pfn_valid(pfn)) 408 return -EINVAL; 409 410 /* Binutils version 2.36 supports the PSMASH mnemonic. */ 411 asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" 412 : "=a" (ret) 413 : "a" (paddr) 414 : "memory", "cc"); 415 416 return ret; 417 } 418 EXPORT_SYMBOL_GPL(psmash); 419 420 /* 421 * If the kernel uses a 2MB or larger directmap mapping to write to an address, 422 * and that mapping contains any 4KB pages that are set to private in the RMP 423 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that 424 * owns the PFNs being transitioned will never attempt such a write, but other 425 * kernel tasks writing to other PFNs in the range may trigger these checks 426 * inadvertently due a large directmap mapping that happens to overlap such a 427 * PFN. 428 * 429 * Prevent this by splitting any 2MB+ mappings that might end up containing a 430 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the 431 * PFN/rmp_level passed in. 432 * 433 * Note that there is no attempt here to scan all the RMP entries for the 2MB 434 * physical range, since it would only be worthwhile in determining if a 435 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of 436 * the same shared/private state, thus avoiding the need to split the mapping. 437 * But that would mean the entries are currently in a mixed state, and so the 438 * mapping would have already been split as a result of prior transitions. 439 * And since the 4K split is only done if the mapping is 2MB+, and there isn't 440 * currently a mechanism in place to restore 2MB+ mappings, such a check would 441 * not provide any usable benefit. 442 * 443 * More specifics on how these checks are carried out can be found in APM 444 * Volume 2, "RMP and VMPL Access Checks". 445 */ 446 static int adjust_direct_map(u64 pfn, int rmp_level) 447 { 448 unsigned long vaddr; 449 unsigned int level; 450 int npages, ret; 451 pte_t *pte; 452 453 /* 454 * pfn_to_kaddr() will return a vaddr only within the direct 455 * map range. 456 */ 457 vaddr = (unsigned long)pfn_to_kaddr(pfn); 458 459 /* Only 4KB/2MB RMP entries are supported by current hardware. */ 460 if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M)) 461 return -EINVAL; 462 463 if (!pfn_valid(pfn)) 464 return -EINVAL; 465 466 if (rmp_level == PG_LEVEL_2M && 467 (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1))) 468 return -EINVAL; 469 470 /* 471 * If an entire 2MB physical range is being transitioned, then there is 472 * no risk of RMP #PFs due to write accesses from overlapping mappings, 473 * since even accesses from 1GB mappings will be treated as 2MB accesses 474 * as far as RMP table checks are concerned. 475 */ 476 if (rmp_level == PG_LEVEL_2M) 477 return 0; 478 479 pte = lookup_address(vaddr, &level); 480 if (!pte || pte_none(*pte)) 481 return 0; 482 483 if (level == PG_LEVEL_4K) 484 return 0; 485 486 npages = page_level_size(rmp_level) / PAGE_SIZE; 487 ret = set_memory_4k(vaddr, npages); 488 if (ret) 489 pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n", 490 pfn, ret); 491 492 return ret; 493 } 494 495 /* 496 * It is expected that those operations are seldom enough so that no mutual 497 * exclusion of updaters is needed and thus the overlap error condition below 498 * should happen very rarely and would get resolved relatively quickly by 499 * the firmware. 500 * 501 * If not, one could consider introducing a mutex or so here to sync concurrent 502 * RMP updates and thus diminish the amount of cases where firmware needs to 503 * lock 2M ranges to protect against concurrent updates. 504 * 505 * The optimal solution would be range locking to avoid locking disjoint 506 * regions unnecessarily but there's no support for that yet. 507 */ 508 static int rmpupdate(u64 pfn, struct rmp_state *state) 509 { 510 unsigned long paddr = pfn << PAGE_SHIFT; 511 int ret, level; 512 513 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 514 return -ENODEV; 515 516 level = RMP_TO_PG_LEVEL(state->pagesize); 517 518 if (adjust_direct_map(pfn, level)) 519 return -EFAULT; 520 521 do { 522 /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ 523 asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" 524 : "=a" (ret) 525 : "a" (paddr), "c" ((unsigned long)state) 526 : "memory", "cc"); 527 } while (ret == RMPUPDATE_FAIL_OVERLAP); 528 529 if (ret) { 530 pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n", 531 pfn, level, ret); 532 dump_rmpentry(pfn); 533 dump_stack(); 534 return -EFAULT; 535 } 536 537 return 0; 538 } 539 540 /* Transition a page to guest-owned/private state in the RMP table. */ 541 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable) 542 { 543 struct rmp_state state; 544 545 memset(&state, 0, sizeof(state)); 546 state.assigned = 1; 547 state.asid = asid; 548 state.immutable = immutable; 549 state.gpa = gpa; 550 state.pagesize = PG_LEVEL_TO_RMP(level); 551 552 return rmpupdate(pfn, &state); 553 } 554 EXPORT_SYMBOL_GPL(rmp_make_private); 555 556 /* Transition a page to hypervisor-owned/shared state in the RMP table. */ 557 int rmp_make_shared(u64 pfn, enum pg_level level) 558 { 559 struct rmp_state state; 560 561 memset(&state, 0, sizeof(state)); 562 state.pagesize = PG_LEVEL_TO_RMP(level); 563 564 return rmpupdate(pfn, &state); 565 } 566 EXPORT_SYMBOL_GPL(rmp_make_shared); 567 568 void snp_leak_pages(u64 pfn, unsigned int npages) 569 { 570 struct page *page = pfn_to_page(pfn); 571 572 pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages); 573 574 spin_lock(&snp_leaked_pages_list_lock); 575 while (npages--) { 576 577 /* 578 * Reuse the page's buddy list for chaining into the leaked 579 * pages list. This page should not be on a free list currently 580 * and is also unsafe to be added to a free list. 581 */ 582 if (likely(!PageCompound(page)) || 583 584 /* 585 * Skip inserting tail pages of compound page as 586 * page->buddy_list of tail pages is not usable. 587 */ 588 (PageHead(page) && compound_nr(page) <= npages)) 589 list_add_tail(&page->buddy_list, &snp_leaked_pages_list); 590 591 dump_rmpentry(pfn); 592 snp_nr_leaked_pages++; 593 pfn++; 594 page++; 595 } 596 spin_unlock(&snp_leaked_pages_list_lock); 597 } 598 EXPORT_SYMBOL_GPL(snp_leak_pages); 599 600 void kdump_sev_callback(void) 601 { 602 /* 603 * Do wbinvd() on remote CPUs when SNP is enabled in order to 604 * safely do SNP_SHUTDOWN on the local CPU. 605 */ 606 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 607 wbinvd(); 608 } 609