1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * AMD SVM-SEV Host Support.
4 *
5 * Copyright (C) 2023 Advanced Micro Devices, Inc.
6 *
7 * Author: Ashish Kalra <ashish.kalra@amd.com>
8 *
9 */
10
11 #include <linux/cc_platform.h>
12 #include <linux/printk.h>
13 #include <linux/mm_types.h>
14 #include <linux/set_memory.h>
15 #include <linux/memblock.h>
16 #include <linux/kernel.h>
17 #include <linux/mm.h>
18 #include <linux/cpumask.h>
19 #include <linux/iommu.h>
20 #include <linux/amd-iommu.h>
21
22 #include <asm/sev.h>
23 #include <asm/processor.h>
24 #include <asm/setup.h>
25 #include <asm/svm.h>
26 #include <asm/smp.h>
27 #include <asm/cpu.h>
28 #include <asm/apic.h>
29 #include <asm/cpuid.h>
30 #include <asm/cmdline.h>
31 #include <asm/iommu.h>
32
33 /*
34 * The RMP entry format is not architectural. The format is defined in PPR
35 * Family 19h Model 01h, Rev B1 processor.
36 */
37 struct rmpentry {
38 union {
39 struct {
40 u64 assigned : 1,
41 pagesize : 1,
42 immutable : 1,
43 rsvd1 : 9,
44 gpa : 39,
45 asid : 10,
46 vmsa : 1,
47 validated : 1,
48 rsvd2 : 1;
49 };
50 u64 lo;
51 };
52 u64 hi;
53 } __packed;
54
55 /*
56 * The first 16KB from the RMP_BASE is used by the processor for the
57 * bookkeeping, the range needs to be added during the RMP entry lookup.
58 */
59 #define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000
60
61 /* Mask to apply to a PFN to get the first PFN of a 2MB page */
62 #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
63
64 static u64 probed_rmp_base, probed_rmp_size;
65 static struct rmpentry *rmptable __ro_after_init;
66 static u64 rmptable_max_pfn __ro_after_init;
67
68 static LIST_HEAD(snp_leaked_pages_list);
69 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
70
71 static unsigned long snp_nr_leaked_pages;
72
73 #undef pr_fmt
74 #define pr_fmt(fmt) "SEV-SNP: " fmt
75
__mfd_enable(unsigned int cpu)76 static int __mfd_enable(unsigned int cpu)
77 {
78 u64 val;
79
80 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
81 return 0;
82
83 rdmsrl(MSR_AMD64_SYSCFG, val);
84
85 val |= MSR_AMD64_SYSCFG_MFDM;
86
87 wrmsrl(MSR_AMD64_SYSCFG, val);
88
89 return 0;
90 }
91
mfd_enable(void * arg)92 static __init void mfd_enable(void *arg)
93 {
94 __mfd_enable(smp_processor_id());
95 }
96
__snp_enable(unsigned int cpu)97 static int __snp_enable(unsigned int cpu)
98 {
99 u64 val;
100
101 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
102 return 0;
103
104 rdmsrl(MSR_AMD64_SYSCFG, val);
105
106 val |= MSR_AMD64_SYSCFG_SNP_EN;
107 val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
108
109 wrmsrl(MSR_AMD64_SYSCFG, val);
110
111 return 0;
112 }
113
snp_enable(void * arg)114 static __init void snp_enable(void *arg)
115 {
116 __snp_enable(smp_processor_id());
117 }
118
119 #define RMP_ADDR_MASK GENMASK_ULL(51, 13)
120
snp_probe_rmptable_info(void)121 bool snp_probe_rmptable_info(void)
122 {
123 u64 rmp_sz, rmp_base, rmp_end;
124
125 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
126 rdmsrl(MSR_AMD64_RMP_END, rmp_end);
127
128 if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
129 pr_err("Memory for the RMP table has not been reserved by BIOS\n");
130 return false;
131 }
132
133 if (rmp_base > rmp_end) {
134 pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
135 return false;
136 }
137
138 rmp_sz = rmp_end - rmp_base + 1;
139
140 probed_rmp_base = rmp_base;
141 probed_rmp_size = rmp_sz;
142
143 pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
144 rmp_base, rmp_end);
145
146 return true;
147 }
148
__snp_fixup_e820_tables(u64 pa)149 static void __init __snp_fixup_e820_tables(u64 pa)
150 {
151 if (IS_ALIGNED(pa, PMD_SIZE))
152 return;
153
154 /*
155 * Handle cases where the RMP table placement by the BIOS is not
156 * 2M aligned and the kexec kernel could try to allocate
157 * from within that chunk which then causes a fatal RMP fault.
158 *
159 * The e820_table needs to be updated as it is converted to
160 * kernel memory resources and used by KEXEC_FILE_LOAD syscall
161 * to load kexec segments.
162 *
163 * The e820_table_firmware needs to be updated as it is exposed
164 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
165 * segments.
166 *
167 * The e820_table_kexec needs to be updated as it passed to
168 * the kexec-ed kernel.
169 */
170 pa = ALIGN_DOWN(pa, PMD_SIZE);
171 if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
172 pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
173 e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
174 e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
175 e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
176 if (!memblock_is_region_reserved(pa, PMD_SIZE))
177 memblock_reserve(pa, PMD_SIZE);
178 }
179 }
180
snp_fixup_e820_tables(void)181 void __init snp_fixup_e820_tables(void)
182 {
183 __snp_fixup_e820_tables(probed_rmp_base);
184 __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
185 }
186
187 /*
188 * Do the necessary preparations which are verified by the firmware as
189 * described in the SNP_INIT_EX firmware command description in the SNP
190 * firmware ABI spec.
191 */
snp_rmptable_init(void)192 static int __init snp_rmptable_init(void)
193 {
194 u64 max_rmp_pfn, calc_rmp_sz, rmptable_size, rmp_end, val;
195 void *rmptable_start;
196
197 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
198 return 0;
199
200 if (!amd_iommu_snp_en)
201 goto nosnp;
202
203 if (!probed_rmp_size)
204 goto nosnp;
205
206 rmp_end = probed_rmp_base + probed_rmp_size - 1;
207
208 /*
209 * Calculate the amount the memory that must be reserved by the BIOS to
210 * address the whole RAM, including the bookkeeping area. The RMP itself
211 * must also be covered.
212 */
213 max_rmp_pfn = max_pfn;
214 if (PFN_UP(rmp_end) > max_pfn)
215 max_rmp_pfn = PFN_UP(rmp_end);
216
217 calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
218 if (calc_rmp_sz > probed_rmp_size) {
219 pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
220 calc_rmp_sz, probed_rmp_size);
221 goto nosnp;
222 }
223
224 rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
225 if (!rmptable_start) {
226 pr_err("Failed to map RMP table\n");
227 goto nosnp;
228 }
229
230 /*
231 * Check if SEV-SNP is already enabled, this can happen in case of
232 * kexec boot.
233 */
234 rdmsrl(MSR_AMD64_SYSCFG, val);
235 if (val & MSR_AMD64_SYSCFG_SNP_EN)
236 goto skip_enable;
237
238 memset(rmptable_start, 0, probed_rmp_size);
239
240 /* Flush the caches to ensure that data is written before SNP is enabled. */
241 wbinvd_on_all_cpus();
242
243 /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
244 on_each_cpu(mfd_enable, NULL, 1);
245
246 on_each_cpu(snp_enable, NULL, 1);
247
248 skip_enable:
249 rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
250 rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
251
252 rmptable = (struct rmpentry *)rmptable_start;
253 rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
254
255 cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
256
257 /*
258 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
259 * notifier is invoked to do SNP IOMMU shutdown before kdump.
260 */
261 crash_kexec_post_notifiers = true;
262
263 return 0;
264
265 nosnp:
266 cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
267 return -ENOSYS;
268 }
269
270 /*
271 * This must be called after the IOMMU has been initialized.
272 */
273 device_initcall(snp_rmptable_init);
274
/*
 * Return a pointer to the RMP entry of @pfn, or ERR_PTR(-EFAULT) (with a
 * one-time warning) if @pfn lies beyond the last PFN covered by the table.
 */
static struct rmpentry *get_rmpentry(u64 pfn)
{
	if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
		return ERR_PTR(-EFAULT);

	return rmptable + pfn;
}
282
/*
 * Look up the RMP entry of @pfn and report the page level that governs it.
 *
 * @pfn:   page frame number to look up.
 * @level: set, on success only, from the pagesize bit of the entry of the
 *         first PFN of the 2MB region containing @pfn.
 *
 * Return: the entry of @pfn itself, ERR_PTR(-ENODEV) when the platform is not
 * an SEV-SNP host, or the error returned by get_rmpentry().
 */
static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
{
	struct rmpentry *e, *region_head;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return ERR_PTR(-ENODEV);

	e = get_rmpentry(pfn);
	if (IS_ERR(e))
		return e;

	/*
	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
	 * RMP entry or a special large RMP entry that is authoritative for a
	 * whole 2M area.
	 */
	region_head = get_rmpentry(pfn & PFN_PMD_MASK);
	if (IS_ERR(region_head))
		return region_head;

	*level = RMP_TO_PG_LEVEL(region_head->pagesize);

	return e;
}
307
/*
 * Report whether @pfn is assigned in the RMP and at which page level.
 *
 * @pfn:      page frame number to look up.
 * @assigned: set, on success only, to the entry's assigned bit.
 * @level:    filled in by __snp_lookup_rmpentry() on success.
 *
 * Return: 0 on success, otherwise the negative errno of the failed lookup.
 */
int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
{
	struct rmpentry *entry = __snp_lookup_rmpentry(pfn, level);

	if (IS_ERR(entry))
		return PTR_ERR(entry);

	*assigned = entry->assigned ? true : false;

	return 0;
}
EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
320
321 /*
322 * Dump the raw RMP entry for a particular PFN. These bits are documented in the
323 * PPR for a particular CPU model and provide useful information about how a
324 * particular PFN is being utilized by the kernel/firmware at the time certain
325 * unexpected events occur, such as RMP faults.
326 */
dump_rmpentry(u64 pfn)327 static void dump_rmpentry(u64 pfn)
328 {
329 u64 pfn_i, pfn_end;
330 struct rmpentry *e;
331 int level;
332
333 e = __snp_lookup_rmpentry(pfn, &level);
334 if (IS_ERR(e)) {
335 pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
336 pfn, PTR_ERR(e));
337 return;
338 }
339
340 if (e->assigned) {
341 pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
342 pfn, e->lo, e->hi);
343 return;
344 }
345
346 /*
347 * If the RMP entry for a particular PFN is not in an assigned state,
348 * then it is sometimes useful to get an idea of whether or not any RMP
349 * entries for other PFNs within the same 2MB region are assigned, since
350 * those too can affect the ability to access a particular PFN in
351 * certain situations, such as when the PFN is being accessed via a 2MB
352 * mapping in the host page table.
353 */
354 pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
355 pfn_end = pfn_i + PTRS_PER_PMD;
356
357 pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
358 pfn, pfn_i, pfn_end);
359
360 while (pfn_i < pfn_end) {
361 e = __snp_lookup_rmpentry(pfn_i, &level);
362 if (IS_ERR(e)) {
363 pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
364 PTR_ERR(e), pfn_i);
365 pfn_i++;
366 continue;
367 }
368
369 if (e->lo || e->hi)
370 pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
371 pfn_i++;
372 }
373 }
374
snp_dump_hva_rmpentry(unsigned long hva)375 void snp_dump_hva_rmpentry(unsigned long hva)
376 {
377 unsigned long paddr;
378 unsigned int level;
379 pgd_t *pgd;
380 pte_t *pte;
381
382 pgd = __va(read_cr3_pa());
383 pgd += pgd_index(hva);
384 pte = lookup_address_in_pgd(pgd, hva, &level);
385
386 if (!pte) {
387 pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
388 return;
389 }
390
391 paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
392 dump_rmpentry(PHYS_PFN(paddr));
393 }
394
395 /*
396 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
397 * Validated bit.
398 */
psmash(u64 pfn)399 int psmash(u64 pfn)
400 {
401 unsigned long paddr = pfn << PAGE_SHIFT;
402 int ret;
403
404 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
405 return -ENODEV;
406
407 if (!pfn_valid(pfn))
408 return -EINVAL;
409
410 /* Binutils version 2.36 supports the PSMASH mnemonic. */
411 asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
412 : "=a" (ret)
413 : "a" (paddr)
414 : "memory", "cc");
415
416 return ret;
417 }
418 EXPORT_SYMBOL_GPL(psmash);
419
420 /*
421 * If the kernel uses a 2MB or larger directmap mapping to write to an address,
422 * and that mapping contains any 4KB pages that are set to private in the RMP
423 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
424 * owns the PFNs being transitioned will never attempt such a write, but other
425 * kernel tasks writing to other PFNs in the range may trigger these checks
426 * inadvertently due a large directmap mapping that happens to overlap such a
427 * PFN.
428 *
429 * Prevent this by splitting any 2MB+ mappings that might end up containing a
430 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
431 * PFN/rmp_level passed in.
432 *
433 * Note that there is no attempt here to scan all the RMP entries for the 2MB
434 * physical range, since it would only be worthwhile in determining if a
435 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
436 * the same shared/private state, thus avoiding the need to split the mapping.
437 * But that would mean the entries are currently in a mixed state, and so the
438 * mapping would have already been split as a result of prior transitions.
439 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
440 * currently a mechanism in place to restore 2MB+ mappings, such a check would
441 * not provide any usable benefit.
442 *
443 * More specifics on how these checks are carried out can be found in APM
444 * Volume 2, "RMP and VMPL Access Checks".
445 */
adjust_direct_map(u64 pfn,int rmp_level)446 static int adjust_direct_map(u64 pfn, int rmp_level)
447 {
448 unsigned long vaddr;
449 unsigned int level;
450 int npages, ret;
451 pte_t *pte;
452
453 /*
454 * pfn_to_kaddr() will return a vaddr only within the direct
455 * map range.
456 */
457 vaddr = (unsigned long)pfn_to_kaddr(pfn);
458
459 /* Only 4KB/2MB RMP entries are supported by current hardware. */
460 if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
461 return -EINVAL;
462
463 if (!pfn_valid(pfn))
464 return -EINVAL;
465
466 if (rmp_level == PG_LEVEL_2M &&
467 (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
468 return -EINVAL;
469
470 /*
471 * If an entire 2MB physical range is being transitioned, then there is
472 * no risk of RMP #PFs due to write accesses from overlapping mappings,
473 * since even accesses from 1GB mappings will be treated as 2MB accesses
474 * as far as RMP table checks are concerned.
475 */
476 if (rmp_level == PG_LEVEL_2M)
477 return 0;
478
479 pte = lookup_address(vaddr, &level);
480 if (!pte || pte_none(*pte))
481 return 0;
482
483 if (level == PG_LEVEL_4K)
484 return 0;
485
486 npages = page_level_size(rmp_level) / PAGE_SIZE;
487 ret = set_memory_4k(vaddr, npages);
488 if (ret)
489 pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
490 pfn, ret);
491
492 return ret;
493 }
494
495 /*
496 * It is expected that those operations are seldom enough so that no mutual
497 * exclusion of updaters is needed and thus the overlap error condition below
498 * should happen very rarely and would get resolved relatively quickly by
499 * the firmware.
500 *
501 * If not, one could consider introducing a mutex or so here to sync concurrent
502 * RMP updates and thus diminish the amount of cases where firmware needs to
503 * lock 2M ranges to protect against concurrent updates.
504 *
505 * The optimal solution would be range locking to avoid locking disjoint
506 * regions unnecessarily but there's no support for that yet.
507 */
rmpupdate(u64 pfn,struct rmp_state * state)508 static int rmpupdate(u64 pfn, struct rmp_state *state)
509 {
510 unsigned long paddr = pfn << PAGE_SHIFT;
511 int ret, level;
512
513 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
514 return -ENODEV;
515
516 level = RMP_TO_PG_LEVEL(state->pagesize);
517
518 if (adjust_direct_map(pfn, level))
519 return -EFAULT;
520
521 do {
522 /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
523 asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
524 : "=a" (ret)
525 : "a" (paddr), "c" ((unsigned long)state)
526 : "memory", "cc");
527 } while (ret == RMPUPDATE_FAIL_OVERLAP);
528
529 if (ret) {
530 pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
531 pfn, level, ret);
532 dump_rmpentry(pfn);
533 dump_stack();
534 return -EFAULT;
535 }
536
537 return 0;
538 }
539
540 /* Transition a page to guest-owned/private state in the RMP table. */
rmp_make_private(u64 pfn,u64 gpa,enum pg_level level,u32 asid,bool immutable)541 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
542 {
543 struct rmp_state state;
544
545 memset(&state, 0, sizeof(state));
546 state.assigned = 1;
547 state.asid = asid;
548 state.immutable = immutable;
549 state.gpa = gpa;
550 state.pagesize = PG_LEVEL_TO_RMP(level);
551
552 return rmpupdate(pfn, &state);
553 }
554 EXPORT_SYMBOL_GPL(rmp_make_private);
555
556 /* Transition a page to hypervisor-owned/shared state in the RMP table. */
rmp_make_shared(u64 pfn,enum pg_level level)557 int rmp_make_shared(u64 pfn, enum pg_level level)
558 {
559 struct rmp_state state;
560
561 memset(&state, 0, sizeof(state));
562 state.pagesize = PG_LEVEL_TO_RMP(level);
563
564 return rmpupdate(pfn, &state);
565 }
566 EXPORT_SYMBOL_GPL(rmp_make_shared);
567
/*
 * Record @npages pages starting at @pfn as leaked: chain them into
 * snp_leaked_pages_list, dump their RMP entries and account for them in
 * snp_nr_leaked_pages.
 *
 * @pfn:    first page frame number of the leaked range.
 * @npages: number of consecutive pages to leak.
 */
void snp_leak_pages(u64 pfn, unsigned int npages)
{
	struct page *page = pfn_to_page(pfn);
	unsigned int remaining;

	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);

	spin_lock(&snp_leaked_pages_list_lock);

	/* @remaining counts the pages left to process, the current one included. */
	for (remaining = npages; remaining; remaining--, pfn++, page++) {
		/*
		 * Reuse the page's buddy list for chaining into the leaked
		 * pages list. This page should not be on a free list currently
		 * and is also unsafe to be added to a free list.
		 *
		 * Skip inserting tail pages of compound page as
		 * page->buddy_list of tail pages is not usable. A head page
		 * is inserted only when the whole compound page lies within
		 * the leaked range, i.e. when it spans no more pages than are
		 * left including the head itself. Comparing against the count
		 * that excludes the head would wrongly drop a compound page
		 * that ends exactly at the end of the range.
		 */
		if (likely(!PageCompound(page)) ||
		    (PageHead(page) && compound_nr(page) <= remaining))
			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);

		dump_rmpentry(pfn);
		snp_nr_leaked_pages++;
	}

	spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(snp_leak_pages);
599
kdump_sev_callback(void)600 void kdump_sev_callback(void)
601 {
602 /*
603 * Do wbinvd() on remote CPUs when SNP is enabled in order to
604 * safely do SNP_SHUTDOWN on the local CPU.
605 */
606 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
607 wbinvd();
608 }
609