xref: /linux/arch/x86/virt/svm/sev.c (revision 36110669ddf832e6c9ceba4dd203749d5be31d31)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * AMD SVM-SEV Host Support.
4  *
5  * Copyright (C) 2023 Advanced Micro Devices, Inc.
6  *
7  * Author: Ashish Kalra <ashish.kalra@amd.com>
8  *
9  */
10 
11 #include <linux/cc_platform.h>
12 #include <linux/printk.h>
13 #include <linux/mm_types.h>
14 #include <linux/set_memory.h>
15 #include <linux/memblock.h>
16 #include <linux/kernel.h>
17 #include <linux/mm.h>
18 #include <linux/cpumask.h>
19 #include <linux/iommu.h>
20 #include <linux/amd-iommu.h>
21 
22 #include <asm/sev.h>
23 #include <asm/processor.h>
24 #include <asm/setup.h>
25 #include <asm/svm.h>
26 #include <asm/smp.h>
27 #include <asm/cpu.h>
28 #include <asm/apic.h>
29 #include <asm/cpuid.h>
30 #include <asm/cmdline.h>
31 #include <asm/iommu.h>
32 
33 /*
34  * The RMP entry format is not architectural. The format is defined in PPR
35  * Family 19h Model 01h, Rev B1 processor.
36  */
37 struct rmpentry {
38 	union {
39 		struct {
40 			u64 assigned	: 1,
41 			    pagesize	: 1,
42 			    immutable	: 1,
43 			    rsvd1	: 9,
44 			    gpa		: 39,
45 			    asid	: 10,
46 			    vmsa	: 1,
47 			    validated	: 1,
48 			    rsvd2	: 1;
49 		};
50 		u64 lo;
51 	};
52 	u64 hi;
53 } __packed;
54 
/*
 * The first 16KB from RMP_BASE is used by the processor for bookkeeping.
 * This offset must be added to the table base when looking up an RMP entry;
 * snp_rmptable_init() applies it once when it sets up 'rmptable'.
 */
#define RMPTABLE_CPU_BOOKKEEPING_SZ	0x4000

/* Mask to apply to a PFN to get the first PFN of a 2MB page */
#define PFN_PMD_MASK	GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)

/* Physical base and size of the RMP table, recorded by snp_probe_rmptable_info(). */
static u64 probed_rmp_base, probed_rmp_size;
/* Mapped RMP entries, starting just past the CPU bookkeeping area. */
static struct rmpentry *rmptable __ro_after_init;
/* Highest PFN that has an entry in 'rmptable'. */
static u64 rmptable_max_pfn __ro_after_init;

/* Pages recorded by snp_leak_pages(), chained through page->buddy_list. */
static LIST_HEAD(snp_leaked_pages_list);
/* Held by snp_leak_pages() while it updates the list and the counter below. */
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);

/* Running count of pages passed to snp_leak_pages(). */
static unsigned long snp_nr_leaked_pages;

/* Prefix the pr_*() messages emitted below this point with "SEV-SNP: ". */
#undef pr_fmt
#define pr_fmt(fmt)	"SEV-SNP: " fmt
75 
76 static int __mfd_enable(unsigned int cpu)
77 {
78 	u64 val;
79 
80 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
81 		return 0;
82 
83 	rdmsrl(MSR_AMD64_SYSCFG, val);
84 
85 	val |= MSR_AMD64_SYSCFG_MFDM;
86 
87 	wrmsrl(MSR_AMD64_SYSCFG, val);
88 
89 	return 0;
90 }
91 
92 static __init void mfd_enable(void *arg)
93 {
94 	__mfd_enable(smp_processor_id());
95 }
96 
97 static int __snp_enable(unsigned int cpu)
98 {
99 	u64 val;
100 
101 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
102 		return 0;
103 
104 	rdmsrl(MSR_AMD64_SYSCFG, val);
105 
106 	val |= MSR_AMD64_SYSCFG_SNP_EN;
107 	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
108 
109 	wrmsrl(MSR_AMD64_SYSCFG, val);
110 
111 	return 0;
112 }
113 
114 static __init void snp_enable(void *arg)
115 {
116 	__snp_enable(smp_processor_id());
117 }
118 
119 #define RMP_ADDR_MASK GENMASK_ULL(51, 13)
120 
121 bool snp_probe_rmptable_info(void)
122 {
123 	u64 rmp_sz, rmp_base, rmp_end;
124 
125 	rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
126 	rdmsrl(MSR_AMD64_RMP_END, rmp_end);
127 
128 	if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
129 		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
130 		return false;
131 	}
132 
133 	if (rmp_base > rmp_end) {
134 		pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
135 		return false;
136 	}
137 
138 	rmp_sz = rmp_end - rmp_base + 1;
139 
140 	probed_rmp_base = rmp_base;
141 	probed_rmp_size = rmp_sz;
142 
143 	pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
144 		rmp_base, rmp_end);
145 
146 	return true;
147 }
148 
149 static void __init __snp_fixup_e820_tables(u64 pa)
150 {
151 	if (IS_ALIGNED(pa, PMD_SIZE))
152 		return;
153 
154 	/*
155 	 * Handle cases where the RMP table placement by the BIOS is not
156 	 * 2M aligned and the kexec kernel could try to allocate
157 	 * from within that chunk which then causes a fatal RMP fault.
158 	 *
159 	 * The e820_table needs to be updated as it is converted to
160 	 * kernel memory resources and used by KEXEC_FILE_LOAD syscall
161 	 * to load kexec segments.
162 	 *
163 	 * The e820_table_firmware needs to be updated as it is exposed
164 	 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
165 	 * segments.
166 	 *
167 	 * The e820_table_kexec needs to be updated as it passed to
168 	 * the kexec-ed kernel.
169 	 */
170 	pa = ALIGN_DOWN(pa, PMD_SIZE);
171 	if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
172 		pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
173 		e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
174 		e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
175 		e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
176 	}
177 }
178 
179 void __init snp_fixup_e820_tables(void)
180 {
181 	__snp_fixup_e820_tables(probed_rmp_base);
182 	__snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
183 }
184 
185 /*
186  * Do the necessary preparations which are verified by the firmware as
187  * described in the SNP_INIT_EX firmware command description in the SNP
188  * firmware ABI spec.
189  */
190 static int __init snp_rmptable_init(void)
191 {
192 	u64 max_rmp_pfn, calc_rmp_sz, rmptable_size, rmp_end, val;
193 	void *rmptable_start;
194 
195 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
196 		return 0;
197 
198 	if (!amd_iommu_snp_en)
199 		goto nosnp;
200 
201 	if (!probed_rmp_size)
202 		goto nosnp;
203 
204 	rmp_end = probed_rmp_base + probed_rmp_size - 1;
205 
206 	/*
207 	 * Calculate the amount the memory that must be reserved by the BIOS to
208 	 * address the whole RAM, including the bookkeeping area. The RMP itself
209 	 * must also be covered.
210 	 */
211 	max_rmp_pfn = max_pfn;
212 	if (PFN_UP(rmp_end) > max_pfn)
213 		max_rmp_pfn = PFN_UP(rmp_end);
214 
215 	calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
216 	if (calc_rmp_sz > probed_rmp_size) {
217 		pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
218 		       calc_rmp_sz, probed_rmp_size);
219 		goto nosnp;
220 	}
221 
222 	rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
223 	if (!rmptable_start) {
224 		pr_err("Failed to map RMP table\n");
225 		goto nosnp;
226 	}
227 
228 	/*
229 	 * Check if SEV-SNP is already enabled, this can happen in case of
230 	 * kexec boot.
231 	 */
232 	rdmsrl(MSR_AMD64_SYSCFG, val);
233 	if (val & MSR_AMD64_SYSCFG_SNP_EN)
234 		goto skip_enable;
235 
236 	memset(rmptable_start, 0, probed_rmp_size);
237 
238 	/* Flush the caches to ensure that data is written before SNP is enabled. */
239 	wbinvd_on_all_cpus();
240 
241 	/* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
242 	on_each_cpu(mfd_enable, NULL, 1);
243 
244 	on_each_cpu(snp_enable, NULL, 1);
245 
246 skip_enable:
247 	rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
248 	rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
249 
250 	rmptable = (struct rmpentry *)rmptable_start;
251 	rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
252 
253 	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
254 
255 	/*
256 	 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
257 	 * notifier is invoked to do SNP IOMMU shutdown before kdump.
258 	 */
259 	crash_kexec_post_notifiers = true;
260 
261 	return 0;
262 
263 nosnp:
264 	cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
265 	return -ENOSYS;
266 }
267 
268 /*
269  * This must be called after the IOMMU has been initialized.
270  */
271 device_initcall(snp_rmptable_init);
272 
273 static struct rmpentry *get_rmpentry(u64 pfn)
274 {
275 	if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
276 		return ERR_PTR(-EFAULT);
277 
278 	return &rmptable[pfn];
279 }
280 
281 static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
282 {
283 	struct rmpentry *large_entry, *entry;
284 
285 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
286 		return ERR_PTR(-ENODEV);
287 
288 	entry = get_rmpentry(pfn);
289 	if (IS_ERR(entry))
290 		return entry;
291 
292 	/*
293 	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
294 	 * RMP entry or a special large RMP entry that is authoritative for a
295 	 * whole 2M area.
296 	 */
297 	large_entry = get_rmpentry(pfn & PFN_PMD_MASK);
298 	if (IS_ERR(large_entry))
299 		return large_entry;
300 
301 	*level = RMP_TO_PG_LEVEL(large_entry->pagesize);
302 
303 	return entry;
304 }
305 
306 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
307 {
308 	struct rmpentry *e;
309 
310 	e = __snp_lookup_rmpentry(pfn, level);
311 	if (IS_ERR(e))
312 		return PTR_ERR(e);
313 
314 	*assigned = !!e->assigned;
315 	return 0;
316 }
317 EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
318 
319 /*
320  * Dump the raw RMP entry for a particular PFN. These bits are documented in the
321  * PPR for a particular CPU model and provide useful information about how a
322  * particular PFN is being utilized by the kernel/firmware at the time certain
323  * unexpected events occur, such as RMP faults.
324  */
325 static void dump_rmpentry(u64 pfn)
326 {
327 	u64 pfn_i, pfn_end;
328 	struct rmpentry *e;
329 	int level;
330 
331 	e = __snp_lookup_rmpentry(pfn, &level);
332 	if (IS_ERR(e)) {
333 		pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
334 		       pfn, PTR_ERR(e));
335 		return;
336 	}
337 
338 	if (e->assigned) {
339 		pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
340 			pfn, e->lo, e->hi);
341 		return;
342 	}
343 
344 	/*
345 	 * If the RMP entry for a particular PFN is not in an assigned state,
346 	 * then it is sometimes useful to get an idea of whether or not any RMP
347 	 * entries for other PFNs within the same 2MB region are assigned, since
348 	 * those too can affect the ability to access a particular PFN in
349 	 * certain situations, such as when the PFN is being accessed via a 2MB
350 	 * mapping in the host page table.
351 	 */
352 	pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
353 	pfn_end = pfn_i + PTRS_PER_PMD;
354 
355 	pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
356 		pfn, pfn_i, pfn_end);
357 
358 	while (pfn_i < pfn_end) {
359 		e = __snp_lookup_rmpentry(pfn_i, &level);
360 		if (IS_ERR(e)) {
361 			pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
362 			       PTR_ERR(e), pfn_i);
363 			pfn_i++;
364 			continue;
365 		}
366 
367 		if (e->lo || e->hi)
368 			pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
369 		pfn_i++;
370 	}
371 }
372 
373 void snp_dump_hva_rmpentry(unsigned long hva)
374 {
375 	unsigned long paddr;
376 	unsigned int level;
377 	pgd_t *pgd;
378 	pte_t *pte;
379 
380 	pgd = __va(read_cr3_pa());
381 	pgd += pgd_index(hva);
382 	pte = lookup_address_in_pgd(pgd, hva, &level);
383 
384 	if (!pte) {
385 		pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
386 		return;
387 	}
388 
389 	paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
390 	dump_rmpentry(PHYS_PFN(paddr));
391 }
392 
393 /*
394  * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
395  * Validated bit.
396  */
397 int psmash(u64 pfn)
398 {
399 	unsigned long paddr = pfn << PAGE_SHIFT;
400 	int ret;
401 
402 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
403 		return -ENODEV;
404 
405 	if (!pfn_valid(pfn))
406 		return -EINVAL;
407 
408 	/* Binutils version 2.36 supports the PSMASH mnemonic. */
409 	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
410 		      : "=a" (ret)
411 		      : "a" (paddr)
412 		      : "memory", "cc");
413 
414 	return ret;
415 }
416 EXPORT_SYMBOL_GPL(psmash);
417 
418 /*
419  * If the kernel uses a 2MB or larger directmap mapping to write to an address,
420  * and that mapping contains any 4KB pages that are set to private in the RMP
421  * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
422  * owns the PFNs being transitioned will never attempt such a write, but other
423  * kernel tasks writing to other PFNs in the range may trigger these checks
424  * inadvertently due a large directmap mapping that happens to overlap such a
425  * PFN.
426  *
427  * Prevent this by splitting any 2MB+ mappings that might end up containing a
428  * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
429  * PFN/rmp_level passed in.
430  *
431  * Note that there is no attempt here to scan all the RMP entries for the 2MB
432  * physical range, since it would only be worthwhile in determining if a
433  * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
434  * the same shared/private state, thus avoiding the need to split the mapping.
435  * But that would mean the entries are currently in a mixed state, and so the
436  * mapping would have already been split as a result of prior transitions.
437  * And since the 4K split is only done if the mapping is 2MB+, and there isn't
438  * currently a mechanism in place to restore 2MB+ mappings, such a check would
439  * not provide any usable benefit.
440  *
441  * More specifics on how these checks are carried out can be found in APM
442  * Volume 2, "RMP and VMPL Access Checks".
443  */
444 static int adjust_direct_map(u64 pfn, int rmp_level)
445 {
446 	unsigned long vaddr;
447 	unsigned int level;
448 	int npages, ret;
449 	pte_t *pte;
450 
451 	/*
452 	 * pfn_to_kaddr() will return a vaddr only within the direct
453 	 * map range.
454 	 */
455 	vaddr = (unsigned long)pfn_to_kaddr(pfn);
456 
457 	/* Only 4KB/2MB RMP entries are supported by current hardware. */
458 	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
459 		return -EINVAL;
460 
461 	if (!pfn_valid(pfn))
462 		return -EINVAL;
463 
464 	if (rmp_level == PG_LEVEL_2M &&
465 	    (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
466 		return -EINVAL;
467 
468 	/*
469 	 * If an entire 2MB physical range is being transitioned, then there is
470 	 * no risk of RMP #PFs due to write accesses from overlapping mappings,
471 	 * since even accesses from 1GB mappings will be treated as 2MB accesses
472 	 * as far as RMP table checks are concerned.
473 	 */
474 	if (rmp_level == PG_LEVEL_2M)
475 		return 0;
476 
477 	pte = lookup_address(vaddr, &level);
478 	if (!pte || pte_none(*pte))
479 		return 0;
480 
481 	if (level == PG_LEVEL_4K)
482 		return 0;
483 
484 	npages = page_level_size(rmp_level) / PAGE_SIZE;
485 	ret = set_memory_4k(vaddr, npages);
486 	if (ret)
487 		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
488 			pfn, ret);
489 
490 	return ret;
491 }
492 
493 /*
494  * It is expected that those operations are seldom enough so that no mutual
495  * exclusion of updaters is needed and thus the overlap error condition below
496  * should happen very rarely and would get resolved relatively quickly by
497  * the firmware.
498  *
499  * If not, one could consider introducing a mutex or so here to sync concurrent
500  * RMP updates and thus diminish the amount of cases where firmware needs to
501  * lock 2M ranges to protect against concurrent updates.
502  *
503  * The optimal solution would be range locking to avoid locking disjoint
504  * regions unnecessarily but there's no support for that yet.
505  */
506 static int rmpupdate(u64 pfn, struct rmp_state *state)
507 {
508 	unsigned long paddr = pfn << PAGE_SHIFT;
509 	int ret, level;
510 
511 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
512 		return -ENODEV;
513 
514 	level = RMP_TO_PG_LEVEL(state->pagesize);
515 
516 	if (adjust_direct_map(pfn, level))
517 		return -EFAULT;
518 
519 	do {
520 		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
521 		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
522 			     : "=a" (ret)
523 			     : "a" (paddr), "c" ((unsigned long)state)
524 			     : "memory", "cc");
525 	} while (ret == RMPUPDATE_FAIL_OVERLAP);
526 
527 	if (ret) {
528 		pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
529 		       pfn, level, ret);
530 		dump_rmpentry(pfn);
531 		dump_stack();
532 		return -EFAULT;
533 	}
534 
535 	return 0;
536 }
537 
538 /* Transition a page to guest-owned/private state in the RMP table. */
539 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
540 {
541 	struct rmp_state state;
542 
543 	memset(&state, 0, sizeof(state));
544 	state.assigned = 1;
545 	state.asid = asid;
546 	state.immutable = immutable;
547 	state.gpa = gpa;
548 	state.pagesize = PG_LEVEL_TO_RMP(level);
549 
550 	return rmpupdate(pfn, &state);
551 }
552 EXPORT_SYMBOL_GPL(rmp_make_private);
553 
554 /* Transition a page to hypervisor-owned/shared state in the RMP table. */
555 int rmp_make_shared(u64 pfn, enum pg_level level)
556 {
557 	struct rmp_state state;
558 
559 	memset(&state, 0, sizeof(state));
560 	state.pagesize = PG_LEVEL_TO_RMP(level);
561 
562 	return rmpupdate(pfn, &state);
563 }
564 EXPORT_SYMBOL_GPL(rmp_make_shared);
565 
566 void snp_leak_pages(u64 pfn, unsigned int npages)
567 {
568 	struct page *page = pfn_to_page(pfn);
569 
570 	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
571 
572 	spin_lock(&snp_leaked_pages_list_lock);
573 	while (npages--) {
574 
575 		/*
576 		 * Reuse the page's buddy list for chaining into the leaked
577 		 * pages list. This page should not be on a free list currently
578 		 * and is also unsafe to be added to a free list.
579 		 */
580 		if (likely(!PageCompound(page)) ||
581 
582 			/*
583 			 * Skip inserting tail pages of compound page as
584 			 * page->buddy_list of tail pages is not usable.
585 			 */
586 		    (PageHead(page) && compound_nr(page) <= npages))
587 			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
588 
589 		dump_rmpentry(pfn);
590 		snp_nr_leaked_pages++;
591 		pfn++;
592 		page++;
593 	}
594 	spin_unlock(&snp_leaked_pages_list_lock);
595 }
596 EXPORT_SYMBOL_GPL(snp_leak_pages);
597 
598 void kdump_sev_callback(void)
599 {
600 	/*
601 	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
602 	 * safely do SNP_SHUTDOWN on the local CPU.
603 	 */
604 	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
605 		wbinvd();
606 }
607