xref: /linux/arch/x86/virt/svm/sev.c (revision e814f3fd16acfb7f9966773953de8f740a1e3202)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * AMD SVM-SEV Host Support.
4  *
5  * Copyright (C) 2023 Advanced Micro Devices, Inc.
6  *
7  * Author: Ashish Kalra <ashish.kalra@amd.com>
8  *
9  */
10 
11 #include <linux/cc_platform.h>
12 #include <linux/printk.h>
13 #include <linux/mm_types.h>
14 #include <linux/set_memory.h>
15 #include <linux/memblock.h>
16 #include <linux/kernel.h>
17 #include <linux/mm.h>
18 #include <linux/cpumask.h>
19 #include <linux/iommu.h>
20 #include <linux/amd-iommu.h>
21 #include <linux/nospec.h>
22 
23 #include <asm/sev.h>
24 #include <asm/processor.h>
25 #include <asm/setup.h>
26 #include <asm/svm.h>
27 #include <asm/smp.h>
28 #include <asm/cpu.h>
29 #include <asm/apic.h>
30 #include <asm/cpuid.h>
31 #include <asm/cmdline.h>
32 #include <asm/iommu.h>
33 
34 /*
35  * The RMP entry information as returned by the RMPREAD instruction.
36  */
37 struct rmpentry {
38 	u64 gpa;
39 	u8  assigned		:1,
40 	    rsvd1		:7;
41 	u8  pagesize		:1,
42 	    hpage_region_status	:1,
43 	    rsvd2		:6;
44 	u8  immutable		:1,
45 	    rsvd3		:7;
46 	u8  rsvd4;
47 	u32 asid;
48 } __packed;
49 
50 /*
51  * The raw RMP entry format is not architectural. The format is defined in PPR
52  * Family 19h Model 01h, Rev B1 processor. This format represents the actual
53  * entry in the RMP table memory. The bitfield definitions are used for machines
54  * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
55  * fields are only used for dumping the raw data.
56  */
57 struct rmpentry_raw {
58 	union {
59 		struct {
60 			u64 assigned	: 1,
61 			    pagesize	: 1,
62 			    immutable	: 1,
63 			    rsvd1	: 9,
64 			    gpa		: 39,
65 			    asid	: 10,
66 			    vmsa	: 1,
67 			    validated	: 1,
68 			    rsvd2	: 1;
69 		};
70 		u64 lo;
71 	};
72 	u64 hi;
73 } __packed;
74 
/*
 * The processor keeps its own bookkeeping data in the first 16KB starting at
 * RMP_BASE. RMP entry lookups have to skip over this area.
 */
#define RMPTABLE_CPU_BOOKKEEPING_SZ	SZ_16K

/*
 * A non-segmented RMP table is handled as one segment whose size is the
 * maximum physical address width, so every address resolves to index 0 of the
 * segment table.
 */
#define RMPTABLE_NON_SEGMENTED_SHIFT	52
86 
87 struct rmp_segment_desc {
88 	struct rmpentry_raw *rmp_entry;
89 	u64 max_index;
90 	u64 size;
91 };
92 
93 /*
94  * Segmented RMP Table support.
95  *   - The segment size is used for two purposes:
96  *     - Identify the amount of memory covered by an RMP segment
97  *     - Quickly locate an RMP segment table entry for a physical address
98  *
99  *   - The RMP segment table contains pointers to an RMP table that covers
100  *     a specific portion of memory. There can be up to 512 8-byte entries,
101  *     one page's worth.
102  */
103 #define RST_ENTRY_MAPPED_SIZE(x)	((x) & GENMASK_ULL(19, 0))
104 #define RST_ENTRY_SEGMENT_BASE(x)	((x) & GENMASK_ULL(51, 20))
105 
106 #define RST_SIZE SZ_4K
107 static struct rmp_segment_desc **rmp_segment_table __ro_after_init;
108 static unsigned int rst_max_index __ro_after_init = 512;
109 
110 static unsigned int rmp_segment_shift;
111 static u64 rmp_segment_size;
112 static u64 rmp_segment_mask;
113 
114 #define RST_ENTRY_INDEX(x)	((x) >> rmp_segment_shift)
115 #define RMP_ENTRY_INDEX(x)	((u64)(PHYS_PFN((x) & rmp_segment_mask)))
116 
117 static u64 rmp_cfg;
118 
119 /* Mask to apply to a PFN to get the first PFN of a 2MB page */
120 #define PFN_PMD_MASK	GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
121 
122 static u64 probed_rmp_base, probed_rmp_size;
123 
124 static LIST_HEAD(snp_leaked_pages_list);
125 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
126 
127 static unsigned long snp_nr_leaked_pages;
128 
129 #undef pr_fmt
130 #define pr_fmt(fmt)	"SEV-SNP: " fmt
131 
132 static int __mfd_enable(unsigned int cpu)
133 {
134 	u64 val;
135 
136 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
137 		return 0;
138 
139 	rdmsrl(MSR_AMD64_SYSCFG, val);
140 
141 	val |= MSR_AMD64_SYSCFG_MFDM;
142 
143 	wrmsrl(MSR_AMD64_SYSCFG, val);
144 
145 	return 0;
146 }
147 
148 static __init void mfd_enable(void *arg)
149 {
150 	__mfd_enable(smp_processor_id());
151 }
152 
153 static int __snp_enable(unsigned int cpu)
154 {
155 	u64 val;
156 
157 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
158 		return 0;
159 
160 	rdmsrl(MSR_AMD64_SYSCFG, val);
161 
162 	val |= MSR_AMD64_SYSCFG_SNP_EN;
163 	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
164 
165 	wrmsrl(MSR_AMD64_SYSCFG, val);
166 
167 	return 0;
168 }
169 
170 static __init void snp_enable(void *arg)
171 {
172 	__snp_enable(smp_processor_id());
173 }
174 
175 static void __init __snp_fixup_e820_tables(u64 pa)
176 {
177 	if (IS_ALIGNED(pa, PMD_SIZE))
178 		return;
179 
180 	/*
181 	 * Handle cases where the RMP table placement by the BIOS is not
182 	 * 2M aligned and the kexec kernel could try to allocate
183 	 * from within that chunk which then causes a fatal RMP fault.
184 	 *
185 	 * The e820_table needs to be updated as it is converted to
186 	 * kernel memory resources and used by KEXEC_FILE_LOAD syscall
187 	 * to load kexec segments.
188 	 *
189 	 * The e820_table_firmware needs to be updated as it is exposed
190 	 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
191 	 * segments.
192 	 *
193 	 * The e820_table_kexec needs to be updated as it passed to
194 	 * the kexec-ed kernel.
195 	 */
196 	pa = ALIGN_DOWN(pa, PMD_SIZE);
197 	if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
198 		pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
199 		e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
200 		e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
201 		e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
202 		if (!memblock_is_region_reserved(pa, PMD_SIZE))
203 			memblock_reserve(pa, PMD_SIZE);
204 	}
205 }
206 
207 static void __init fixup_e820_tables_for_segmented_rmp(void)
208 {
209 	u64 pa, *rst, size, mapped_size;
210 	unsigned int i;
211 
212 	__snp_fixup_e820_tables(probed_rmp_base);
213 
214 	pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
215 
216 	__snp_fixup_e820_tables(pa + RST_SIZE);
217 
218 	rst = early_memremap(pa, RST_SIZE);
219 	if (!rst)
220 		return;
221 
222 	for (i = 0; i < rst_max_index; i++) {
223 		pa = RST_ENTRY_SEGMENT_BASE(rst[i]);
224 		mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
225 		if (!mapped_size)
226 			continue;
227 
228 		__snp_fixup_e820_tables(pa);
229 
230 		/*
231 		 * Mapped size in GB. Mapped size is allowed to exceed
232 		 * the segment coverage size, but gets reduced to the
233 		 * segment coverage size.
234 		 */
235 		mapped_size <<= 30;
236 		if (mapped_size > rmp_segment_size)
237 			mapped_size = rmp_segment_size;
238 
239 		/* Calculate the RMP segment size (16 bytes/page mapped) */
240 		size = PHYS_PFN(mapped_size) << 4;
241 
242 		__snp_fixup_e820_tables(pa + size);
243 	}
244 
245 	early_memunmap(rst, RST_SIZE);
246 }
247 
248 static void __init fixup_e820_tables_for_contiguous_rmp(void)
249 {
250 	__snp_fixup_e820_tables(probed_rmp_base);
251 	__snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
252 }
253 
254 void __init snp_fixup_e820_tables(void)
255 {
256 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
257 		fixup_e820_tables_for_segmented_rmp();
258 	} else {
259 		fixup_e820_tables_for_contiguous_rmp();
260 	}
261 }
262 
263 static bool __init clear_rmptable_bookkeeping(void)
264 {
265 	void *bk;
266 
267 	bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB);
268 	if (!bk) {
269 		pr_err("Failed to map RMP bookkeeping area\n");
270 		return false;
271 	}
272 
273 	memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ);
274 
275 	memunmap(bk);
276 
277 	return true;
278 }
279 
280 static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa)
281 {
282 	u64 rst_index, rmp_segment_size_max;
283 	struct rmp_segment_desc *desc;
284 	void *rmp_segment;
285 
286 	/* Calculate the maximum size an RMP can be (16 bytes/page mapped) */
287 	rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4;
288 
289 	/* Validate the RMP segment size */
290 	if (segment_size > rmp_segment_size_max) {
291 		pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n",
292 		       segment_size, rmp_segment_size_max);
293 		return false;
294 	}
295 
296 	/* Validate the RMP segment table index */
297 	rst_index = RST_ENTRY_INDEX(pa);
298 	if (rst_index >= rst_max_index) {
299 		pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n",
300 		       pa, rmp_segment_size);
301 		return false;
302 	}
303 
304 	if (rmp_segment_table[rst_index]) {
305 		pr_err("RMP segment descriptor already exists at index %llu\n", rst_index);
306 		return false;
307 	}
308 
309 	rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB);
310 	if (!rmp_segment) {
311 		pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n",
312 		       segment_pa, segment_size);
313 		return false;
314 	}
315 
316 	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
317 	if (!desc) {
318 		memunmap(rmp_segment);
319 		return false;
320 	}
321 
322 	desc->rmp_entry = rmp_segment;
323 	desc->max_index = segment_size / sizeof(*desc->rmp_entry);
324 	desc->size = segment_size;
325 
326 	rmp_segment_table[rst_index] = desc;
327 
328 	return true;
329 }
330 
331 static void __init free_rmp_segment_table(void)
332 {
333 	unsigned int i;
334 
335 	for (i = 0; i < rst_max_index; i++) {
336 		struct rmp_segment_desc *desc;
337 
338 		desc = rmp_segment_table[i];
339 		if (!desc)
340 			continue;
341 
342 		memunmap(desc->rmp_entry);
343 
344 		kfree(desc);
345 	}
346 
347 	free_page((unsigned long)rmp_segment_table);
348 
349 	rmp_segment_table = NULL;
350 }
351 
352 /* Allocate the table used to index into the RMP segments */
353 static bool __init alloc_rmp_segment_table(void)
354 {
355 	struct page *page;
356 
357 	page = alloc_page(__GFP_ZERO);
358 	if (!page)
359 		return false;
360 
361 	rmp_segment_table = page_address(page);
362 
363 	return true;
364 }
365 
366 static bool __init setup_contiguous_rmptable(void)
367 {
368 	u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end;
369 
370 	if (!probed_rmp_size)
371 		return false;
372 
373 	rmp_end = probed_rmp_base + probed_rmp_size - 1;
374 
375 	/*
376 	 * Calculate the amount of memory that must be reserved by the BIOS to
377 	 * address the whole RAM, including the bookkeeping area. The RMP itself
378 	 * must also be covered.
379 	 */
380 	max_rmp_pfn = max_pfn;
381 	if (PFN_UP(rmp_end) > max_pfn)
382 		max_rmp_pfn = PFN_UP(rmp_end);
383 
384 	calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
385 	if (calc_rmp_sz > probed_rmp_size) {
386 		pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
387 		       calc_rmp_sz, probed_rmp_size);
388 		return false;
389 	}
390 
391 	if (!alloc_rmp_segment_table())
392 		return false;
393 
394 	/* Map only the RMP entries */
395 	rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
396 	rmptable_size    = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
397 
398 	if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) {
399 		free_rmp_segment_table();
400 		return false;
401 	}
402 
403 	return true;
404 }
405 
406 static bool __init setup_segmented_rmptable(void)
407 {
408 	u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max;
409 	unsigned int i, max_index;
410 
411 	if (!probed_rmp_base)
412 		return false;
413 
414 	if (!alloc_rmp_segment_table())
415 		return false;
416 
417 	rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
418 	rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB);
419 	if (!rst) {
420 		pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa);
421 		goto e_free;
422 	}
423 
424 	pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30);
425 
426 	ram_pa_max = max_pfn << PAGE_SHIFT;
427 
428 	max_index = 0;
429 	ram_pa_end = 0;
430 	for (i = 0; i < rst_max_index; i++) {
431 		u64 rmp_segment, rmp_size, mapped_size;
432 
433 		mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
434 		if (!mapped_size)
435 			continue;
436 
437 		max_index = i;
438 
439 		/*
440 		 * Mapped size in GB. Mapped size is allowed to exceed the
441 		 * segment coverage size, but gets reduced to the segment
442 		 * coverage size.
443 		 */
444 		mapped_size <<= 30;
445 		if (mapped_size > rmp_segment_size) {
446 			pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n",
447 				i, mapped_size, rmp_segment_size);
448 			mapped_size = rmp_segment_size;
449 		}
450 
451 		rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]);
452 
453 		/* Calculate the RMP segment size (16 bytes/page mapped) */
454 		rmp_size = PHYS_PFN(mapped_size) << 4;
455 
456 		pa = (u64)i << rmp_segment_shift;
457 
458 		/*
459 		 * Some segments may be for MMIO mapped above system RAM. These
460 		 * segments are used for Trusted I/O.
461 		 */
462 		if (pa < ram_pa_max)
463 			ram_pa_end = pa + mapped_size;
464 
465 		if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa))
466 			goto e_unmap;
467 
468 		pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n",
469 			i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1);
470 	}
471 
472 	if (ram_pa_max > ram_pa_end) {
473 		pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
474 		       ram_pa_max, ram_pa_end);
475 		goto e_unmap;
476 	}
477 
478 	/* Adjust the maximum index based on the found segments */
479 	rst_max_index = max_index + 1;
480 
481 	memunmap(rst);
482 
483 	return true;
484 
485 e_unmap:
486 	memunmap(rst);
487 
488 e_free:
489 	free_rmp_segment_table();
490 
491 	return false;
492 }
493 
494 static bool __init setup_rmptable(void)
495 {
496 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
497 		return setup_segmented_rmptable();
498 	} else {
499 		return setup_contiguous_rmptable();
500 	}
501 }
502 
503 /*
504  * Do the necessary preparations which are verified by the firmware as
505  * described in the SNP_INIT_EX firmware command description in the SNP
506  * firmware ABI spec.
507  */
508 static int __init snp_rmptable_init(void)
509 {
510 	unsigned int i;
511 	u64 val;
512 
513 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
514 		return 0;
515 
516 	if (!amd_iommu_snp_en)
517 		goto nosnp;
518 
519 	if (!setup_rmptable())
520 		goto nosnp;
521 
522 	/*
523 	 * Check if SEV-SNP is already enabled, this can happen in case of
524 	 * kexec boot.
525 	 */
526 	rdmsrl(MSR_AMD64_SYSCFG, val);
527 	if (val & MSR_AMD64_SYSCFG_SNP_EN)
528 		goto skip_enable;
529 
530 	/* Zero out the RMP bookkeeping area */
531 	if (!clear_rmptable_bookkeeping()) {
532 		free_rmp_segment_table();
533 		goto nosnp;
534 	}
535 
536 	/* Zero out the RMP entries */
537 	for (i = 0; i < rst_max_index; i++) {
538 		struct rmp_segment_desc *desc;
539 
540 		desc = rmp_segment_table[i];
541 		if (!desc)
542 			continue;
543 
544 		memset(desc->rmp_entry, 0, desc->size);
545 	}
546 
547 	/* Flush the caches to ensure that data is written before SNP is enabled. */
548 	wbinvd_on_all_cpus();
549 
550 	/* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
551 	on_each_cpu(mfd_enable, NULL, 1);
552 
553 	on_each_cpu(snp_enable, NULL, 1);
554 
555 skip_enable:
556 	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
557 
558 	/*
559 	 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
560 	 * notifier is invoked to do SNP IOMMU shutdown before kdump.
561 	 */
562 	crash_kexec_post_notifiers = true;
563 
564 	return 0;
565 
566 nosnp:
567 	cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
568 	return -ENOSYS;
569 }
570 
571 /*
572  * This must be called after the IOMMU has been initialized.
573  */
574 device_initcall(snp_rmptable_init);
575 
576 static void set_rmp_segment_info(unsigned int segment_shift)
577 {
578 	rmp_segment_shift = segment_shift;
579 	rmp_segment_size  = 1ULL << rmp_segment_shift;
580 	rmp_segment_mask  = rmp_segment_size - 1;
581 }
582 
583 #define RMP_ADDR_MASK GENMASK_ULL(51, 13)
584 
585 static bool probe_contiguous_rmptable_info(void)
586 {
587 	u64 rmp_sz, rmp_base, rmp_end;
588 
589 	rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
590 	rdmsrl(MSR_AMD64_RMP_END, rmp_end);
591 
592 	if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
593 		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
594 		return false;
595 	}
596 
597 	if (rmp_base > rmp_end) {
598 		pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
599 		return false;
600 	}
601 
602 	rmp_sz = rmp_end - rmp_base + 1;
603 
604 	/* Treat the contiguous RMP table as a single segment */
605 	rst_max_index = 1;
606 
607 	set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT);
608 
609 	probed_rmp_base = rmp_base;
610 	probed_rmp_size = rmp_sz;
611 
612 	pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
613 		rmp_base, rmp_end);
614 
615 	return true;
616 }
617 
618 static bool probe_segmented_rmptable_info(void)
619 {
620 	unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max;
621 	u64 rmp_base, rmp_end;
622 
623 	rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
624 	if (!(rmp_base & RMP_ADDR_MASK)) {
625 		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
626 		return false;
627 	}
628 
629 	rdmsrl(MSR_AMD64_RMP_END, rmp_end);
630 	WARN_ONCE(rmp_end & RMP_ADDR_MASK,
631 		  "Segmented RMP enabled but RMP_END MSR is non-zero\n");
632 
633 	/* Obtain the min and max supported RMP segment size */
634 	eax = cpuid_eax(0x80000025);
635 	segment_shift_min = eax & GENMASK(5, 0);
636 	segment_shift_max = (eax & GENMASK(11, 6)) >> 6;
637 
638 	/* Verify the segment size is within the supported limits */
639 	segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg);
640 	if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) {
641 		pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n",
642 		       segment_shift, segment_shift_min, segment_shift_max);
643 		return false;
644 	}
645 
646 	/* Override the max supported RST index if a hardware limit exists */
647 	ebx = cpuid_ebx(0x80000025);
648 	if (ebx & BIT(10))
649 		rst_max_index = ebx & GENMASK(9, 0);
650 
651 	set_rmp_segment_info(segment_shift);
652 
653 	probed_rmp_base = rmp_base;
654 	probed_rmp_size = 0;
655 
656 	pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n",
657 		rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE);
658 
659 	return true;
660 }
661 
662 bool snp_probe_rmptable_info(void)
663 {
664 	if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP))
665 		rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg);
666 
667 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
668 		return probe_segmented_rmptable_info();
669 	else
670 		return probe_contiguous_rmptable_info();
671 }
672 
673 /*
674  * About the array_index_nospec() usage below:
675  *
676  * This function can get called by exported functions like
677  * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among
678  * others, and since the @pfn passed in cannot always be trusted,
679  * speculation should be stopped as a protective measure.
680  */
681 static struct rmpentry_raw *get_raw_rmpentry(u64 pfn)
682 {
683 	u64 paddr, rst_index, segment_index;
684 	struct rmp_segment_desc *desc;
685 
686 	if (!rmp_segment_table)
687 		return ERR_PTR(-ENODEV);
688 
689 	paddr = pfn << PAGE_SHIFT;
690 
691 	rst_index = RST_ENTRY_INDEX(paddr);
692 	if (unlikely(rst_index >= rst_max_index))
693 		return ERR_PTR(-EFAULT);
694 
695 	rst_index = array_index_nospec(rst_index, rst_max_index);
696 
697 	desc = rmp_segment_table[rst_index];
698 	if (unlikely(!desc))
699 		return ERR_PTR(-EFAULT);
700 
701 	segment_index = RMP_ENTRY_INDEX(paddr);
702 	if (unlikely(segment_index >= desc->max_index))
703 		return ERR_PTR(-EFAULT);
704 
705 	segment_index = array_index_nospec(segment_index, desc->max_index);
706 
707 	return desc->rmp_entry + segment_index;
708 }
709 
710 static int get_rmpentry(u64 pfn, struct rmpentry *e)
711 {
712 	struct rmpentry_raw *e_raw;
713 
714 	if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) {
715 		int ret;
716 
717 		/* Binutils version 2.44 supports the RMPREAD mnemonic. */
718 		asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd"
719 			     : "=a" (ret)
720 			     : "a" (pfn << PAGE_SHIFT), "c" (e)
721 			     : "memory", "cc");
722 
723 		return ret;
724 	}
725 
726 	e_raw = get_raw_rmpentry(pfn);
727 	if (IS_ERR(e_raw))
728 		return PTR_ERR(e_raw);
729 
730 	/*
731 	 * Map the raw RMP table entry onto the RMPREAD output format.
732 	 * The 2MB region status indicator (hpage_region_status field) is not
733 	 * calculated, since the overhead could be significant and the field
734 	 * is not used.
735 	 */
736 	memset(e, 0, sizeof(*e));
737 	e->gpa       = e_raw->gpa << PAGE_SHIFT;
738 	e->asid      = e_raw->asid;
739 	e->assigned  = e_raw->assigned;
740 	e->pagesize  = e_raw->pagesize;
741 	e->immutable = e_raw->immutable;
742 
743 	return 0;
744 }
745 
746 static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level)
747 {
748 	struct rmpentry e_large;
749 	int ret;
750 
751 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
752 		return -ENODEV;
753 
754 	ret = get_rmpentry(pfn, e);
755 	if (ret)
756 		return ret;
757 
758 	/*
759 	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
760 	 * RMP entry or a special large RMP entry that is authoritative for a
761 	 * whole 2M area.
762 	 */
763 	ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large);
764 	if (ret)
765 		return ret;
766 
767 	*level = RMP_TO_PG_LEVEL(e_large.pagesize);
768 
769 	return 0;
770 }
771 
772 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
773 {
774 	struct rmpentry e;
775 	int ret;
776 
777 	ret = __snp_lookup_rmpentry(pfn, &e, level);
778 	if (ret)
779 		return ret;
780 
781 	*assigned = !!e.assigned;
782 	return 0;
783 }
784 EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
785 
786 /*
787  * Dump the raw RMP entry for a particular PFN. These bits are documented in the
788  * PPR for a particular CPU model and provide useful information about how a
789  * particular PFN is being utilized by the kernel/firmware at the time certain
790  * unexpected events occur, such as RMP faults.
791  */
792 static void dump_rmpentry(u64 pfn)
793 {
794 	struct rmpentry_raw *e_raw;
795 	u64 pfn_i, pfn_end;
796 	struct rmpentry e;
797 	int level, ret;
798 
799 	ret = __snp_lookup_rmpentry(pfn, &e, &level);
800 	if (ret) {
801 		pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n",
802 		       pfn, ret);
803 		return;
804 	}
805 
806 	if (e.assigned) {
807 		e_raw = get_raw_rmpentry(pfn);
808 		if (IS_ERR(e_raw)) {
809 			pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n",
810 			       pfn, PTR_ERR(e_raw));
811 			return;
812 		}
813 
814 		pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
815 			pfn, e_raw->lo, e_raw->hi);
816 		return;
817 	}
818 
819 	/*
820 	 * If the RMP entry for a particular PFN is not in an assigned state,
821 	 * then it is sometimes useful to get an idea of whether or not any RMP
822 	 * entries for other PFNs within the same 2MB region are assigned, since
823 	 * those too can affect the ability to access a particular PFN in
824 	 * certain situations, such as when the PFN is being accessed via a 2MB
825 	 * mapping in the host page table.
826 	 */
827 	pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
828 	pfn_end = pfn_i + PTRS_PER_PMD;
829 
830 	pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
831 		pfn, pfn_i, pfn_end);
832 
833 	while (pfn_i < pfn_end) {
834 		e_raw = get_raw_rmpentry(pfn_i);
835 		if (IS_ERR(e_raw)) {
836 			pr_err("Error %ld reading RMP contents for PFN 0x%llx\n",
837 			       PTR_ERR(e_raw), pfn_i);
838 			pfn_i++;
839 			continue;
840 		}
841 
842 		if (e_raw->lo || e_raw->hi)
843 			pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi);
844 		pfn_i++;
845 	}
846 }
847 
848 void snp_dump_hva_rmpentry(unsigned long hva)
849 {
850 	unsigned long paddr;
851 	unsigned int level;
852 	pgd_t *pgd;
853 	pte_t *pte;
854 
855 	pgd = __va(read_cr3_pa());
856 	pgd += pgd_index(hva);
857 	pte = lookup_address_in_pgd(pgd, hva, &level);
858 
859 	if (!pte) {
860 		pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
861 		return;
862 	}
863 
864 	paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
865 	dump_rmpentry(PHYS_PFN(paddr));
866 }
867 
868 /*
869  * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
870  * Validated bit.
871  */
872 int psmash(u64 pfn)
873 {
874 	unsigned long paddr = pfn << PAGE_SHIFT;
875 	int ret;
876 
877 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
878 		return -ENODEV;
879 
880 	if (!pfn_valid(pfn))
881 		return -EINVAL;
882 
883 	/* Binutils version 2.36 supports the PSMASH mnemonic. */
884 	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
885 		      : "=a" (ret)
886 		      : "a" (paddr)
887 		      : "memory", "cc");
888 
889 	return ret;
890 }
891 EXPORT_SYMBOL_GPL(psmash);
892 
893 /*
894  * If the kernel uses a 2MB or larger directmap mapping to write to an address,
895  * and that mapping contains any 4KB pages that are set to private in the RMP
896  * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
897  * owns the PFNs being transitioned will never attempt such a write, but other
898  * kernel tasks writing to other PFNs in the range may trigger these checks
899  * inadvertently due to a large directmap mapping that happens to overlap such a
900  * PFN.
901  *
902  * Prevent this by splitting any 2MB+ mappings that might end up containing a
903  * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
904  * PFN/rmp_level passed in.
905  *
906  * Note that there is no attempt here to scan all the RMP entries for the 2MB
907  * physical range, since it would only be worthwhile in determining if a
908  * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
909  * the same shared/private state, thus avoiding the need to split the mapping.
910  * But that would mean the entries are currently in a mixed state, and so the
911  * mapping would have already been split as a result of prior transitions.
912  * And since the 4K split is only done if the mapping is 2MB+, and there isn't
913  * currently a mechanism in place to restore 2MB+ mappings, such a check would
914  * not provide any usable benefit.
915  *
916  * More specifics on how these checks are carried out can be found in APM
917  * Volume 2, "RMP and VMPL Access Checks".
918  */
919 static int adjust_direct_map(u64 pfn, int rmp_level)
920 {
921 	unsigned long vaddr;
922 	unsigned int level;
923 	int npages, ret;
924 	pte_t *pte;
925 
926 	/*
927 	 * pfn_to_kaddr() will return a vaddr only within the direct
928 	 * map range.
929 	 */
930 	vaddr = (unsigned long)pfn_to_kaddr(pfn);
931 
932 	/* Only 4KB/2MB RMP entries are supported by current hardware. */
933 	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
934 		return -EINVAL;
935 
936 	if (!pfn_valid(pfn))
937 		return -EINVAL;
938 
939 	if (rmp_level == PG_LEVEL_2M &&
940 	    (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
941 		return -EINVAL;
942 
943 	/*
944 	 * If an entire 2MB physical range is being transitioned, then there is
945 	 * no risk of RMP #PFs due to write accesses from overlapping mappings,
946 	 * since even accesses from 1GB mappings will be treated as 2MB accesses
947 	 * as far as RMP table checks are concerned.
948 	 */
949 	if (rmp_level == PG_LEVEL_2M)
950 		return 0;
951 
952 	pte = lookup_address(vaddr, &level);
953 	if (!pte || pte_none(*pte))
954 		return 0;
955 
956 	if (level == PG_LEVEL_4K)
957 		return 0;
958 
959 	npages = page_level_size(rmp_level) / PAGE_SIZE;
960 	ret = set_memory_4k(vaddr, npages);
961 	if (ret)
962 		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
963 			pfn, ret);
964 
965 	return ret;
966 }
967 
968 /*
969  * It is expected that those operations are seldom enough so that no mutual
970  * exclusion of updaters is needed and thus the overlap error condition below
971  * should happen very rarely and would get resolved relatively quickly by
972  * the firmware.
973  *
974  * If not, one could consider introducing a mutex or so here to sync concurrent
975  * RMP updates and thus diminish the amount of cases where firmware needs to
976  * lock 2M ranges to protect against concurrent updates.
977  *
978  * The optimal solution would be range locking to avoid locking disjoint
979  * regions unnecessarily but there's no support for that yet.
980  */
981 static int rmpupdate(u64 pfn, struct rmp_state *state)
982 {
983 	unsigned long paddr = pfn << PAGE_SHIFT;
984 	int ret, level;
985 
986 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
987 		return -ENODEV;
988 
989 	level = RMP_TO_PG_LEVEL(state->pagesize);
990 
991 	if (adjust_direct_map(pfn, level))
992 		return -EFAULT;
993 
994 	do {
995 		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
996 		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
997 			     : "=a" (ret)
998 			     : "a" (paddr), "c" ((unsigned long)state)
999 			     : "memory", "cc");
1000 	} while (ret == RMPUPDATE_FAIL_OVERLAP);
1001 
1002 	if (ret) {
1003 		pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
1004 		       pfn, level, ret);
1005 		dump_rmpentry(pfn);
1006 		dump_stack();
1007 		return -EFAULT;
1008 	}
1009 
1010 	return 0;
1011 }
1012 
1013 /* Transition a page to guest-owned/private state in the RMP table. */
1014 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
1015 {
1016 	struct rmp_state state;
1017 
1018 	memset(&state, 0, sizeof(state));
1019 	state.assigned = 1;
1020 	state.asid = asid;
1021 	state.immutable = immutable;
1022 	state.gpa = gpa;
1023 	state.pagesize = PG_LEVEL_TO_RMP(level);
1024 
1025 	return rmpupdate(pfn, &state);
1026 }
1027 EXPORT_SYMBOL_GPL(rmp_make_private);
1028 
1029 /* Transition a page to hypervisor-owned/shared state in the RMP table. */
1030 int rmp_make_shared(u64 pfn, enum pg_level level)
1031 {
1032 	struct rmp_state state;
1033 
1034 	memset(&state, 0, sizeof(state));
1035 	state.pagesize = PG_LEVEL_TO_RMP(level);
1036 
1037 	return rmpupdate(pfn, &state);
1038 }
1039 EXPORT_SYMBOL_GPL(rmp_make_shared);
1040 
1041 void snp_leak_pages(u64 pfn, unsigned int npages)
1042 {
1043 	struct page *page = pfn_to_page(pfn);
1044 
1045 	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
1046 
1047 	spin_lock(&snp_leaked_pages_list_lock);
1048 	while (npages--) {
1049 
1050 		/*
1051 		 * Reuse the page's buddy list for chaining into the leaked
1052 		 * pages list. This page should not be on a free list currently
1053 		 * and is also unsafe to be added to a free list.
1054 		 */
1055 		if (likely(!PageCompound(page)) ||
1056 
1057 			/*
1058 			 * Skip inserting tail pages of compound page as
1059 			 * page->buddy_list of tail pages is not usable.
1060 			 */
1061 		    (PageHead(page) && compound_nr(page) <= npages))
1062 			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
1063 
1064 		dump_rmpentry(pfn);
1065 		snp_nr_leaked_pages++;
1066 		pfn++;
1067 		page++;
1068 	}
1069 	spin_unlock(&snp_leaked_pages_list_lock);
1070 }
1071 EXPORT_SYMBOL_GPL(snp_leak_pages);
1072 
1073 void kdump_sev_callback(void)
1074 {
1075 	/*
1076 	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
1077 	 * safely do SNP_SHUTDOWN on the local CPU.
1078 	 */
1079 	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
1080 		wbinvd();
1081 }
1082