xref: /linux/arch/x86/virt/svm/sev.c (revision 53597deca0e38c30e6cd4ba2114fa42d2bcd85bb)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * AMD SVM-SEV Host Support.
4  *
5  * Copyright (C) 2023 Advanced Micro Devices, Inc.
6  *
7  * Author: Ashish Kalra <ashish.kalra@amd.com>
8  *
9  */
10 
11 #include <linux/cc_platform.h>
12 #include <linux/printk.h>
13 #include <linux/mm_types.h>
14 #include <linux/set_memory.h>
15 #include <linux/memblock.h>
16 #include <linux/kernel.h>
17 #include <linux/mm.h>
18 #include <linux/cpumask.h>
19 #include <linux/iommu.h>
20 #include <linux/amd-iommu.h>
21 #include <linux/nospec.h>
22 
23 #include <asm/sev.h>
24 #include <asm/processor.h>
25 #include <asm/setup.h>
26 #include <asm/svm.h>
27 #include <asm/smp.h>
28 #include <asm/cpu.h>
29 #include <asm/apic.h>
30 #include <asm/cpuid/api.h>
31 #include <asm/cmdline.h>
32 #include <asm/iommu.h>
33 #include <asm/msr.h>
34 
35 /*
36  * The RMP entry information as returned by the RMPREAD instruction.
37  */
38 struct rmpentry {
39 	u64 gpa;
40 	u8  assigned		:1,
41 	    rsvd1		:7;
42 	u8  pagesize		:1,
43 	    hpage_region_status	:1,
44 	    rsvd2		:6;
45 	u8  immutable		:1,
46 	    rsvd3		:7;
47 	u8  rsvd4;
48 	u32 asid;
49 } __packed;
50 
51 /*
52  * The raw RMP entry format is not architectural. The format is defined in the PPR
53  * for Family 19h, Model 01h, Rev B1 processors. This format represents the actual
54  * entry in the RMP table memory. The bitfield definitions are used for machines
55  * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
56  * fields are only used for dumping the raw data.
57  */
58 struct rmpentry_raw {
59 	union {
60 		struct {
61 			u64 assigned	: 1,
62 			    pagesize	: 1,
63 			    immutable	: 1,
64 			    rsvd1	: 9,
65 			    gpa		: 39,
66 			    asid	: 10,
67 			    vmsa	: 1,
68 			    validated	: 1,
69 			    rsvd2	: 1;
70 		};
71 		u64 lo;
72 	};
73 	u64 hi;
74 } __packed;
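/*
 * Purely illustrative encoding of the raw format above (hypothetical values,
 * not taken from hardware): a 4KB guest page at GPA 0x100000 that is assigned
 * to ASID 1 and has been validated by the guest would be encoded as
 *
 *	lo = (1ULL << 62) |		(validated)
 *	     (1ULL << 51) |		(asid = 1)
 *	     (0x100ULL << 12) |		(gpa = GPA >> PAGE_SHIFT)
 *	     1				(assigned)
 *	   = 0x4008000000100001, hi = 0
 */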
75 
76 /*
77  * The first 16KB from RMP_BASE is used by the processor for bookkeeping; this
78  * range must be added to the base when performing an RMP entry lookup.
79  */
80 #define RMPTABLE_CPU_BOOKKEEPING_SZ	0x4000
81 
82 /*
83  * For a non-segmented RMP table, use the maximum physical addressing as the
84  * segment size in order to always arrive at index 0 in the table.
85  */
86 #define RMPTABLE_NON_SEGMENTED_SHIFT	52
87 
88 struct rmp_segment_desc {
89 	struct rmpentry_raw *rmp_entry;
90 	u64 max_index;
91 	u64 size;
92 };
93 
94 /*
95  * Segmented RMP Table support.
96  *   - The segment size is used for two purposes:
97  *     - Identify the amount of memory covered by an RMP segment
98  *     - Quickly locate an RMP segment table entry for a physical address
99  *
100  *   - The RMP segment table contains pointers to an RMP table that covers
101  *     a specific portion of memory. There can be up to 512 8-byte entries,
102  *     one page's worth.
103  */
104 #define RST_ENTRY_MAPPED_SIZE(x)	((x) & GENMASK_ULL(19, 0))
105 #define RST_ENTRY_SEGMENT_BASE(x)	((x) & GENMASK_ULL(51, 20))
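/*
 * Illustrative decode of a hypothetical RST entry value of 0x0000000100000040:
 * the mapped size field (bits 19:0) is 0x40, i.e. 64GB covered, and the
 * segment base (bits 51:20) is 0x100000000. The RMP segment backing those
 * 64GB needs 64GB / 4KB * 16 bytes = 256MB of memory.
 */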
106 
107 #define RST_SIZE SZ_4K
108 static struct rmp_segment_desc **rmp_segment_table __ro_after_init;
109 static unsigned int rst_max_index __ro_after_init = 512;
110 
111 static unsigned int rmp_segment_shift;
112 static u64 rmp_segment_size;
113 static u64 rmp_segment_mask;
114 
115 #define RST_ENTRY_INDEX(x)	((x) >> rmp_segment_shift)
116 #define RMP_ENTRY_INDEX(x)	((u64)(PHYS_PFN((x) & rmp_segment_mask)))
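/*
 * Example of the index calculations, assuming a 64GB segment size
 * (rmp_segment_shift == 36): for PA 0x1180000000 (70GB), RST_ENTRY_INDEX()
 * yields 1 (the second segment) and RMP_ENTRY_INDEX() yields
 * PHYS_PFN(0x180000000) == 0x180000, the entry within that segment's RMP.
 */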
117 
118 static u64 rmp_cfg;
119 
120 static void *rmp_bookkeeping __ro_after_init;
121 
122 /* Mask to apply to a PFN to get the first PFN of a 2MB page */
123 #define PFN_PMD_MASK	GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
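/* E.g. with 4KB pages (PMD_SHIFT - PAGE_SHIFT == 9): 0x12345 & PFN_PMD_MASK == 0x12200 */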
124 
125 static u64 probed_rmp_base, probed_rmp_size;
126 
127 static LIST_HEAD(snp_leaked_pages_list);
128 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
129 
130 static unsigned long snp_nr_leaked_pages;
131 
132 #undef pr_fmt
133 #define pr_fmt(fmt)	"SEV-SNP: " fmt
134 
135 static void mfd_reconfigure(void *arg)
136 {
137 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
138 		return;
139 
140 	if (arg)
141 		msr_set_bit(MSR_AMD64_SYSCFG, MSR_AMD64_SYSCFG_MFDM_BIT);
142 	else
143 		msr_clear_bit(MSR_AMD64_SYSCFG, MSR_AMD64_SYSCFG_MFDM_BIT);
144 }
145 
146 static void snp_enable(void *arg)
147 {
148 	u64 val;
149 
150 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
151 		return;
152 
153 	rdmsrq(MSR_AMD64_SYSCFG, val);
154 
155 	val |= MSR_AMD64_SYSCFG_SNP_EN;
156 	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
157 
158 	wrmsrq(MSR_AMD64_SYSCFG, val);
159 }
160 
161 static void __init __snp_fixup_e820_tables(u64 pa)
162 {
163 	if (IS_ALIGNED(pa, PMD_SIZE))
164 		return;
165 
166 	/*
167 	 * Handle cases where the RMP table placement by the BIOS is not
168 	 * 2M aligned and the kexec kernel could try to allocate
169 	 * from within that chunk, which then causes a fatal RMP fault.
170 	 *
171 	 * The e820_table needs to be updated as it is converted to
172 	 * kernel memory resources and used by KEXEC_FILE_LOAD syscall
173 	 * to load kexec segments.
174 	 *
175 	 * The e820_table_firmware needs to be updated as it is exposed
176 	 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
177 	 * segments.
178 	 *
179 	 * The e820_table_kexec needs to be updated as it is passed to
180 	 * the kexec-ed kernel.
181 	 */
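	/*
	 * For instance, a BIOS-placed RMP boundary of 0x7f781000 is rounded
	 * down to 0x7f600000 here, and that whole 2MB chunk is converted to
	 * reserved if it is currently described as RAM.
	 */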
182 	pa = ALIGN_DOWN(pa, PMD_SIZE);
183 	if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
184 		pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
185 		e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
186 		e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
187 		if (!memblock_is_region_reserved(pa, PMD_SIZE))
188 			memblock_reserve(pa, PMD_SIZE);
189 	}
190 }
191 
192 static void __init fixup_e820_tables_for_segmented_rmp(void)
193 {
194 	u64 pa, *rst, size, mapped_size;
195 	unsigned int i;
196 
197 	__snp_fixup_e820_tables(probed_rmp_base);
198 
199 	pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
200 
201 	__snp_fixup_e820_tables(pa + RST_SIZE);
202 
203 	rst = early_memremap(pa, RST_SIZE);
204 	if (!rst)
205 		return;
206 
207 	for (i = 0; i < rst_max_index; i++) {
208 		pa = RST_ENTRY_SEGMENT_BASE(rst[i]);
209 		mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
210 		if (!mapped_size)
211 			continue;
212 
213 		__snp_fixup_e820_tables(pa);
214 
215 		/*
216 		 * Mapped size in GB. Mapped size is allowed to exceed
217 		 * the segment coverage size, but gets reduced to the
218 		 * segment coverage size.
219 		 */
220 		mapped_size <<= 30;
221 		if (mapped_size > rmp_segment_size)
222 			mapped_size = rmp_segment_size;
223 
224 		/* Calculate the RMP segment size (16 bytes/page mapped) */
225 		size = PHYS_PFN(mapped_size) << 4;
226 
227 		__snp_fixup_e820_tables(pa + size);
228 	}
229 
230 	early_memunmap(rst, RST_SIZE);
231 }
232 
233 static void __init fixup_e820_tables_for_contiguous_rmp(void)
234 {
235 	__snp_fixup_e820_tables(probed_rmp_base);
236 	__snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
237 }
238 
239 void __init snp_fixup_e820_tables(void)
240 {
241 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
242 		fixup_e820_tables_for_segmented_rmp();
243 	} else {
244 		fixup_e820_tables_for_contiguous_rmp();
245 	}
246 }
247 
248 static void clear_rmp(void)
249 {
250 	unsigned int i;
251 	u64 val;
252 
253 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
254 		return;
255 
256 	/* Clearing the RMP while SNP is enabled will cause an exception */
257 	rdmsrq(MSR_AMD64_SYSCFG, val);
258 	if (WARN_ON_ONCE(val & MSR_AMD64_SYSCFG_SNP_EN))
259 		return;
260 
261 	memset(rmp_bookkeeping, 0, RMPTABLE_CPU_BOOKKEEPING_SZ);
262 
263 	for (i = 0; i < rst_max_index; i++) {
264 		struct rmp_segment_desc *desc;
265 
266 		desc = rmp_segment_table[i];
267 		if (!desc)
268 			continue;
269 
270 		memset(desc->rmp_entry, 0, desc->size);
271 	}
272 }
273 
274 static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa)
275 {
276 	u64 rst_index, rmp_segment_size_max;
277 	struct rmp_segment_desc *desc;
278 	void *rmp_segment;
279 
280 	/* Calculate the maximum size an RMP can be (16 bytes/page mapped) */
281 	rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4;
282 
283 	/* Validate the RMP segment size */
284 	if (segment_size > rmp_segment_size_max) {
285 		pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n",
286 		       segment_size, rmp_segment_size_max);
287 		return false;
288 	}
289 
290 	/* Validate the RMP segment table index */
291 	rst_index = RST_ENTRY_INDEX(pa);
292 	if (rst_index >= rst_max_index) {
293 		pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n",
294 		       pa, rmp_segment_size);
295 		return false;
296 	}
297 
298 	if (rmp_segment_table[rst_index]) {
299 		pr_err("RMP segment descriptor already exists at index %llu\n", rst_index);
300 		return false;
301 	}
302 
303 	rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB);
304 	if (!rmp_segment) {
305 		pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n",
306 		       segment_pa, segment_size);
307 		return false;
308 	}
309 
310 	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
311 	if (!desc) {
312 		memunmap(rmp_segment);
313 		return false;
314 	}
315 
316 	desc->rmp_entry = rmp_segment;
317 	desc->max_index = segment_size / sizeof(*desc->rmp_entry);
318 	desc->size = segment_size;
319 
320 	rmp_segment_table[rst_index] = desc;
321 
322 	return true;
323 }
324 
325 static void __init free_rmp_segment_table(void)
326 {
327 	unsigned int i;
328 
329 	for (i = 0; i < rst_max_index; i++) {
330 		struct rmp_segment_desc *desc;
331 
332 		desc = rmp_segment_table[i];
333 		if (!desc)
334 			continue;
335 
336 		memunmap(desc->rmp_entry);
337 
338 		kfree(desc);
339 	}
340 
341 	free_page((unsigned long)rmp_segment_table);
342 
343 	rmp_segment_table = NULL;
344 }
345 
346 /* Allocate the table used to index into the RMP segments */
347 static bool __init alloc_rmp_segment_table(void)
348 {
349 	struct page *page;
350 
351 	page = alloc_page(__GFP_ZERO);
352 	if (!page)
353 		return false;
354 
355 	rmp_segment_table = page_address(page);
356 
357 	return true;
358 }
359 
360 static bool __init setup_contiguous_rmptable(void)
361 {
362 	u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end;
363 
364 	if (!probed_rmp_size)
365 		return false;
366 
367 	rmp_end = probed_rmp_base + probed_rmp_size - 1;
368 
369 	/*
370 	 * Calculate the amount of memory that must be reserved by the BIOS to
371 	 * address the whole RAM, including the bookkeeping area. The RMP itself
372 	 * must also be covered.
373 	 */
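	/*
	 * E.g. for roughly 1TB of RAM (max_pfn == 0x10000000) with the RMP
	 * placed below the end of RAM, the BIOS must have reserved at least
	 * 0x10000000 * 16 + 16KB, i.e. just over 4GB, for the RMP table.
	 */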
374 	max_rmp_pfn = max_pfn;
375 	if (PFN_UP(rmp_end) > max_pfn)
376 		max_rmp_pfn = PFN_UP(rmp_end);
377 
378 	calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
379 	if (calc_rmp_sz > probed_rmp_size) {
380 		pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
381 		       calc_rmp_sz, probed_rmp_size);
382 		return false;
383 	}
384 
385 	if (!alloc_rmp_segment_table())
386 		return false;
387 
388 	/* Map only the RMP entries */
389 	rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
390 	rmptable_size    = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
391 
392 	if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) {
393 		free_rmp_segment_table();
394 		return false;
395 	}
396 
397 	return true;
398 }
399 
400 static bool __init setup_segmented_rmptable(void)
401 {
402 	u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max;
403 	unsigned int i, max_index;
404 
405 	if (!probed_rmp_base)
406 		return false;
407 
408 	if (!alloc_rmp_segment_table())
409 		return false;
410 
411 	rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
412 	rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB);
413 	if (!rst) {
414 		pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa);
415 		goto e_free;
416 	}
417 
418 	pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30);
419 
420 	ram_pa_max = max_pfn << PAGE_SHIFT;
421 
422 	max_index = 0;
423 	ram_pa_end = 0;
424 	for (i = 0; i < rst_max_index; i++) {
425 		u64 rmp_segment, rmp_size, mapped_size;
426 
427 		mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
428 		if (!mapped_size)
429 			continue;
430 
431 		max_index = i;
432 
433 		/*
434 		 * Mapped size in GB. Mapped size is allowed to exceed the
435 		 * segment coverage size, but gets reduced to the segment
436 		 * coverage size.
437 		 */
438 		mapped_size <<= 30;
439 		if (mapped_size > rmp_segment_size) {
440 			pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n",
441 				i, mapped_size, rmp_segment_size);
442 			mapped_size = rmp_segment_size;
443 		}
444 
445 		rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]);
446 
447 		/* Calculate the RMP segment size (16 bytes/page mapped) */
448 		rmp_size = PHYS_PFN(mapped_size) << 4;
449 
450 		pa = (u64)i << rmp_segment_shift;
451 
452 		/*
453 		 * Some segments may be for MMIO mapped above system RAM. These
454 		 * segments are used for Trusted I/O.
455 		 */
456 		if (pa < ram_pa_max)
457 			ram_pa_end = pa + mapped_size;
458 
459 		if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa))
460 			goto e_unmap;
461 
462 		pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n",
463 			i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1);
464 	}
465 
466 	if (ram_pa_max > ram_pa_end) {
467 		pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
468 		       ram_pa_max, ram_pa_end);
469 		goto e_unmap;
470 	}
471 
472 	/* Adjust the maximum index based on the found segments */
473 	rst_max_index = max_index + 1;
474 
475 	memunmap(rst);
476 
477 	return true;
478 
479 e_unmap:
480 	memunmap(rst);
481 
482 e_free:
483 	free_rmp_segment_table();
484 
485 	return false;
486 }
487 
488 static bool __init setup_rmptable(void)
489 {
490 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
491 		if (!setup_segmented_rmptable())
492 			return false;
493 	} else {
494 		if (!setup_contiguous_rmptable())
495 			return false;
496 	}
497 
498 	rmp_bookkeeping = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB);
499 	if (!rmp_bookkeeping) {
500 		pr_err("Failed to map RMP bookkeeping area\n");
501 		free_rmp_segment_table();
502 
503 		return false;
504 	}
505 
506 	return true;
507 }
508 
509 static void clear_hsave_pa(void *arg)
510 {
511 	wrmsrq(MSR_VM_HSAVE_PA, 0);
512 }
513 
514 void snp_prepare(void)
515 {
516 	u64 val;
517 
518 	/*
519 	 * Check if SEV-SNP is already enabled; this can happen in the case
520 	 * of a kexec boot.
521 	 */
522 	rdmsrq(MSR_AMD64_SYSCFG, val);
523 	if (val & MSR_AMD64_SYSCFG_SNP_EN)
524 		return;
525 
526 	clear_rmp();
527 
528 	cpus_read_lock();
529 
530 	/*
531 	 * MtrrFixDramModEn is not shared between threads on a core and must
532 	 * therefore be set on all CPUs prior to enabling SNP.
533 	 */
534 	on_each_cpu(mfd_reconfigure, (void *)1, 1);
535 	on_each_cpu(snp_enable, NULL, 1);
536 
537 	/* SNP_INIT requires MSR_VM_HSAVE_PA to be cleared on all CPUs. */
538 	on_each_cpu(clear_hsave_pa, NULL, 1);
539 
540 	cpus_read_unlock();
541 }
542 EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
543 
544 void snp_shutdown(void)
545 {
546 	u64 syscfg;
547 
548 	rdmsrq(MSR_AMD64_SYSCFG, syscfg);
549 	if (syscfg & MSR_AMD64_SYSCFG_SNP_EN)
550 		return;
551 
552 	clear_rmp();
553 	on_each_cpu(mfd_reconfigure, NULL, 1);
554 }
555 EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
556 
557 /*
558  * Do the necessary preparations which are verified by the firmware as
559  * described in the SNP_INIT_EX firmware command description in the SNP
560  * firmware ABI spec.
561  */
562 int __init snp_rmptable_init(void)
563 {
564 	if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP)))
565 		return -ENOSYS;
566 
567 	if (WARN_ON_ONCE(!amd_iommu_snp_en))
568 		return -ENOSYS;
569 
570 	if (!setup_rmptable())
571 		return -ENOSYS;
572 
573 	/*
574 	 * Set crash_kexec_post_notifiers to 'true' to ensure that the SNP panic
575 	 * notifier is invoked to do the SNP IOMMU shutdown before kdump.
576 	 */
577 	crash_kexec_post_notifiers = true;
578 
579 	return 0;
580 }
581 
582 static void set_rmp_segment_info(unsigned int segment_shift)
583 {
584 	rmp_segment_shift = segment_shift;
585 	rmp_segment_size  = 1ULL << rmp_segment_shift;
586 	rmp_segment_mask  = rmp_segment_size - 1;
587 }
588 
589 #define RMP_ADDR_MASK GENMASK_ULL(51, 13)
590 
591 static bool probe_contiguous_rmptable_info(void)
592 {
593 	u64 rmp_sz, rmp_base, rmp_end;
594 
595 	rdmsrq(MSR_AMD64_RMP_BASE, rmp_base);
596 	rdmsrq(MSR_AMD64_RMP_END, rmp_end);
597 
598 	if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
599 		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
600 		return false;
601 	}
602 
603 	if (rmp_base > rmp_end) {
604 		pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
605 		return false;
606 	}
607 
608 	rmp_sz = rmp_end - rmp_base + 1;
609 
610 	/* Treat the contiguous RMP table as a single segment */
611 	rst_max_index = 1;
612 
613 	set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT);
614 
615 	probed_rmp_base = rmp_base;
616 	probed_rmp_size = rmp_sz;
617 
618 	pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
619 		rmp_base, rmp_end);
620 
621 	return true;
622 }
623 
624 static bool probe_segmented_rmptable_info(void)
625 {
626 	unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max;
627 	u64 rmp_base, rmp_end;
628 
629 	rdmsrq(MSR_AMD64_RMP_BASE, rmp_base);
630 	if (!(rmp_base & RMP_ADDR_MASK)) {
631 		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
632 		return false;
633 	}
634 
635 	rdmsrq(MSR_AMD64_RMP_END, rmp_end);
636 	WARN_ONCE(rmp_end & RMP_ADDR_MASK,
637 		  "Segmented RMP enabled but RMP_END MSR is non-zero\n");
638 
639 	/* Obtain the min and max supported RMP segment size */
640 	eax = cpuid_eax(0x80000025);
641 	segment_shift_min = eax & GENMASK(5, 0);
642 	segment_shift_max = (eax & GENMASK(11, 6)) >> 6;
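	/*
	 * E.g. a (hypothetical) EAX value of 0x924 would advertise a minimum
	 * and maximum segment shift of 36, i.e. only 64GB segments supported.
	 */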
643 
644 	/* Verify the segment size is within the supported limits */
645 	segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg);
646 	if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) {
647 		pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n",
648 		       segment_shift, segment_shift_min, segment_shift_max);
649 		return false;
650 	}
651 
652 	/* Override the max supported RST index if a hardware limit exists */
653 	ebx = cpuid_ebx(0x80000025);
654 	if (ebx & BIT(10))
655 		rst_max_index = ebx & GENMASK(9, 0);
656 
657 	set_rmp_segment_info(segment_shift);
658 
659 	probed_rmp_base = rmp_base;
660 	probed_rmp_size = 0;
661 
662 	pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n",
663 		rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE);
664 
665 	return true;
666 }
667 
668 bool snp_probe_rmptable_info(void)
669 {
670 	if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP))
671 		rdmsrq(MSR_AMD64_RMP_CFG, rmp_cfg);
672 
673 	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
674 		return probe_segmented_rmptable_info();
675 	else
676 		return probe_contiguous_rmptable_info();
677 }
678 
679 /*
680  * About the array_index_nospec() usage below:
681  *
682  * This function can get called by exported functions like
683  * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among
684  * others, and since the @pfn passed in cannot always be trusted,
685  * speculation should be stopped as a protective measure.
686  */
687 static struct rmpentry_raw *get_raw_rmpentry(u64 pfn)
688 {
689 	u64 paddr, rst_index, segment_index;
690 	struct rmp_segment_desc *desc;
691 
692 	if (!rmp_segment_table)
693 		return ERR_PTR(-ENODEV);
694 
695 	paddr = pfn << PAGE_SHIFT;
696 
697 	rst_index = RST_ENTRY_INDEX(paddr);
698 	if (unlikely(rst_index >= rst_max_index))
699 		return ERR_PTR(-EFAULT);
700 
701 	rst_index = array_index_nospec(rst_index, rst_max_index);
702 
703 	desc = rmp_segment_table[rst_index];
704 	if (unlikely(!desc))
705 		return ERR_PTR(-EFAULT);
706 
707 	segment_index = RMP_ENTRY_INDEX(paddr);
708 	if (unlikely(segment_index >= desc->max_index))
709 		return ERR_PTR(-EFAULT);
710 
711 	segment_index = array_index_nospec(segment_index, desc->max_index);
712 
713 	return desc->rmp_entry + segment_index;
714 }
715 
716 static int get_rmpentry(u64 pfn, struct rmpentry *e)
717 {
718 	struct rmpentry_raw *e_raw;
719 
720 	if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) {
721 		int ret;
722 
723 		/* Binutils version 2.44 supports the RMPREAD mnemonic. */
724 		asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd"
725 			     : "=a" (ret)
726 			     : "a" (pfn << PAGE_SHIFT), "c" (e)
727 			     : "memory", "cc");
728 
729 		return ret;
730 	}
731 
732 	e_raw = get_raw_rmpentry(pfn);
733 	if (IS_ERR(e_raw))
734 		return PTR_ERR(e_raw);
735 
736 	/*
737 	 * Map the raw RMP table entry onto the RMPREAD output format.
738 	 * The 2MB region status indicator (hpage_region_status field) is not
739 	 * calculated, since the overhead could be significant and the field
740 	 * is not used.
741 	 */
742 	memset(e, 0, sizeof(*e));
743 	e->gpa       = e_raw->gpa << PAGE_SHIFT;
744 	e->asid      = e_raw->asid;
745 	e->assigned  = e_raw->assigned;
746 	e->pagesize  = e_raw->pagesize;
747 	e->immutable = e_raw->immutable;
748 
749 	return 0;
750 }
751 
752 static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level)
753 {
754 	struct rmpentry e_large;
755 	int ret;
756 
757 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
758 		return -ENODEV;
759 
760 	ret = get_rmpentry(pfn, e);
761 	if (ret)
762 		return ret;
763 
764 	/*
765 	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
766 	 * RMP entry or a special large RMP entry that is authoritative for a
767 	 * whole 2M area.
768 	 */
769 	ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large);
770 	if (ret)
771 		return ret;
772 
773 	*level = RMP_TO_PG_LEVEL(e_large.pagesize);
774 
775 	return 0;
776 }
777 
778 int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
779 {
780 	struct rmpentry e;
781 	int ret;
782 
783 	ret = __snp_lookup_rmpentry(pfn, &e, level);
784 	if (ret)
785 		return ret;
786 
787 	*assigned = !!e.assigned;
788 	return 0;
789 }
790 EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
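/*
 * Illustrative usage sketch of snp_lookup_rmpentry() (hypothetical caller,
 * not part of this file):
 *
 *	bool assigned;
 *	int level, ret;
 *
 *	ret = snp_lookup_rmpentry(pfn, &assigned, &level);
 *	if (!ret && assigned && level == PG_LEVEL_2M)
 *		...	(the 2MB region containing @pfn is guest-owned)
 *
 * Callers such as the KVM #PF handler use this to determine whether the RMP
 * state of a PFN is the cause of a fault.
 */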
791 
792 /*
793  * Dump the raw RMP entry for a particular PFN. These bits are documented in the
794  * PPR for a particular CPU model and provide useful information about how a
795  * particular PFN is being utilized by the kernel/firmware at the time certain
796  * unexpected events occur, such as RMP faults.
797  */
798 static void dump_rmpentry(u64 pfn)
799 {
800 	struct rmpentry_raw *e_raw;
801 	u64 pfn_i, pfn_end;
802 	struct rmpentry e;
803 	int level, ret;
804 
805 	ret = __snp_lookup_rmpentry(pfn, &e, &level);
806 	if (ret) {
807 		pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n",
808 		       pfn, ret);
809 		return;
810 	}
811 
812 	if (e.assigned) {
813 		e_raw = get_raw_rmpentry(pfn);
814 		if (IS_ERR(e_raw)) {
815 			pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n",
816 			       pfn, PTR_ERR(e_raw));
817 			return;
818 		}
819 
820 		pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
821 			pfn, e_raw->lo, e_raw->hi);
822 		return;
823 	}
824 
825 	/*
826 	 * If the RMP entry for a particular PFN is not in an assigned state,
827 	 * then it is sometimes useful to get an idea of whether or not any RMP
828 	 * entries for other PFNs within the same 2MB region are assigned, since
829 	 * those too can affect the ability to access a particular PFN in
830 	 * certain situations, such as when the PFN is being accessed via a 2MB
831 	 * mapping in the host page table.
832 	 */
833 	pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
834 	pfn_end = pfn_i + PTRS_PER_PMD;
835 
836 	pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
837 		pfn, pfn_i, pfn_end);
838 
839 	while (pfn_i < pfn_end) {
840 		e_raw = get_raw_rmpentry(pfn_i);
841 		if (IS_ERR(e_raw)) {
842 			pr_err("Error %ld reading RMP contents for PFN 0x%llx\n",
843 			       PTR_ERR(e_raw), pfn_i);
844 			pfn_i++;
845 			continue;
846 		}
847 
848 		if (e_raw->lo || e_raw->hi)
849 			pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi);
850 		pfn_i++;
851 	}
852 }
853 
854 void snp_dump_hva_rmpentry(unsigned long hva)
855 {
856 	unsigned long paddr;
857 	unsigned int level;
858 	pgd_t *pgd;
859 	pte_t *pte;
860 
861 	pgd = __va(read_cr3_pa());
862 	pgd += pgd_index(hva);
863 	pte = lookup_address_in_pgd(pgd, hva, &level);
864 
865 	if (!pte) {
866 		pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
867 		return;
868 	}
869 
870 	paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
871 	dump_rmpentry(PHYS_PFN(paddr));
872 }
873 
874 /*
875  * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
876  * Validated bit.
877  */
878 int psmash(u64 pfn)
879 {
880 	unsigned long paddr = pfn << PAGE_SHIFT;
881 	int ret;
882 
883 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
884 		return -ENODEV;
885 
886 	if (!pfn_valid(pfn))
887 		return -EINVAL;
888 
889 	/* Binutils version 2.36 supports the PSMASH mnemonic. */
890 	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
891 		      : "=a" (ret)
892 		      : "a" (paddr)
893 		      : "memory", "cc");
894 
895 	return ret;
896 }
897 EXPORT_SYMBOL_GPL(psmash);
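/*
 * Illustrative only: a caller wanting to smash the 2MB region containing an
 * arbitrary PFN would typically pass the 2MB-aligned PFN of that region, e.g.
 *
 *	ret = psmash(pfn & PFN_PMD_MASK);
 */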
898 
899 /*
900  * If the kernel uses a 2MB or larger directmap mapping to write to an address,
901  * and that mapping contains any 4KB pages that are set to private in the RMP
902  * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
903  * owns the PFNs being transitioned will never attempt such a write, but other
904  * kernel tasks writing to other PFNs in the range may trigger these checks
905  * inadvertently due a large directmap mapping that happens to overlap such a
906  * inadvertently due to a large directmap mapping that happens to overlap such a
907  *
908  * Prevent this by splitting any 2MB+ mappings that might end up containing a
909  * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
910  * PFN/rmp_level passed in.
911  *
912  * Note that there is no attempt here to scan all the RMP entries for the 2MB
913  * physical range, since it would only be worthwhile in determining if a
914  * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
915  * the same shared/private state, thus avoiding the need to split the mapping.
916  * But that would mean the entries are currently in a mixed state, and so the
917  * mapping would have already been split as a result of prior transitions.
918  * And since the 4K split is only done if the mapping is 2MB+, and there isn't
919  * currently a mechanism in place to restore 2MB+ mappings, such a check would
920  * not provide any usable benefit.
921  *
922  * More specifics on how these checks are carried out can be found in APM
923  * Volume 2, "RMP and VMPL Access Checks".
924  */
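/*
 * Illustrative example of the above: if PFN 0x101 is about to become a
 * private 4KB page and the direct map currently covers PFNs 0x100-0x1ff with
 * a single 2MB mapping, that mapping is split into 4KB PTEs so that unrelated
 * writes through the direct map cannot hit the newly private page via a 2MB
 * access.
 */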
925 static int adjust_direct_map(u64 pfn, int rmp_level)
926 {
927 	unsigned long vaddr;
928 	unsigned int level;
929 	int npages, ret;
930 	pte_t *pte;
931 
932 	/*
933 	 * pfn_to_kaddr() will return a vaddr only within the direct
934 	 * map range.
935 	 */
936 	vaddr = (unsigned long)pfn_to_kaddr(pfn);
937 
938 	/* Only 4KB/2MB RMP entries are supported by current hardware. */
939 	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
940 		return -EINVAL;
941 
942 	if (!pfn_valid(pfn))
943 		return -EINVAL;
944 
945 	if (rmp_level == PG_LEVEL_2M &&
946 	    (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
947 		return -EINVAL;
948 
949 	/*
950 	 * If an entire 2MB physical range is being transitioned, then there is
951 	 * no risk of RMP #PFs due to write accesses from overlapping mappings,
952 	 * since even accesses from 1GB mappings will be treated as 2MB accesses
953 	 * as far as RMP table checks are concerned.
954 	 */
955 	if (rmp_level == PG_LEVEL_2M)
956 		return 0;
957 
958 	pte = lookup_address(vaddr, &level);
959 	if (!pte || pte_none(*pte))
960 		return 0;
961 
962 	if (level == PG_LEVEL_4K)
963 		return 0;
964 
965 	npages = page_level_size(rmp_level) / PAGE_SIZE;
966 	ret = set_memory_4k(vaddr, npages);
967 	if (ret)
968 		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
969 			pfn, ret);
970 
971 	return ret;
972 }
973 
974 /*
975  * These operations are expected to be infrequent enough that no mutual
976  * exclusion of updaters is needed; thus, the overlap error condition below
977  * should happen very rarely and gets resolved relatively quickly by
978  * the firmware.
979  *
980  * If not, one could consider introducing a mutex or so here to sync concurrent
981  * RMP updates and thus diminish the amount of cases where firmware needs to
982  * lock 2M ranges to protect against concurrent updates.
983  *
984  * The optimal solution would be range locking to avoid locking disjoint
985  * regions unnecessarily but there's no support for that yet.
986  */
987 static int rmpupdate(u64 pfn, struct rmp_state *state)
988 {
989 	unsigned long paddr = pfn << PAGE_SHIFT;
990 	int ret, level;
991 
992 	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
993 		return -ENODEV;
994 
995 	level = RMP_TO_PG_LEVEL(state->pagesize);
996 
997 	if (adjust_direct_map(pfn, level))
998 		return -EFAULT;
999 
1000 	do {
1001 		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
1002 		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
1003 			     : "=a" (ret)
1004 			     : "a" (paddr), "c" ((unsigned long)state)
1005 			     : "memory", "cc");
1006 	} while (ret == RMPUPDATE_FAIL_OVERLAP);
1007 
1008 	if (ret) {
1009 		pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
1010 		       pfn, level, ret);
1011 		dump_rmpentry(pfn);
1012 		dump_stack();
1013 		return -EFAULT;
1014 	}
1015 
1016 	return 0;
1017 }
1018 
1019 /* Transition a page to guest-owned/private state in the RMP table. */
1020 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
1021 {
1022 	struct rmp_state state;
1023 
1024 	memset(&state, 0, sizeof(state));
1025 	state.assigned = 1;
1026 	state.asid = asid;
1027 	state.immutable = immutable;
1028 	state.gpa = gpa;
1029 	state.pagesize = PG_LEVEL_TO_RMP(level);
1030 
1031 	return rmpupdate(pfn, &state);
1032 }
1033 EXPORT_SYMBOL_GPL(rmp_make_private);
1034 
1035 /* Transition a page to hypervisor-owned/shared state in the RMP table. */
1036 int rmp_make_shared(u64 pfn, enum pg_level level)
1037 {
1038 	struct rmp_state state;
1039 
1040 	memset(&state, 0, sizeof(state));
1041 	state.pagesize = PG_LEVEL_TO_RMP(level);
1042 
1043 	return rmpupdate(pfn, &state);
1044 }
1045 EXPORT_SYMBOL_GPL(rmp_make_shared);
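/*
 * Illustrative usage sketch (hypothetical values, no caller in this file):
 *
 *	(assign the 4KB page at PFN 0x123456 to ASID 5 as guest GPA 0x200000)
 *	ret = rmp_make_private(0x123456, 0x200000, PG_LEVEL_4K, 5, false);
 *	...
 *	(return the page to the hypervisor-owned/shared state when done)
 *	ret = rmp_make_shared(0x123456, PG_LEVEL_4K);
 *
 * If rmp_make_shared() fails, the page must no longer be touched or freed by
 * the host and is typically handed to __snp_leak_pages() instead.
 */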
1046 
1047 void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp)
1048 {
1049 	struct page *page = pfn_to_page(pfn);
1050 
1051 	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
1052 
1053 	spin_lock(&snp_leaked_pages_list_lock);
1054 	while (npages--) {
1055 
1056 		/*
1057 		 * Reuse the page's buddy list for chaining into the leaked
1058 		 * pages list. This page should not be on a free list currently
1059 		 * and is also unsafe to be added to a free list.
1060 		 */
1061 		if (likely(!PageCompound(page)) ||
1062 
1063 			/*
1064 			 * Skip inserting tail pages of a compound page as
1065 			 * page->buddy_list of tail pages is not usable.
1066 			 */
1067 		    (PageHead(page) && compound_nr(page) <= npages))
1068 			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
1069 
1070 		if (dump_rmp)
1071 			dump_rmpentry(pfn);
1072 		snp_nr_leaked_pages++;
1073 		pfn++;
1074 		page++;
1075 	}
1076 	spin_unlock(&snp_leaked_pages_list_lock);
1077 }
1078 EXPORT_SYMBOL_GPL(__snp_leak_pages);
1079 
1080 void kdump_sev_callback(void)
1081 {
1082 	/*
1083 	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
1084 	 * safely do SNP_SHUTDOWN on the local CPU.
1085 	 */
1086 	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
1087 		wbinvd();
1088 }
1089