xref: /linux/arch/x86/boot/startup/map_kernel.c (revision cdc8be31cb324a0c52529f192e39a44abcfff513)
// SPDX-License-Identifier: GPL-2.0

#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/pgtable.h>

#include <asm/init.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/sev.h>

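/*
 * Early page-table pool and its allocation cursor, defined elsewhere in the
 * early boot code; next_early_pgt indexes the next unused page in the pool.
 */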
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
extern unsigned int next_early_pgt;

static inline bool check_la57_support(void)
{
	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
		return false;

	/*
	 * 5-level paging is detected and enabled at kernel decompression
	 * stage. Only check if it has been enabled there.
	 */
	if (!(native_read_cr4() & X86_CR4_LA57))
		return false;

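	/*
	 * Record the 5-level paging geometry for the rest of early boot:
	 * the PGD level then translates bits 48 and up, and each P4D table
	 * has 512 entries.
	 */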
	__pgtable_l5_enabled	= 1;
	pgdir_shift		= 48;
	ptrs_per_p4d		= 512;

	return true;
}

static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
						    pmdval_t *pmd,
						    unsigned long p2v_offset)
{
	unsigned long paddr, paddr_end;
	int i;

	/* Encrypt the kernel and related (if SME is active) */
	sme_encrypt_kernel(bp);

	/*
	 * Clear the memory encryption mask from the .bss..decrypted section.
	 * The bss section will be memset to zero later in the initialization so
	 * there is no need to zero it after changing the memory encryption
	 * attribute.
	 */
	if (sme_get_me_mask()) {
		paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
		paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);

		for (; paddr < paddr_end; paddr += PMD_SIZE) {
			/*
			 * On SNP, transition the page to shared in the RMP table so that
			 * it is consistent with the page table attribute change.
			 *
			 * __start_bss_decrypted has a virtual address in the high range
			 * mapping (kernel .text). PVALIDATE, by way of
			 * early_snp_set_memory_shared(), requires a valid virtual
			 * address but the kernel is currently running off of the identity
			 * mapping so use the PA to get a *currently* valid virtual address.
			 */
			early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);

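			/*
			 * Strip the encryption mask from the PMD entry covering
			 * this 2M range so the .bss..decrypted region is mapped
			 * unencrypted.
			 */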
			i = pmd_index(paddr - p2v_offset);
			pmd[i] -= sme_get_me_mask();
		}
	}

	/*
	 * Return the SME encryption mask (if SME is active) to be used as a
	 * modifier for the initial pgdir entry programmed into CR3.
	 */
	return sme_get_me_mask();
}

/*
 * This code is compiled using PIC codegen because it will execute from the
 * early 1:1 mapping of memory, which deviates from the mapping expected by the
 * linker. Due to this deviation, taking the address of a global variable will
 * produce an ambiguous result when using the plain & operator.  Instead,
 * rip_rel_ptr() must be used, which will return the RIP-relative address in
 * the 1:1 mapping of memory. Kernel virtual addresses can be determined by
 * subtracting p2v_offset from the RIP-relative address.
 */
unsigned long __head __startup_64(unsigned long p2v_offset,
				  struct boot_params *bp)
{
	pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
	unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
	unsigned long va_text, va_end;
	unsigned long pgtable_flags;
	unsigned long load_delta;
	pgdval_t *pgd;
	p4dval_t *p4d;
	pudval_t *pud;
	pmdval_t *pmd, pmd_entry;
	bool la57;
	int i;

	la57 = check_la57_support();

	/* Is the address too large? */
	if (physaddr >> MAX_PHYSMEM_BITS)
		for (;;);

	/*
	 * Compute the delta between the address I am compiled to run at
	 * and the address I am actually running at.
	 */
	phys_base = load_delta = __START_KERNEL_map + p2v_offset;
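	/*
	 * Illustrative example, assuming __START_KERNEL_map of
	 * 0xffffffff80000000 and a kernel linked to run at 0xffffffff81000000:
	 * an image loaded at physical 0x1000000 gives
	 * p2v_offset = 0x1000000 - 0xffffffff81000000 and load_delta = 0;
	 * loading it N bytes higher gives load_delta = N.
	 */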

	/* Is the address not 2M aligned? */
	if (load_delta & ~PMD_MASK)
		for (;;);

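	/* Kernel virtual addresses of the image start and end */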
	va_text = physaddr - p2v_offset;
	va_end  = (unsigned long)rip_rel_ptr(_end) - p2v_offset;

	/* Include the SME encryption mask in the fixup value */
	load_delta += sme_get_me_mask();

	/* Fixup the physical addresses in the page table */

	pgd = rip_rel_ptr(early_top_pgt);
	pgd[pgd_index(__START_KERNEL_map)] += load_delta;

	if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
		p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;

		pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
	}

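	/*
	 * The last two entries of level3_kernel_pgt point to the kernel and
	 * fixmap level 2 tables; their physical addresses need the same fixup.
	 */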
	level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
	level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;

	for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
		level2_fixmap_pgt[i].pmd += load_delta;

	/*
	 * Set up the identity mapping for the switchover.  These
	 * entries should *NOT* have the global bit set!  This also
	 * creates a bunch of nonsense entries but that is fine --
	 * it avoids problems around wraparound.
	 */

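	/* Take the identity-map PUD and PMD pages from the early pgt pool */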
	pud = &early_pgts[0]->pmd;
	pmd = &early_pgts[1]->pmd;
	next_early_pgt = 2;

	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();

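	/*
	 * Populate two consecutive entries at each level so the mapping still
	 * covers the image if it straddles an entry boundary.
	 */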
	if (la57) {
		p4d = &early_pgts[next_early_pgt++]->pmd;

		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;

		i = physaddr >> P4D_SHIFT;
		p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
		p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
	} else {
		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
	}

	i = physaddr >> PUD_SHIFT;
	pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
	pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;

	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
	pmd_entry += sme_get_me_mask();
	pmd_entry += physaddr;

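	/* Map the [_text, _end) range with 2M pages at its physical address */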
	for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
		int idx = i + (physaddr >> PMD_SHIFT);

		pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
	}

	/*
	 * Fixup the kernel text+data virtual addresses. Note that we might
	 * write invalid pmds when the kernel is relocated; cleanup_highmap()
	 * fixes this up along with the mappings beyond _end.
	 *
	 * Only the region occupied by the kernel image has so far
	 * been checked against the table of usable memory regions
	 * provided by the firmware, so invalidate pages outside that
	 * region. A page table entry that maps to a reserved area of
	 * memory would allow processor speculation into that area,
	 * and on some hardware (particularly the UV platform) even
	 * speculative access to some reserved areas is caught as an
	 * error, causing the BIOS to halt the system.
	 */

	pmd = rip_rel_ptr(level2_kernel_pgt);

	/* invalidate pages before the kernel image */
	for (i = 0; i < pmd_index(va_text); i++)
		pmd[i] &= ~_PAGE_PRESENT;

	/* fixup pages that are part of the kernel image */
	for (; i <= pmd_index(va_end); i++)
		if (pmd[i] & _PAGE_PRESENT)
			pmd[i] += load_delta;

	/* invalidate pages after the kernel image */
	for (; i < PTRS_PER_PMD; i++)
		pmd[i] &= ~_PAGE_PRESENT;

	return sme_postprocess_startup(bp, pmd, p2v_offset);
}