// SPDX-License-Identifier: GPL-2.0
/* arch/x86/boot/startup/map_kernel.c (revision dbe0ad775cbbd614583409d96abbc8ea7edb5eb4) */

#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/pgtable.h>

#include <asm/init.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/sev.h>

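/*
 * Defined elsewhere in the early boot code: a small pool of page-table
 * pages used to construct the 1:1 mapping, and a cursor tracking how many
 * of those pages have been handed out so far.
 */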
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
extern unsigned int next_early_pgt;

static inline bool check_la57_support(void)
{
	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
		return false;

	/*
	 * 5-level paging is detected and enabled at the kernel decompression
	 * stage, so only check whether it was enabled there.
	 */
	if (!(native_read_cr4() & X86_CR4_LA57))
		return false;

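	/*
	 * Switch the paging-geometry globals to the 5-level layout. These
	 * stores go through RIP_REL_REF() because this code still runs from
	 * the 1:1 mapping rather than the kernel's link-time addresses.
	 */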
	RIP_REL_REF(__pgtable_l5_enabled)	= 1;
	RIP_REL_REF(pgdir_shift)		= 48;
	RIP_REL_REF(ptrs_per_p4d)		= 512;
	RIP_REL_REF(page_offset_base)		= __PAGE_OFFSET_BASE_L5;
	RIP_REL_REF(vmalloc_base)		= __VMALLOC_BASE_L5;
	RIP_REL_REF(vmemmap_base)		= __VMEMMAP_BASE_L5;

	return true;
}

static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
						    pmdval_t *pmd,
						    unsigned long p2v_offset)
{
	unsigned long paddr, paddr_end;
	int i;

	/* Encrypt the kernel and related areas (if SME is active) */
	sme_encrypt_kernel(bp);

	/*
	 * Clear the memory encryption mask from the .bss..decrypted section.
	 * The .bss section will be zeroed later during initialization, so
	 * there is no need to zero it again after changing the memory
	 * encryption attribute.
	 */
	if (sme_get_me_mask()) {
		paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
		paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);

		for (; paddr < paddr_end; paddr += PMD_SIZE) {
			/*
			 * On SNP, transition the page to shared in the RMP table so that
			 * it is consistent with the page table attribute change.
			 *
			 * __start_bss_decrypted has a virtual address in the high range
			 * mapping (kernel .text). PVALIDATE, by way of
			 * early_snp_set_memory_shared(), requires a valid virtual
			 * address, but the kernel is currently running from the identity
			 * mapping, so use the PA to get a *currently* valid virtual address.
			 */
			early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);

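			/*
			 * Strip the encryption mask from the 2M PMD entry
			 * mapping this range; the entry was written with the
			 * mask included, so subtracting it clears the C bit.
			 */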
			i = pmd_index(paddr - p2v_offset);
			pmd[i] -= sme_get_me_mask();
		}
	}

	/*
	 * Return the SME encryption mask (if SME is active) to be used as a
	 * modifier for the initial pgdir entry programmed into CR3.
	 */
	return sme_get_me_mask();
}

/*
 * Code in __startup_64() can be relocated during execution, but the compiler
 * doesn't have to generate PC-relative relocations when accessing globals from
 * that function. Clang actually does not generate them, which leads to
 * boot-time crashes. To work around this problem, every global pointer must
 * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
 * by subtracting p2v_offset from the RIP-relative address.
 */
unsigned long __head __startup_64(unsigned long p2v_offset,
				  struct boot_params *bp)
{
	pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
	unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
	unsigned long va_text, va_end;
	unsigned long pgtable_flags;
	unsigned long load_delta;
	pgdval_t *pgd;
	p4dval_t *p4d;
	pudval_t *pud;
	pmdval_t *pmd, pmd_entry;
	bool la57;
	int i;

	la57 = check_la57_support();

	/* Is the address too large? */
	if (physaddr >> MAX_PHYSMEM_BITS)
		for (;;);

	/*
	 * Compute the delta between the address I am compiled to run at
	 * and the address I am actually running at.
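	 *
	 * p2v_offset is the physical-minus-virtual offset of the image, so
	 * with va_text = physaddr - p2v_offset (see below) this works out to:
	 *
	 *   load_delta = physaddr - (va_text - __START_KERNEL_map)
	 *              = __START_KERNEL_map + p2v_offset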
	 */
	load_delta = __START_KERNEL_map + p2v_offset;
	RIP_REL_REF(phys_base) = load_delta;
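	/*
	 * phys_base is what __pa() and __pa_symbol() use to translate high
	 * kernel virtual addresses back to physical ones, so it must carry
	 * the same load delta.
	 */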

	/* Is the address not 2M aligned? */
	if (load_delta & ~PMD_MASK)
		for (;;);

	va_text = physaddr - p2v_offset;
	va_end  = (unsigned long)rip_rel_ptr(_end) - p2v_offset;

	/* Include the SME encryption mask in the fixup value */
	load_delta += sme_get_me_mask();

	/* Fixup the physical addresses in the page table */

	pgd = rip_rel_ptr(early_top_pgt);
	pgd[pgd_index(__START_KERNEL_map)] += load_delta;

	if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
		p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;

		pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
	}

	RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
	RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;

	for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
		RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;

	/*
	 * Set up the identity mapping for the switchover.  These
	 * entries should *NOT* have the global bit set!  This also
	 * creates a bunch of nonsense entries but that is fine --
	 * it avoids problems around wraparound.
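	 *
	 * Two adjacent entries are populated at each level below because the
	 * region being mapped may straddle the range covered by a single
	 * entry; writing the neighbouring entry as well keeps the mapping
	 * whole in that case.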
	 */

	pud = &early_pgts[0]->pmd;
	pmd = &early_pgts[1]->pmd;
	RIP_REL_REF(next_early_pgt) = 2;
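	/*
	 * The first two slots of the early page-table pool are taken by the
	 * PUD and PMD pages above; later allocations, such as the p4d page
	 * below on 5-level systems or tables created by the early page
	 * fault handler, start from slot 2.
	 */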

	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();

	if (la57) {
		p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;

		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;

		i = physaddr >> P4D_SHIFT;
		p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
		p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
	} else {
		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
	}

	i = physaddr >> PUD_SHIFT;
	pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
	pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;

	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
	/* Filter out unsupported __PAGE_KERNEL_* bits: */
	pmd_entry &= RIP_REL_REF(__supported_pte_mask);
	pmd_entry += sme_get_me_mask();
	pmd_entry += physaddr;

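	/*
	 * Cover the [va_text, va_end) range with 2M mappings; the PMD index
	 * wraps modulo PTRS_PER_PMD, in line with the wraparound note above.
	 */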
	for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
		int idx = i + (physaddr >> PMD_SHIFT);

		pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
	}

	/*
	 * Fixup the kernel text+data virtual addresses. Note that
	 * we might write invalid PMDs when the kernel is relocated;
	 * cleanup_highmap() fixes this up along with the mappings
	 * beyond _end.
	 *
	 * Only the region occupied by the kernel image has so far
	 * been checked against the table of usable memory regions
	 * provided by the firmware, so invalidate pages outside that
	 * region. A page table entry that maps to a reserved area of
	 * memory would allow processor speculation into that area,
	 * and on some hardware (particularly the UV platform) even
	 * speculative access to some reserved areas is caught as an
	 * error, causing the BIOS to halt the system.
	 */

	pmd = rip_rel_ptr(level2_kernel_pgt);
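	/*
	 * level2_kernel_pgt was filled at build time with link-time physical
	 * addresses covering KERNEL_IMAGE_SIZE, hence the load_delta fixup
	 * below for the entries that remain present.
	 */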

	/* invalidate pages before the kernel image */
	for (i = 0; i < pmd_index(va_text); i++)
		pmd[i] &= ~_PAGE_PRESENT;

	/* fixup pages that are part of the kernel image */
	for (; i <= pmd_index(va_end); i++)
		if (pmd[i] & _PAGE_PRESENT)
			pmd[i] += load_delta;

	/* invalidate pages after the kernel image */
	for (; i < PTRS_PER_PMD; i++)
		pmd[i] &= ~_PAGE_PRESENT;

	return sme_postprocess_startup(bp, pmd, p2v_offset);
}