xref: /linux/arch/x86/kernel/relocate_kernel_64.S (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * relocate_kernel.S - put the kernel image in place to boot
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/asm-offsets.h>

/*
 * Must be relocatable PIC code callable as a C function; in particular
 * there must be a plain RET and not a jump to the return thunk.
 */

#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
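/*
 * PTR(x) turns a table index into a byte offset (entries are 8 bytes);
 * PAGE_ATTR is the standard present + writable + accessed + dirty set
 * of page-table-entry flags.
 */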

/*
 * The .text..relocate_kernel and .data..relocate_kernel sections are copied
 * into the control page, and the remainder of the page is used as the stack.
 */

	.section .data..relocate_kernel,"a";
/* Minimal CPU state */
SYM_DATA_LOCAL(saved_rsp, .quad 0)
SYM_DATA_LOCAL(saved_cr0, .quad 0)
SYM_DATA_LOCAL(saved_cr3, .quad 0)
SYM_DATA_LOCAL(saved_cr4, .quad 0)
	/* other data */
SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
SYM_DATA(kexec_debug_8250_port, .word 0)
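/*
 * These variables land in the control page copy, so they stay reachable
 * via %rip-relative addressing after the jump below; the non-local ones
 * are filled in from C before relocate_kernel() is called.
 */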

	.balign 16
SYM_DATA_START_LOCAL(kexec_debug_gdt)
	.word   kexec_debug_gdt_end - kexec_debug_gdt - 1
	.long   0
	.word   0
	.quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
	.quad   0x00af9a000000ffff      /* __KERNEL_CS */
	.quad   0x00cf92000000ffff      /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
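/*
 * The first eight bytes double as the (never-loaded) null descriptor
 * and a home for the GDT limit word, which identity_mapped reads back
 * when it builds a GDTR on the stack.  The three live descriptors are
 * flat base-0, 4GiB-limit ring-0 segments: 32-bit code (0x00cf9a...),
 * 64-bit code (0x00af9a..., L bit set) and read/write data (0x00cf92...).
 */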

	.balign 8
SYM_DATA_START(kexec_debug_idt)
	.skip 0x100, 0x00
SYM_DATA_END(kexec_debug_idt)
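/*
 * 0x100 bytes is room for sixteen 16-byte gate descriptors, one per
 * exception vector 0-15, matching the 0xff IDTR limit loaded in
 * identity_mapped() and the stubs in kexec_debug_exc_vectors below.
 */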

	.section .text..relocate_kernel,"ax";
	.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR
	/*
	 * %rdi indirection_page
	 * %rsi pa_control_page
	 * %rdx start address
	 * %rcx flags: RELOC_KERNEL_*
	 */

	/* Save the CPU context, used for jumping back */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushf

	/* Invalidate GDT/IDT, zero out flags */
	pushq	$0
	pushq	$0

	lidt	(%rsp)
	lgdt	(%rsp)
	addq	$8, %rsp
	popfq

	/* Switch to the identity mapped page tables */
	movq	%cr3, %rax
	movq	kexec_pa_table_page(%rip), %r9
	movq	%r9, %cr3

	/* Leave CR4 in %r13 to enable the right paging mode later. */
	movq	%cr4, %r13

	/*
	 * Disable global pages immediately to ensure this mapping is RWX.
	 * Disable LASS before jumping to the identity mapped page.
	 */
	movq	%r13, %r12
	andq	$~(X86_CR4_PGE | X86_CR4_LASS), %r12
	movq	%r12, %cr4

	/* Save %rsp and CRs. */
	movq	%r13, saved_cr4(%rip)
	movq	%rsp, saved_rsp(%rip)
	movq	%rax, saved_cr3(%rip)
	movq	%cr0, %rax
	movq	%rax, saved_cr0(%rip)

	/* save indirection list for jumping back */
	movq	%rdi, pa_backup_pages_map(%rip)

	/* Save the flags to %r11 as swap_pages clobbers %rcx. */
	movq	%rcx, %r11

	/* set up a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%rsi), %rsp

	/* jump to identity mapped page */
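	/*
	 * %rsi holds the physical address of the control page; adding
	 * (identity_mapped - 0b) and subtracting (__relocate_kernel_start
	 * - 0b) turns it into the physical address of identity_mapped
	 * within the copy of this section in the control page.
	 */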
0:	addq	$identity_mapped - 0b, %rsi
	subq	$__relocate_kernel_start - 0b, %rsi
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(relocate_kernel)

SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi	indirection page
	 * %rdx start address
	 * %r9 page table page
	 * %r11 flags: RELOC_KERNEL_*
	 * %r13 original CR4 when relocate_kernel() was invoked
	 */

	/* store the start address on the stack */
	pushq	%rdx

	/* Create a GDTR (16 bits limit, 64 bits addr) on stack */
	leaq	kexec_debug_gdt(%rip), %rax
	pushq	%rax
	pushw	(%rax)

	/* Load the GDT, put the stack back */
	lgdt	(%rsp)
	addq	$10, %rsp
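	/*
	 * The pushq supplies the 8-byte base and the pushw the 2-byte
	 * limit (read from the word stored at kexec_debug_gdt itself),
	 * forming the 10-byte pseudo-descriptor that lgdt expects at
	 * (%rsp); the addq then discards it.
	 */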

	/* Test that we can load segments */
	movq	%ds, %rax
	movq	%rax, %ds

	/* Now an IDTR on the stack to load the IDT the kernel created */
	leaq	kexec_debug_idt(%rip), %rsi
	pushq	%rsi
	pushw	$0xff
	lidt	(%rsp)
	addq	$10, %rsp
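	/*
	 * Limit 0xff covers 0x100 bytes, i.e. the sixteen 16-byte gates
	 * of kexec_debug_idt: exactly vectors 0-15.
	 */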

	//int3

	/*
	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
	 * below.
	 */
	movq	%cr4, %rax
	andq	$~(X86_CR4_CET), %rax
	movq	%rax, %cr4

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 *  - 5-level paging, if it was enabled before
	 *  - Machine check exception on TDX guest, if it was enabled before.
	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
	 *
	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
	 * PAE is always set in the original CR4.
	 */
	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
	movq	%r13, %cr4
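	/*
	 * The ALTERNATIVE is nops on most systems; on a TDX guest the
	 * alternatives patching replaces it at boot with the orl that
	 * keeps CR4.MCE set.
	 */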

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/*
	 * If the memory cache is in an incoherent state, e.g. due to
	 * memory encryption, do WBINVD to flush it.
	 *
	 * If SME is active, there could be old encrypted cache line
	 * entries that will conflict with the now unencrypted memory
	 * used by kexec. Flush the caches before copying the kernel.
	 *
	 * Note that SME sets this flag whenever the platform supports
	 * SME, so WBINVD is performed even if SME is not activated by
	 * the kernel; that does no harm.
	 */
	testb	$RELOC_KERNEL_CACHE_INCOHERENT, %r11b
	jz	.Lnowbinvd
	wbinvd
.Lnowbinvd:

	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.  So I flush
	 * the TLB by reloading %cr3; it's handy, and not processor
	 * dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
	jnz	.Lrelocate

	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	xorl	%esi, %esi
	xorl	%edi, %edi
	xorl	%ebp, %ebp
	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	xorl	%r11d, %r11d
	xorl	%r12d, %r12d
	xorl	%r13d, %r13d
	xorl	%r14d, %r14d
	xorl	%r15d, %r15d

	ANNOTATE_UNRET_SAFE
	ret
	int3

.Lrelocate:
	popq	%rdx

	/* Use the swap page for the callee's stack */
	movq	kexec_pa_swap_page(%rip), %r10
	leaq	PAGE_SIZE(%r10), %rsp

	/* push the existing entry point onto the callee's stack */
	pushq	%rdx

	ANNOTATE_RETPOLINE_SAFE
	call	*%rdx

	/* get the re-entry point of the peer system */
	popq	%rbp
	movq	kexec_pa_swap_page(%rip), %r10
	movq	pa_backup_pages_map(%rip), %rdi
	movq	kexec_pa_table_page(%rip), %rax
	movq	%rax, %cr3

	/* Find start (and end) of this physical mapping of control page */
	leaq	(%rip), %r8
	ANNOTATE_NOENDBR
	andq	$PAGE_MASK, %r8
	lea	PAGE_SIZE(%r8), %rsp
	/*
	 * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
	 * swap_pages() can swap pages correctly.  Note all other
	 * RELOC_KERNEL_* flags passed to relocate_kernel() are not
	 * restored.
	 */
	movl	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
	call	swap_pages
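	/*
	 * Compute the virtual address of virtual_mapped in the kernel's
	 * mapping of the control page, the same way the physical address
	 * of identity_mapped was computed above, then "return" to it via
	 * push + ret.
	 */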
	movq	kexec_va_control_page(%rip), %rax
0:	addq	$virtual_mapped - 0b, %rax
	subq	$__relocate_kernel_start - 0b, %rax
	pushq	%rax
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(identity_mapped)

SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // RET target, above
	movq	saved_rsp(%rip), %rsp
	movq	saved_cr4(%rip), %rax
	movq	%rax, %cr4
	movq	saved_cr3(%rip), %rax
	movq	saved_cr0(%rip), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0

#ifdef CONFIG_KEXEC_JUMP
	/* Saved in save_processor_state. */
	movq	$saved_context, %rax
	lgdt	saved_context_gdt_desc(%rax)
#endif

	/* relocate_kernel() returns the re-entry point for next time */
	movq	%rbp, %rax

	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(virtual_mapped)

	/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi indirection page
	 * %r11 flags: RELOC_KERNEL_*
	 */
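	/*
	 * Each 8-byte indirection entry is a page-aligned physical
	 * address with flags in the low bits: 0x1 sets the destination,
	 * 0x2 points at the next indirection page, 0x4 ends the list and
	 * 0x8 names a source page to copy (IND_DESTINATION,
	 * IND_INDIRECTION, IND_DONE and IND_SOURCE in <linux/kexec.h>).
	 */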
	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
	xorl	%edi, %edi
	xorl	%esi, %esi
	jmp	.Lstart		/* Should start with an indirection record */

.Lloop:	/* top, read another word from the indirection page */

	movq	(%rbx), %rcx
	addq	$8,	%rbx
.Lstart:
	testb	$0x1,	%cl   /* is it a destination page? */
	jz	.Lnotdest
	movq	%rcx,	%rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	.Lloop
.Lnotdest:
	testb	$0x2,	%cl   /* is it an indirection page? */
	jz	.Lnotind
	movq	%rcx,	%rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	.Lloop
.Lnotind:
	testb	$0x4,	%cl   /* is it the done indicator? */
	jz	.Lnotdone
	jmp	.Ldone
.Lnotdone:
	testb	$0x8,	%cl   /* is it the source indicator? */
	jz	.Lloop	      /* Ignore it otherwise */
	movq	%rcx,	%rsi  /* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	movq	%rdi, %rdx    /* Save destination page to %rdx */
	movq	%rsi, %rax    /* Save source page to %rax */

	/* Only actually swap for ::preserve_context */
	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
	jz	.Lnoswap

	/* copy source page to swap page */
	movq	kexec_pa_swap_page(%rip), %rdi
	movl	$512, %ecx
	rep movsq

	/* copy destination page to source page */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movl	$512, %ecx
	rep movsq

	/* copy swap page to destination page */
	movq	%rdx, %rdi
	movq	kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
	movl	$512, %ecx
	rep movsq
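	/*
	 * 512 quadwords = 4096 bytes, so each rep movsq above moves
	 * exactly one page.  The three-way rotation through the swap
	 * page preserves the destination's old contents for the return
	 * trip when the context is being preserved.
	 */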

	lea	PAGE_SIZE(%rax), %rsi
	jmp	.Lloop
.Ldone:
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(swap_pages)

/*
 * Generic 'print character' routine
 *  - %al: Character to be printed (may clobber %rax)
 *  - %rdx: MMIO address or port.
 */
#define XMTRDY          0x20

#define TXR             0       /*  Transmit register (WRITE) */
#define LSR             5       /*  Line Status               */
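/*
 * Standard 8250 UART handshake: poll the Line Status Register until
 * the transmit-holding-register-empty bit (XMTRDY) is set, then write
 * the byte to the transmit register at offset 0.  The xchg below parks
 * the character in %ah while %al is used for the status reads.
 */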

SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	addw	$LSR, %dx
	xchg	%al, %ah
.Lxmtrdy_loop:
	inb	%dx, %al
	testb	$XMTRDY, %al
	jnz	.Lready
	pause
	jmp	.Lxmtrdy_loop

.Lready:
	subw	$LSR, %dx
	xchg	%al, %ah
	outb	%al, %dx
pr_char_null:
	ANNOTATE_NOENDBR

	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250)

SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
	movb	(LSR*4)(%rdx), %ah
	testb	$XMTRDY, %ah
	jnz	.Lready_mmio
	pause
	jmp	.Lxmtrdy_loop_mmio

.Lready_mmio:
	movb	%al, (%rdx)
	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250_mmio32)
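/*
 * Same handshake as pr_char_8250, but for a memory-mapped 8250 with a
 * 32-bit register stride: register N lives at byte offset N*4, hence
 * LSR*4 for the status reads and offset 0 for the transmit register.
 */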

/*
 * Load pr_char function pointer into %rsi and load %rdx with whatever
 * that function wants to see there (typically port/MMIO address).
 */
.macro pr_setup
	leaq	pr_char_8250(%rip), %rsi
	movw	kexec_debug_8250_port(%rip), %dx
	testw	%dx, %dx
	jnz	1f

	leaq	pr_char_8250_mmio32(%rip), %rsi
	movq	kexec_debug_8250_mmio32(%rip), %rdx
	testq	%rdx, %rdx
	jnz	1f

	leaq	pr_char_null(%rip), %rsi
1:
.endm

/* Print the nybble in %bl, clobber %rax */
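/*
 * Classic hex-digit conversion: add '0', and if the result went past
 * '9' (i.e. reached 0x3a) add the 'a' - '0' - 10 correction to land in
 * 'a'-'f'.  The tail jump through %rsi emits the character via
 * whichever pr_char routine pr_setup selected.
 */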
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
	UNWIND_HINT_FUNC
	movb	%bl, %al
	nop
	andb	$0x0f, %al
	addb	$0x30, %al
	cmpb	$0x3a, %al
	jb	1f
	addb	$('a' - '0' - 10), %al
	ANNOTATE_RETPOLINE_SAFE
1:	jmp	*%rsi
SYM_CODE_END(pr_nybble)
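/*
 * Print %rbx as 16 hex digits followed by a newline: rotating left by
 * 4 each iteration brings the most significant nybble into %bl first.
 */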
SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
	UNWIND_HINT_FUNC
	movq	$16, %rcx
1:	rolq	$4, %rbx
	call	pr_nybble
	loop	1b
	movb	$'\n', %al
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(pr_qword)

.macro print_reg a, b, c, d, r
	movb	$\a, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\b, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\c, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\d, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movq	\r, %rbx
	call	pr_qword
.endm

SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
	/* Each of these is 6 bytes. */
.macro vec_err exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	nop
	nop
	pushq	$\exc
	jmp	exc_handler
.endm

.macro vec_noerr exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	pushq	$0
	pushq	$\exc
	jmp	exc_handler
.endm
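/*
 * Each stub is two 2-byte pushes plus a 2-byte short jmp.  The CPU
 * pushes an error code itself for the vec_err vectors (#DF, #TS, #NP,
 * #SS, #GP, #PF), so those stubs pad with two nops instead of the
 * dummy "pushq $0", keeping every stub the same size and the stack
 * frame layout uniform.
 */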

	ANNOTATE_NOENDBR
	vec_noerr  0 // #DE
	vec_noerr  1 // #DB
	vec_noerr  2 // #NMI
	vec_noerr  3 // #BP
	vec_noerr  4 // #OF
	vec_noerr  5 // #BR
	vec_noerr  6 // #UD
	vec_noerr  7 // #NM
	vec_err    8 // #DF
	vec_noerr  9
	vec_err   10 // #TS
	vec_err   11 // #NP
	vec_err   12 // #SS
	vec_err   13 // #GP
	vec_err   14 // #PF
	vec_noerr 15
SYM_CODE_END(kexec_debug_exc_vectors)

SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
	/* No need for RET mitigations during kexec */
	VALIDATE_UNRET_END

	pushq	%rax
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	pushq	%rsi

	/* Stack frame */
#define EXC_SS		0x58 /* Architectural... */
#define EXC_RSP		0x50
#define EXC_EFLAGS	0x48
#define EXC_CS		0x40
#define EXC_RIP		0x38
#define EXC_ERRORCODE	0x30 /* Either architectural or zero pushed by handler */
#define EXC_EXCEPTION	0x28 /* Pushed by handler entry point */
#define EXC_RAX		0x20 /* Pushed just above in exc_handler */
#define EXC_RBX		0x18
#define EXC_RCX		0x10
#define EXC_RDX		0x08
#define EXC_RSI		0x00
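/*
 * All offsets are relative to %rsp after the five pushes above: the
 * saved GPRs sit lowest, then the vector number and error code from
 * the entry stub, then the architectural iretq frame (RIP through SS).
 */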

	/* Set up %rdx/%rsi for debug output */
	pr_setup

	/* rip and exception info */
	print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
	print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
	print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
	print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)

	/* We spilled these to the stack */
	print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
	print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
	print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
	print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
	print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)

	/* Other registers untouched */
	print_reg 'r', 'd', 'i', ':', %rdi
	print_reg 'r', '8', ' ', ':', %r8
	print_reg 'r', '9', ' ', ':', %r9
	print_reg 'r', '1', '0', ':', %r10
	print_reg 'r', '1', '1', ':', %r11
	print_reg 'r', '1', '2', ':', %r12
	print_reg 'r', '1', '3', ':', %r13
	print_reg 'r', '1', '4', ':', %r14
	print_reg 'r', '1', '5', ':', %r15
	print_reg 'c', 'r', '2', ':', %cr2

	/* Only return from INT3 */
	cmpq	$3, EXC_EXCEPTION(%rsp)
	jne	.Ldie

	popq	%rsi
	popq	%rdx
	popq	%rcx
	popq	%rbx
	popq	%rax

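	/* Drop the vector number and error code pushed by the stub */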
	addq	$16, %rsp
	iretq

.Ldie:
	hlt
	jmp	.Ldie

SYM_CODE_END(exc_handler)