xref: /linux/arch/x86/kernel/relocate_kernel_64.S (revision e7f24a388e703e505a7f8d014a428308b35e8f94)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * relocate_kernel.S - put the kernel image in place to boot
4 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
5 */
6
7#include <linux/linkage.h>
8#include <linux/stringify.h>
9#include <asm/alternative.h>
10#include <asm/page_types.h>
11#include <asm/kexec.h>
12#include <asm/processor-flags.h>
13#include <asm/pgtable_types.h>
14#include <asm/nospec-branch.h>
15#include <asm/unwind_hints.h>
16#include <asm/asm-offsets.h>
17
18/*
19 * Must be relocatable PIC code callable as a C function, in particular
20 * there must be a plain RET and not jump to return thunk.
21 */
22
/*
 * PTR(x): x shifted left by 3, i.e. x * 8.  The argument is parenthesized
 * so that an expression passed as x is shifted as a whole instead of
 * having only its last operand bound to the shift.
 *
 * PAGE_ATTR: bitwise OR of the present, read/write, accessed and dirty
 * page flags.
 *
 * NOTE(review): no user of either macro was found in this file -- confirm
 * before relying on or removing them.
 */
23#define PTR(x) ((x) << 3)
24#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
25
26/*
27 * The .text..relocate_kernel and .data..relocate_kernel sections are copied
28 * into the control page, and the remainder of the page is used as the stack.
29 */
30
31	.section .data..relocate_kernel,"a";
32/* Minimal CPU state */
	/*
	 * These four slots are written by relocate_kernel before it leaves the
	 * original stack and page tables, and are read back by virtual_mapped
	 * on the jump-back path.
	 */
33SYM_DATA_LOCAL(saved_rsp, .quad 0)
34SYM_DATA_LOCAL(saved_cr0, .quad 0)
35SYM_DATA_LOCAL(saved_cr3, .quad 0)
36SYM_DATA_LOCAL(saved_cr4, .quad 0)
37	/* other data */
	/*
	 * None of the non-local symbols below is written in this file.
	 * NOTE(review): presumably they are filled in by the caller side before
	 * relocate_kernel runs -- confirm.
	 */
	/* Base to which identity_mapped adds the offset of virtual_mapped. */
38SYM_DATA(kexec_va_control_page, .quad 0)
	/* Value loaded into %cr3 by relocate_kernel and by identity_mapped. */
39SYM_DATA(kexec_pa_table_page, .quad 0)
	/* Scratch page used by swap_pages and as callee stack in identity_mapped. */
40SYM_DATA(kexec_pa_swap_page, .quad 0)
	/* Copy of the %rdi argument of relocate_kernel, reloaded before swapping back. */
41SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
	/*
	 * Debug UART location, read by pr_setup: a non-zero port wins, otherwise
	 * a non-zero MMIO address is used, otherwise output is discarded.
	 */
42SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
43SYM_DATA(kexec_debug_8250_port, .word 0)
44
45	.balign 16
/*
 * kexec_debug_gdt - descriptor table loaded by identity_mapped.
 *
 * The first 8 bytes (word + long + word) take the place of descriptor 0.
 * Their leading word holds the table size minus one; identity_mapped
 * reads that word back as the limit when it builds the GDTR on the stack.
 * Three 8-byte descriptors follow, named in the comments next to them.
 */
46SYM_DATA_START_LOCAL(kexec_debug_gdt)
47	.word   kexec_debug_gdt_end - kexec_debug_gdt - 1
48	.long   0
49	.word   0
50	.quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
51	.quad   0x00af9a000000ffff      /* __KERNEL_CS */
52	.quad   0x00cf92000000ffff      /* __KERNEL_DS */
53SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
54
55	.balign 8
/*
 * kexec_debug_idt - 0x100 zero-filled bytes reserved for the IDT that
 * identity_mapped loads with a limit of 0xff.  Nothing in this file writes
 * the entries.
 * NOTE(review): presumably the entries are filled in elsewhere to point at
 * kexec_debug_exc_vectors -- confirm.
 */
56SYM_DATA_START(kexec_debug_idt)
57	.skip 0x100, 0x00
58SYM_DATA_END(kexec_debug_idt)
59
60	.section .text..relocate_kernel,"ax";
61	.code64
/*
 * relocate_kernel - first stage of the hand-over.
 *
 * Saves the callee-saved registers and the flags on the incoming stack,
 * loads zeroed GDT/IDT descriptors, switches %cr3 to kexec_pa_table_page,
 * records %rsp, CR0, CR3, CR4 and the indirection page for a later return,
 * moves the stack to pa_control_page + PAGE_SIZE and jumps to
 * identity_mapped at its offset inside the control page.
 *
 * There is no RET in this stage; on the preserve-context path the return
 * to the caller is performed by virtual_mapped.
 */
62SYM_CODE_START_NOALIGN(relocate_kernel)
63	UNWIND_HINT_END_OF_STACK
64	ANNOTATE_NOENDBR
65	/*
66	 * %rdi indirection_page
67	 * %rsi pa_control_page
68	 * %rdx start address
69	 * %rcx flags: RELOC_KERNEL_*
70	 */
71
72	/* Save the CPU context, used for jumping back */
	/* virtual_mapped pops these in the reverse order. */
73	pushq %rbx
74	pushq %rbp
75	pushq %r12
76	pushq %r13
77	pushq %r14
78	pushq %r15
79	pushf
80
81	/* Invalidate GDT/IDT, zero out flags */
	/*
	 * Two zero quadwords are pushed.  Both lidt and lgdt read their
	 * descriptor from those zero bytes; one quadword is then dropped and
	 * the other is popped into the flags register.
	 */
82	pushq	$0
83	pushq	$0
84
85	lidt	(%rsp)
86	lgdt	(%rsp)
87	addq	$8, %rsp
88	popfq
89
90	/* Switch to the identity mapped page tables */
	/* The old %cr3 stays in %rax until it is stored in saved_cr3 below. */
91	movq	%cr3, %rax
92	movq	kexec_pa_table_page(%rip), %r9
93	movq	%r9, %cr3
94
95	/* Leave CR4 in %r13 to enable the right paging mode later. */
96	movq	%cr4, %r13
97
98	/*
99	 * Disable global pages immediately to ensure this mapping is RWX.
100	 * Disable LASS before jumping to the identity mapped page.
101	 */
102	movq	%r13, %r12
103	andq	$~(X86_CR4_PGE | X86_CR4_LASS), %r12
104	movq	%r12, %cr4
105
106	/* Save %rsp and CRs. */
	/* saved_cr4 receives the unmodified CR4 value held in %r13. */
107	movq	%r13, saved_cr4(%rip)
108	movq    %rsp, saved_rsp(%rip)
109	movq	%rax, saved_cr3(%rip)
110	movq	%cr0, %rax
111	movq	%rax, saved_cr0(%rip)
112
113	/* save indirection list for jumping back */
114	movq	%rdi, pa_backup_pages_map(%rip)
115
116	/* Save the flags to %r11 as swap_pages clobbers %rcx. */
117	movq	%rcx, %r11
118
119	/* setup a new stack at the end of the physical control page */
120	lea	PAGE_SIZE(%rsi), %rsp
121
122	/* jump to identity mapped page */
	/*
	 * The two immediates below add up to
	 * identity_mapped - __relocate_kernel_start, so %rsi ends up as
	 * pa_control_page plus that offset.
	 */
1230:	addq	$identity_mapped - 0b, %rsi
124	subq	$__relocate_kernel_start - 0b, %rsi
125	ANNOTATE_RETPOLINE_SAFE
126	jmp	*%rsi
127SYM_CODE_END(relocate_kernel)
128
/*
 * identity_mapped - second stage, entered by the indirect jump at the end
 * of relocate_kernel and running on the stack placed at the end of the
 * control page.
 *
 * Loads kexec_debug_gdt and kexec_debug_idt, puts CR0 and CR4 into a known
 * state, runs swap_pages over the indirection list and then either
 *  - clears the general purpose registers and RETs to the start address
 *    (RELOC_KERNEL_PRESERVE_CONTEXT clear), or
 *  - calls the start address on a stack inside the swap page and, once
 *    that call returns, swaps the pages back and RETs to virtual_mapped
 *    (RELOC_KERNEL_PRESERVE_CONTEXT set).
 */
128SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
129	UNWIND_HINT_END_OF_STACK
130	/*
131	 * %rdi	indirection page
132	 * %rdx start address
133	 * %r9 page table page
134	 * %r11 flags: RELOC_KERNEL_*
135	 * %r13 original CR4 when relocate_kernel() was invoked
136	 */
137
138	/*
139	 * Set return address to 0 if not preserving context. The purgatory
140	 * shipped in kexec-tools will unconditionally look for the return
141	 * address on the stack and set a kexec_jump_back_entry= command
142	 * line option if it's non-zero. There's no other way that it can
143	 * tell a preserve-context (kjump) kexec from a normal one.
144	 */
145	pushq	$0
146	/* store the start address on the stack */
	/*
	 * The RET on the non-preserve path pops this value, which leaves the
	 * zero pushed above on top of the stack for the new image.
	 */
147	pushq   %rdx
148
149	/* Create a GDTR (16 bits limit, 64 bits addr) on stack */
	/* Base is pushed first, then the limit word stored at the start of the table. */
150	leaq	kexec_debug_gdt(%rip), %rax
151	pushq	%rax
152	pushw	(%rax)
153
154	/* Load the GDT, put the stack back */
	/* 10 = the 8-byte base plus the 2-byte limit pushed above. */
155	lgdt	(%rsp)
156	addq	$10, %rsp
157
158	/* Test that we can load segments */
159	movq	%ds, %rax
160	movq	%rax, %ds
161
162	/* Now an IDTR on the stack to load the IDT the kernel created */
	/* Same 10-byte layout as the GDTR; limit 0xff covers the 0x100-byte table. */
163	leaq	kexec_debug_idt(%rip), %rsi
164	pushq	%rsi
165	pushw	$0xff
166	lidt	(%rsp)
167	addq	$10, %rsp
168
169	//int3
170
171	/*
172	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
173	 * below.
174	 */
175	movq	%cr4, %rax
176	andq	$~(X86_CR4_CET), %rax
177	movq	%rax, %cr4
178
179	/*
180	 * Set cr0 to a known state:
181	 *  - Paging enabled
182	 *  - Alignment check disabled
183	 *  - Write protect disabled
184	 *  - No task switch
185	 *  - Don't do FP software emulation.
186	 *  - Protected mode enabled
187	 */
188	movq	%cr0, %rax
189	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
190	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
191	movq	%rax, %cr0
192
193	/*
194	 * Set cr4 to a known state:
195	 *  - physical address extension enabled
196	 *  - 5-level paging, if it was enabled before
197	 *  - Machine check exception on TDX guest, if it was enabled before.
198	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
199	 *
200	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
201	 * PAE is always set in the original CR4.
202	 */
203	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
204	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
205	movq	%r13, %cr4
206
207	/* Flush the TLB (needed?) */
208	movq	%r9, %cr3
209
210	/*
211	 * If the memory cache is in incoherent state, e.g., due to
212	 * memory encryption, do WBINVD to flush cache.
213	 *
214	 * If SME is active, there could be old encrypted cache line
215	 * entries that will conflict with the now unencrypted memory
216	 * used by kexec. Flush the caches before copying the kernel.
217	 *
218	 * Note SME sets this flag to true when the platform supports
219	 * SME, so the WBINVD is performed even if SME is not activated
220	 * by the kernel.  This does no harm.
221	 */
222	testb	$RELOC_KERNEL_CACHE_INCOHERENT, %r11b
223	jz .Lnowbinvd
224	wbinvd
225.Lnowbinvd:
226
	/* %rdi = indirection page, %r11 = flags; %r11 is still tested below. */
227	call	swap_pages
228
229	/*
230	 * To be certain of avoiding problems with self-modifying code
231	 * I need to execute a serializing instruction here.
232	 * So I flush the TLB by reloading %cr3 here, it's handy,
233	 * and not processor dependent.
234	 */
235	movq	%cr3, %rax
236	movq	%rax, %cr3
237
238	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
239	jnz .Lrelocate
240
241	/*
242	 * set all of the registers to known values
243	 * leave %rsp alone
244	 */
245
246	xorl	%eax, %eax
247	xorl	%ebx, %ebx
248	xorl    %ecx, %ecx
249	xorl    %edx, %edx
250	xorl    %esi, %esi
251	xorl    %edi, %edi
252	xorl    %ebp, %ebp
253	xorl	%r8d, %r8d
254	xorl	%r9d, %r9d
255	xorl	%r10d, %r10d
256	xorl	%r11d, %r11d
257	xorl	%r12d, %r12d
258	xorl	%r13d, %r13d
259	xorl	%r14d, %r14d
260	xorl	%r15d, %r15d
261
	/* Pops the start address pushed at the top of this stage and jumps to it. */
262	ANNOTATE_UNRET_SAFE
263	ret
264	int3
265
266.Lrelocate:
	/* Preserve-context path: take the start address back off the stack. */
267	popq	%rdx
268
269	/* Use the swap page for the callee's stack */
270	movq	kexec_pa_swap_page(%rip), %r10
271	leaq	PAGE_SIZE(%r10), %rsp
272
273	/* push the existing entry point onto the callee's stack */
274	pushq	%rdx
275
276	ANNOTATE_RETPOLINE_SAFE
277	call	*%rdx
278
279	/* get the re-entry point of the peer system */
	/*
	 * %rbp carries this value into virtual_mapped, which hands it back as
	 * the return value of relocate_kernel.
	 */
280	popq	%rbp
281	movq	kexec_pa_swap_page(%rip), %r10
282	movq	pa_backup_pages_map(%rip), %rdi
283	movq	kexec_pa_table_page(%rip), %rax
284	movq	%rax, %cr3
285
286	/* Find start (and end) of this physical mapping of control page */
	/* Current code address rounded down to a page; the stack goes at its end. */
287	leaq	(%rip), %r8
288	ANNOTATE_NOENDBR
289	andq	$PAGE_MASK, %r8
290	lea	PAGE_SIZE(%r8), %rsp
291	/*
292	 * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
293	 * swap_pages() can swap pages correctly.  Note all other
294	 * RELOC_KERNEL_* flags passed to relocate_kernel() are not
295	 * restored.
296	 */
297	movl	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
298	call	swap_pages
	/*
	 * The two immediates below add up to
	 * virtual_mapped - __relocate_kernel_start, so %rax ends up as
	 * kexec_va_control_page plus that offset; pushq + ret then jumps there.
	 */
299	movq	kexec_va_control_page(%rip), %rax
3000:	addq	$virtual_mapped - 0b, %rax
301	subq	$__relocate_kernel_start - 0b, %rax
302	pushq	%rax
303	ANNOTATE_UNRET_SAFE
304	ret
305	int3
306SYM_CODE_END(identity_mapped)
308
/*
 * virtual_mapped - last stage of the jump-back path, reached through the
 * pushq/ret pair at the end of identity_mapped.
 *
 * Restores %rsp, CR4, CR3 and CR0 from the saved_* slots, pops the flags
 * and the callee-saved registers pushed at the top of relocate_kernel and
 * returns to the caller of relocate_kernel.
 *
 * In:  %rbp = value popped in identity_mapped after the called image
 *             returned
 * Out: %rax = that value
 */
308SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
309	UNWIND_HINT_END_OF_STACK
310	ANNOTATE_NOENDBR // RET target, above
	/* Back to the stack that holds the registers saved by relocate_kernel. */
311	movq	saved_rsp(%rip), %rsp
312	movq	saved_cr4(%rip), %rax
313	movq	%rax, %cr4
	/* Both values are loaded before %cr3 is switched; %cr0 is written last. */
314	movq	saved_cr3(%rip), %rax
315	movq	saved_cr0(%rip), %r8
316	movq	%rax, %cr3
317	movq	%r8, %cr0
318
319#ifdef CONFIG_KEXEC_JUMP
320	/* Saved in save_processor_state. */
	/*
	 * NOTE(review): saved_context and saved_context_gdt_desc are defined
	 * outside this file; the latter is presumably a structure offset from
	 * asm-offsets -- confirm.
	 */
321	movq    $saved_context, %rax
322	lgdt    saved_context_gdt_desc(%rax)
323#endif
324
325	/* relocate_kernel() returns the re-entry point for next time */
	/* Copied out now because %rbp is overwritten by the pops below. */
326	movq	%rbp, %rax
327
	/* Reverse order of the pushes at the top of relocate_kernel. */
328	popf
329	popq	%r15
330	popq	%r14
331	popq	%r13
332	popq	%r12
333	popq	%rbp
334	popq	%rbx
335	ANNOTATE_UNRET_SAFE
336	ret
337	int3
338SYM_CODE_END(virtual_mapped)
340
341	/* Do the copies */
/*
 * swap_pages - walk the indirection list and copy or exchange pages.
 *
 * Each 8-byte entry is an address with flag bits in its low bits; the
 * address is the entry with the low 12 bits cleared.  Flags are tested in
 * this order:
 *   0x1  destination: later copies go to this page
 *   0x2  indirection: continue reading entries from this page
 *   0x4  done: return
 *   0x8  source: copy this page to the current destination
 * An entry with none of these bits set is skipped.
 *
 * With RELOC_KERNEL_PRESERVE_CONTEXT clear in %r11b a source page is copied
 * over the destination.  With it set, the contents of source and
 * destination are exchanged through kexec_pa_swap_page.
 *
 * Register use: %rbx next entry pointer, %rdi destination, %rsi source,
 * %rcx current entry and copy count, %rax and %rdx saved page addresses.
 * None of these registers is preserved.
 */
342SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
343	UNWIND_HINT_END_OF_STACK
344	/*
345	 * %rdi indirection page
346	 * %r11 flags: RELOC_KERNEL_*
347	 */
348	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
349	xorl	%edi, %edi
350	xorl	%esi, %esi
	/* %rbx is only set once an indirection entry has been processed. */
351	jmp	.Lstart		/* Should start with an indirection record */
352
353.Lloop:	/* top, read another word for the indirection page */
354
355	movq	(%rbx), %rcx
356	addq	$8,	%rbx
357.Lstart:
358	testb	$0x1,	%cl   /* is it a destination page? */
359	jz	.Lnotdest
360	movq	%rcx,	%rdi
361	andq	$0xfffffffffffff000, %rdi
362	jmp	.Lloop
363.Lnotdest:
364	testb	$0x2,	%cl   /* is it an indirection page? */
365	jz	.Lnotind
366	movq	%rcx,   %rbx
367	andq	$0xfffffffffffff000, %rbx
368	jmp	.Lloop
369.Lnotind:
370	testb	$0x4,	%cl   /* is it the done indicator? */
371	jz	.Lnotdone
372	jmp	.Ldone
373.Lnotdone:
374	testb	$0x8,	%cl   /* is it the source indicator? */
375	jz	.Lloop	      /* Ignore it otherwise */
376	movq	%rcx,   %rsi  /* For every source page do a copy */
377	andq	$0xfffffffffffff000, %rsi
378
379	movq	%rdi, %rdx    /* Save destination page to %rdx */
380	movq	%rsi, %rax    /* Save source page to %rax */
381
382	/* Only actually swap for ::preserve_context */
383	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
384	jz	.Lnoswap
385
	/* Each rep movsq below moves 512 quadwords, i.e. 4096 bytes. */
386	/* copy source page to swap page */
387	movq	kexec_pa_swap_page(%rip), %rdi
388	movl	$512, %ecx
389	rep movsq
390
391	/* copy destination page to source page */
392	movq	%rax, %rdi
393	movq	%rdx, %rsi
394	movl	$512, %ecx
395	rep movsq
396
397	/* copy swap page to destination page */
398	movq	%rdx, %rdi
399	movq	kexec_pa_swap_page(%rip), %rsi
400.Lnoswap:
	/*
	 * Final copy into the destination on both paths.  It leaves %rdi
	 * 4096 bytes past the destination, so a following source entry is
	 * written to the next page unless a destination entry comes first.
	 */
401	movl	$512, %ecx
402	rep movsq
403
	/* %rsi is reloaded from the next source entry before it is used again. */
404	lea	PAGE_SIZE(%rax), %rsi
405	jmp	.Lloop
406.Ldone:
407	ANNOTATE_UNRET_SAFE
408	ret
409	int3
410SYM_CODE_END(swap_pages)
411
412/*
413 * Generic 'print character' routine
414 *  - %al: Character to be printed (may clobber %rax)
415 *  - %rdx: MMIO address or port.
416 */
/* Bit tested in the line status register before a character is written. */
417#define XMTRDY          0x20
418
/* Register indices; TXR is not referenced by name in this file. */
419#define TXR             0       /*  Transmit register (WRITE) */
420#define LSR             5       /*  Line Status               */
421
/*
 * pr_char_8250 - write the character in %al to the UART at I/O port %dx.
 *
 * Spins until XMTRDY is set in the status byte read from port + LSR,
 * keeping the character in %ah meanwhile, then writes it to the base port.
 * %dx and %al hold their entry values again on return; %ah and the flags
 * are not preserved.
 *
 * pr_char_null labels only the final RET, so it serves as a print routine
 * that discards its character (see pr_setup).
 */
422SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
423	UNWIND_HINT_FUNC
424	ANNOTATE_NOENDBR
	/* Point %dx at the status register and park the character in %ah. */
425	addw	$LSR, %dx
426	xchg	%al, %ah
427.Lxmtrdy_loop:
428	inb	%dx, %al
429	testb	$XMTRDY, %al
430	jnz	.Lready
431	pause
432	jmp .Lxmtrdy_loop
433
434.Lready:
	/* Back to the base port, character back into %al, then send it. */
435	subw	$LSR, %dx
436	xchg	%al, %ah
437	outb	%al, %dx
438pr_char_null:
439	ANNOTATE_NOENDBR
440
441	ANNOTATE_UNRET_SAFE
442	ret
443SYM_CODE_END(pr_char_8250)
444
/*
 * pr_char_8250_mmio32 - write the character in %al to the memory-mapped
 * UART whose base address is in %rdx.
 *
 * Registers are addressed at a stride of 4 bytes: the status byte is read
 * from offset LSR * 4 and the character is stored at offset 0.  Spins
 * until XMTRDY is set.  Clobbers %ah and the flags.
 */
445SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
446	UNWIND_HINT_FUNC
447	ANNOTATE_NOENDBR
448.Lxmtrdy_loop_mmio:
449	movb	(LSR*4)(%rdx), %ah
450	testb	$XMTRDY, %ah
451	jnz	.Lready_mmio
452	pause
453	jmp .Lxmtrdy_loop_mmio
454
455.Lready_mmio:
456	movb	%al, (%rdx)
457	ANNOTATE_UNRET_SAFE
458	ret
459SYM_CODE_END(pr_char_8250_mmio32)
460
461/*
462 * Load pr_char function pointer into %rsi and load %rdx with whatever
463 * that function wants to see there (typically port/MMIO address).
464 */
/*
 * Selection order: a non-zero kexec_debug_8250_port selects pr_char_8250,
 * otherwise a non-zero kexec_debug_8250_mmio32 selects
 * pr_char_8250_mmio32, otherwise %rsi points at pr_char_null.
 * Clobbers %rdx, %rsi and the flags; uses the numeric local label 1.
 */
465.macro pr_setup
466	leaq	pr_char_8250(%rip), %rsi
467	movw	kexec_debug_8250_port(%rip), %dx
468	testw	%dx, %dx
469	jnz	1f
470
471	leaq	pr_char_8250_mmio32(%rip), %rsi
472	movq	kexec_debug_8250_mmio32(%rip), %rdx
473	testq	%rdx, %rdx
474	jnz	1f
475
476	leaq	pr_char_null(%rip), %rsi
4771:
478.endm
479
480/* Print the nybble in %bl, clobber %rax */
/*
 * The low four bits of %bl are converted to one lower-case hex digit in
 * %al.  The routine then jumps (not calls) to the print routine in %rsi,
 * so that routine returns straight to the caller of pr_nybble.
 * Expects %rsi and %rdx as left by pr_setup.
 */
481SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
482	UNWIND_HINT_FUNC
483	movb	%bl, %al
	/* NOTE(review): the purpose of this nop is not evident from this file. */
484	nop
485	andb	$0x0f, %al
	/* 0x30 is the digit zero; a result below 0x3a is already a decimal digit. */
486	addb	$0x30, %al
487	cmpb	$0x3a, %al
488	jb	1f
	/* Values 10-15: move up into the range starting at the letter a. */
489	addb	$('a' - '0' - 10), %al
490	ANNOTATE_RETPOLINE_SAFE
4911:	jmp	*%rsi
492SYM_CODE_END(pr_nybble)
493
/*
 * pr_qword - print %rbx as 16 hex digits, most significant first,
 * followed by a newline.
 *
 * Each iteration rotates %rbx left by four bits so that the next digit
 * sits in the low nybble of %bl; after 16 rotations %rbx holds its entry
 * value again.  The newline is sent with a jump to the print routine in
 * %rsi, which therefore returns to the caller of pr_qword.
 * Clobbers %rax, %rcx and the flags.
 */
494SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
495	UNWIND_HINT_FUNC
496	movq	$16, %rcx
4971:	rolq	$4, %rbx
498	call	pr_nybble
499	loop	1b
500	movb	$'\n', %al
501	ANNOTATE_RETPOLINE_SAFE
502	jmp	*%rsi
503SYM_CODE_END(pr_qword)
504
/*
 * print_reg a, b, c, d, r - print the four characters a, b, c and d
 * through the routine in %rsi, then the value of r as 16 hex digits and
 * a newline (pr_qword).
 *
 * r is used as the source operand of a movq into %rbx.
 * Expects %rsi and %rdx as left by pr_setup.
 * Clobbers %rax, %rbx, %rcx and the flags.
 */
504.macro print_reg a, b, c, d, r
505	movb	$\a, %al
506	ANNOTATE_RETPOLINE_SAFE
507	call	*%rsi
508	movb	$\b, %al
509	ANNOTATE_RETPOLINE_SAFE
510	call	*%rsi
511	movb	$\c, %al
512	ANNOTATE_RETPOLINE_SAFE
513	call	*%rsi
514	movb	$\d, %al
515	ANNOTATE_RETPOLINE_SAFE
516	call	*%rsi
517	movq	\r, %rbx
518	call	pr_qword
519.endm
521
/*
 * kexec_debug_exc_vectors - entry stubs for exception vectors 0 to 15.
 *
 * Stub n is placed at
 * kexec_debug_exc_vectors + n * KEXEC_DEBUG_EXC_HANDLER_SIZE by assigning
 * the location counter.  Every stub jumps to exc_handler with the vector
 * number on top of the stack and an error code slot directly above it.
 */
521SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
522	/* Each of these is 6 bytes. */
	/*
	 * vec_err: the error code slot is already on the stack when the stub
	 * is entered, so two nops stand in for the pushq $0 of vec_noerr.
	 */
523.macro vec_err exc
524	UNWIND_HINT_ENTRY
525	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
526	nop
527	nop
528	pushq	$\exc
529	jmp	exc_handler
530.endm
531
	/* vec_noerr: pushes a zero into the error code slot itself. */
532.macro vec_noerr exc
533	UNWIND_HINT_ENTRY
534	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
535	pushq	$0
536	pushq	$\exc
537	jmp	exc_handler
538.endm
539
540	ANNOTATE_NOENDBR
541	vec_noerr  0 // #DE
542	vec_noerr  1 // #DB
543	vec_noerr  2 // #NMI
544	vec_noerr  3 // #BP
545	vec_noerr  4 // #OF
546	vec_noerr  5 // #BR
547	vec_noerr  6 // #UD
548	vec_noerr  7 // #NM
549	vec_err    8 // #DF
550	vec_noerr  9
551	vec_err   10 // #TS
552	vec_err   11 // #NP
553	vec_err   12 // #SS
554	vec_err   13 // #GP
555	vec_err   14 // #PF
556	vec_noerr 15
557SYM_CODE_END(kexec_debug_exc_vectors)
559
/*
 * exc_handler - common tail of the kexec_debug_exc_vectors stubs.
 *
 * Prints the vector number, error code, rip, rsp, the general purpose
 * registers other than %rbp and %cr2 through the routine chosen by
 * pr_setup.  For vector 3 it restores the spilled registers and returns
 * with iretq; for every other vector it halts in a loop.
 */
559SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
560	/* No need for RET mitigations during kexec */
561	VALIDATE_UNRET_END
562
	/*
	 * print_reg clobbers %rax, %rbx and %rcx and pr_setup overwrites %rdx
	 * and %rsi, so these five are spilled before anything is printed.
	 */
563	pushq	%rax
564	pushq	%rbx
565	pushq	%rcx
566	pushq	%rdx
567	pushq	%rsi
568
569	/* Stack frame */
	/* Offsets are relative to %rsp after the five pushes above. */
570#define EXC_SS		0x58 /* Architectural... */
571#define EXC_RSP		0x50
572#define EXC_EFLAGS	0x48
573#define EXC_CS		0x40
574#define EXC_RIP		0x38
575#define EXC_ERRORCODE	0x30 /* Either architectural or zero pushed by handler */
576#define EXC_EXCEPTION	0x28 /* Pushed by handler entry point */
577#define EXC_RAX		0x20 /* Pushed just above in exc_handler */
578#define EXC_RBX		0x18
579#define EXC_RCX		0x10
580#define EXC_RDX		0x08
581#define EXC_RSI		0x00
582
583	/* Set up %rdx/%rsi for debug output */
584	pr_setup
585
586	/* rip and exception info */
587	print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
588	print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
589	print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
590	print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
591
592	/* We spilled these to the stack */
593	print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
594	print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
595	print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
596	print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
597	print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
598
599	/* Other registers untouched */
600	print_reg 'r', 'd', 'i', ':', %rdi
601	print_reg 'r', '8', ' ', ':', %r8
602	print_reg 'r', '9', ' ', ':', %r9
603	print_reg 'r', '1', '0', ':', %r10
604	print_reg 'r', '1', '1', ':', %r11
605	print_reg 'r', '1', '2', ':', %r12
606	print_reg 'r', '1', '3', ':', %r13
607	print_reg 'r', '1', '4', ':', %r14
608	print_reg 'r', '1', '5', ':', %r15
609	print_reg 'c', 'r', '2', ':', %cr2
610
611	/* Only return from INT3 */
612	cmpq	$3, EXC_EXCEPTION(%rsp)
613	jne	.Ldie
614
	/* Reverse order of the spills at the top of the handler. */
615	popq	%rsi
616	popq	%rdx
617	popq	%rcx
618	popq	%rbx
619	popq	%rax
620
	/* Drop the vector number and the error code slot before iretq. */
621	addq	$16, %rsp
622	iretq
623
624.Ldie:
	/* Execution can continue past hlt, so loop back to it. */
625	hlt
626	jmp	.Ldie
627
628SYM_CODE_END(exc_handler)
630