xref: /linux/arch/x86/kernel/relocate_kernel_64.S (revision 8a7c601e14576a22c2bbf7f67455ccf3f3d2737f)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * relocate_kernel.S - put the kernel image in place to boot
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/asm-offsets.h>

/*
 * Must be relocatable PIC code callable as a C function, in particular
 * there must be a plain RET and not a jump to the return thunk.
 */

#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
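/*
 * PTR(x) converts a pointer-sized (8-byte) table index into a byte offset;
 * PAGE_ATTR is the standard present/writable/accessed/dirty PTE attribute
 * set. Neither appears to be referenced elsewhere in this file.
 */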

/*
 * The .text..relocate_kernel and .data..relocate_kernel sections are copied
 * into the control page, and the remainder of the page is used as the stack.
 */

	.section .data..relocate_kernel,"a";
/* Minimal CPU state */
SYM_DATA_LOCAL(saved_rsp, .quad 0)
SYM_DATA_LOCAL(saved_cr0, .quad 0)
SYM_DATA_LOCAL(saved_cr3, .quad 0)
SYM_DATA_LOCAL(saved_cr4, .quad 0)
	/* other data */
SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
SYM_DATA(kexec_debug_8250_port, .word 0)

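/*
 * Debug GDT: a null descriptor slot whose first word doubles as the GDT
 * limit, followed by 32-bit code, 64-bit code and data descriptors.
 * identity_mapped() below builds a GDTR on the stack from this limit word
 * plus the GDT's runtime address before loading it with lgdt.
 */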
	.balign 16
SYM_DATA_START_LOCAL(kexec_debug_gdt)
	.word   kexec_debug_gdt_end - kexec_debug_gdt - 1
	.long   0
	.word   0
	.quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
	.quad   0x00af9a000000ffff      /* __KERNEL_CS */
	.quad   0x00cf92000000ffff      /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)

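/*
 * Debug IDT: room for 16 gate descriptors (16 bytes each in long mode),
 * i.e. exception vectors 0-15, the same range covered by
 * kexec_debug_exc_vectors below; presumably populated by the kexec setup
 * code in C before relocation.
 */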
	.balign 8
SYM_DATA_START(kexec_debug_idt)
	.skip 0x100, 0x00
SYM_DATA_END(kexec_debug_idt)

	.section .text..relocate_kernel,"ax";
	.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR
	/*
	 * %rdi indirection_page
	 * %rsi pa_control_page
	 * %rdx start address
	 * %rcx flags: RELOC_KERNEL_*
	 */

	/* Save the CPU context, used for jumping back */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushf

	/* Invalidate GDT/IDT, zero out flags */
	pushq	$0
	pushq	$0

	lidt	(%rsp)
	lgdt	(%rsp)
	addq	$8, %rsp
	popfq

	/* Switch to the identity mapped page tables */
	movq	%cr3, %rax
	movq	kexec_pa_table_page(%rip), %r9
	movq	%r9, %cr3

	/* Leave CR4 in %r13 to enable the right paging mode later. */
	movq	%cr4, %r13

	/* Disable global pages immediately to ensure this mapping is RWX */
	movq	%r13, %r12
	andq	$~(X86_CR4_PGE), %r12
	movq	%r12, %cr4

	/* Save %rsp and CRs. */
	movq	%r13, saved_cr4(%rip)
	movq	%rsp, saved_rsp(%rip)
	movq	%rax, saved_cr3(%rip)
	movq	%cr0, %rax
	movq	%rax, saved_cr0(%rip)

	/* Save the indirection list for jumping back */
	movq	%rdi, pa_backup_pages_map(%rip)

	/* Save the flags to %r11 as swap_pages clobbers %rcx. */
	movq	%rcx, %r11

	/* Set up a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%rsi), %rsp

	/* jump to identity mapped page */
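	/*
	 * %rsi holds the physical address of the control page. The two
	 * adjustments below add (identity_mapped - __relocate_kernel_start),
	 * i.e. the offset of identity_mapped within the copied sections, so
	 * %rsi ends up as the physical address of identity_mapped inside the
	 * control page.
	 */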
0:	addq	$identity_mapped - 0b, %rsi
	subq	$__relocate_kernel_start - 0b, %rsi
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(relocate_kernel)

SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi	indirection page
	 * %rdx start address
	 * %r9 page table page
	 * %r11 flags: RELOC_KERNEL_*
	 * %r13 original CR4 when relocate_kernel() was invoked
	 */

	/* store the start address on the stack */
	pushq   %rdx

	/* Create a GDTR (16-bit limit, 64-bit address) on the stack */
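	/*
	 * pushq places the 8-byte base and pushw the 2-byte limit on top of
	 * it, forming the 10-byte pseudo-descriptor that lgdt expects (limit
	 * at the lowest address). The limit is read from the first word of
	 * the GDT itself, where it was stored above.
	 */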
	leaq	kexec_debug_gdt(%rip), %rax
	pushq	%rax
	pushw	(%rax)

	/* Load the GDT, put the stack back */
	lgdt	(%rsp)
	addq	$10, %rsp

	/* Test that we can load segments */
	movq	%ds, %rax
	movq	%rax, %ds

	/* Now an IDTR on the stack to load the IDT the kernel created */
	leaq	kexec_debug_idt(%rip), %rsi
	pushq	%rsi
	pushw	$0xff
	lidt	(%rsp)
	addq	$10, %rsp

	//int3

	/*
	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
	 * below.
	 */
	movq	%cr4, %rax
	andq	$~(X86_CR4_CET), %rax
	movq	%rax, %cr4

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 *  - 5-level paging, if it was enabled before
	 *  - Machine check exception on TDX guest, if it was enabled before.
	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
	 *
	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
	 * PAE is always set in the original CR4.
	 */
	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
	movq	%r13, %cr4

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/*
	 * If the memory cache is in an incoherent state, e.g., due to
	 * memory encryption, do WBINVD to flush the cache.
	 *
	 * If SME is active, there could be old encrypted cache line
	 * entries that will conflict with the now unencrypted memory
	 * used by kexec. Flush the caches before copying the kernel.
	 *
	 * Note SME sets this flag to true when the platform supports
	 * SME, so the WBINVD is performed even if SME is not activated
	 * by the kernel. But this does no harm.
	 */
	testb	$RELOC_KERNEL_CACHE_INCOHERENT, %r11b
	jz .Lnowbinvd
	wbinvd
.Lnowbinvd:

	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB by reloading %cr3 here; it's handy,
	 * and not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
	jnz .Lrelocate

	/*
	 * Set all of the registers to known values;
	 * leave %rsp alone.
	 */

	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	xorl	%esi, %esi
	xorl	%edi, %edi
	xorl	%ebp, %ebp
	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	xorl	%r11d, %r11d
	xorl	%r12d, %r12d
	xorl	%r13d, %r13d
	xorl	%r14d, %r14d
	xorl	%r15d, %r15d

	ANNOTATE_UNRET_SAFE
	ret
	int3

.Lrelocate:
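	/*
	 * Preserve-context (kexec jump) path: pop the start address that was
	 * pushed at the top of identity_mapped(), call into the image, and
	 * when it returns swap everything back and re-enter the original
	 * kernel through virtual_mapped().
	 */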
	popq	%rdx

	/* Use the swap page for the callee's stack */
	movq	kexec_pa_swap_page(%rip), %r10
	leaq	PAGE_SIZE(%r10), %rsp

	/* push the existing entry point onto the callee's stack */
	pushq	%rdx

	ANNOTATE_RETPOLINE_SAFE
	call	*%rdx

	/* get the re-entry point of the peer system */
	popq	%rbp
	movq	kexec_pa_swap_page(%rip), %r10
	movq	pa_backup_pages_map(%rip), %rdi
	movq	kexec_pa_table_page(%rip), %rax
	movq	%rax, %cr3

	/* Find start (and end) of this physical mapping of control page */
	leaq	(%rip), %r8
	ANNOTATE_NOENDBR
	andq	$PAGE_MASK, %r8
	lea	PAGE_SIZE(%r8), %rsp
	/*
	 * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
	 * swap_pages() can swap pages correctly.  Note all other
	 * RELOC_KERNEL_* flags passed to relocate_kernel() are not
	 * restored.
	 */
	movl	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
	call	swap_pages
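	/*
	 * Compute the kernel virtual address of virtual_mapped() within the
	 * control page (kexec_va_control_page plus its offset from
	 * __relocate_kernel_start) and "return" to it. The kexec page tables
	 * installed above are expected to map the control page at that
	 * virtual address as well, so this switches execution back to the
	 * kernel's virtual addresses.
	 */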
	movq	kexec_va_control_page(%rip), %rax
0:	addq	$virtual_mapped - 0b, %rax
	subq	$__relocate_kernel_start - 0b, %rax
	pushq	%rax
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(identity_mapped)

SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // RET target, above
	movq	saved_rsp(%rip), %rsp
	movq	saved_cr4(%rip), %rax
	movq	%rax, %cr4
	movq	saved_cr3(%rip), %rax
	movq	saved_cr0(%rip), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0

#ifdef CONFIG_KEXEC_JUMP
	/* Saved in save_processor_state. */
	movq	$saved_context, %rax
	lgdt	saved_context_gdt_desc(%rax)
#endif

	/* relocate_kernel() returns the re-entry point for next time */
	movq	%rbp, %rax

	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(virtual_mapped)

	/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi indirection page
	 * %r11 flags: RELOC_KERNEL_*
	 */
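	/*
	 * Each 8-byte entry in the indirection list is a page-aligned
	 * address with flags in the low bits, as tested below: bit 0 marks a
	 * destination page, bit 1 an indirection page to continue reading
	 * from, bit 2 the end of the list, and bit 3 a source page to copy.
	 */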
	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
	xorl	%edi, %edi
	xorl	%esi, %esi
	jmp	.Lstart		/* Should start with an indirection record */

.Lloop:	/* top, read another word from the indirection page */

	movq	(%rbx), %rcx
	addq	$8,	%rbx
.Lstart:
	testb	$0x1,	%cl   /* is it a destination page? */
	jz	.Lnotdest
	movq	%rcx,	%rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	.Lloop
.Lnotdest:
	testb	$0x2,	%cl   /* is it an indirection page? */
	jz	.Lnotind
	movq	%rcx,   %rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	.Lloop
.Lnotind:
	testb	$0x4,	%cl   /* is it the done indicator? */
	jz	.Lnotdone
	jmp	.Ldone
.Lnotdone:
	testb	$0x8,	%cl   /* is it the source indicator? */
	jz	.Lloop	      /* Ignore it otherwise */
	movq	%rcx,   %rsi  /* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	movq	%rdi, %rdx    /* Save destination page to %rdx */
	movq	%rsi, %rax    /* Save source page to %rax */

	/* Only actually swap for ::preserve_context */
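	/*
	 * Three-way copy through the swap page (source -> swap, destination
	 * -> source, swap -> destination), so the destination's old contents
	 * survive at the source address and can be restored when jumping
	 * back to the original kernel.
	 */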
	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
	jz	.Lnoswap

	/* copy source page to swap page */
	movq	kexec_pa_swap_page(%rip), %rdi
	movl	$512, %ecx
	rep movsq

	/* copy destination page to source page */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movl	$512, %ecx
	rep movsq

	/* copy swap page to destination page */
	movq	%rdx, %rdi
	movq	kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
	movl	$512, %ecx
	rep movsq

	lea	PAGE_SIZE(%rax), %rsi
	jmp	.Lloop
.Ldone:
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(swap_pages)

/*
 * Generic 'print character' routine
 *  - %al: Character to be printed (may clobber %rax)
 *  - %rdx: MMIO address or port.
 */
#define XMTRDY          0x20

#define TXR             0       /*  Transmit register (WRITE) */
#define LSR             5       /*  Line Status               */

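/*
 * Port-I/O variant: poll the 8250 Line Status Register until the transmit
 * holding register is empty (XMTRDY, bit 5), then write the character to
 * the transmit register at the base port in %dx.
 */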
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	addw	$LSR, %dx
	xchg	%al, %ah
.Lxmtrdy_loop:
	inb	%dx, %al
	testb	$XMTRDY, %al
	jnz	.Lready
	pause
	jmp .Lxmtrdy_loop

.Lready:
	subw	$LSR, %dx
	xchg	%al, %ah
	outb	%al, %dx
pr_char_null:
	ANNOTATE_NOENDBR

	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250)

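/*
 * MMIO variant: the UART registers are 32 bits wide, so register N lives at
 * byte offset N * 4 from the MMIO base in %rdx.
 */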
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
	movb	(LSR*4)(%rdx), %ah
	testb	$XMTRDY, %ah
	jnz	.Lready_mmio
	pause
	jmp .Lxmtrdy_loop_mmio

.Lready_mmio:
	movb	%al, (%rdx)
	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250_mmio32)

/*
 * Load pr_char function pointer into %rsi and load %rdx with whatever
 * that function wants to see there (typically port/MMIO address).
 */
.macro pr_setup
	leaq	pr_char_8250(%rip), %rsi
	movw	kexec_debug_8250_port(%rip), %dx
	testw	%dx, %dx
	jnz	1f

	leaq	pr_char_8250_mmio32(%rip), %rsi
	movq	kexec_debug_8250_mmio32(%rip), %rdx
	testq	%rdx, %rdx
	jnz	1f

	leaq	pr_char_null(%rip), %rsi
1:
.endm

/* Print the low nybble of %bl, clobbering %rax */
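/*
 * Converts the value to ASCII: '0'-'9' for 0-9, then adjusts into 'a'-'f'
 * for 10-15, and tail-calls the output routine whose address pr_setup left
 * in %rsi.
 */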
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
	UNWIND_HINT_FUNC
	movb	%bl, %al
	nop
	andb	$0x0f, %al
	addb	$0x30, %al
	cmpb	$0x3a, %al
	jb	1f
	addb	$('a' - '0' - 10), %al
	ANNOTATE_RETPOLINE_SAFE
1:	jmp	*%rsi
SYM_CODE_END(pr_nybble)

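/*
 * Print the 64-bit value in %rbx as 16 hex digits, most significant nybble
 * first, followed by a newline. The 16 rotates of 4 bits leave %rbx back at
 * its original value; %rax and %rcx are clobbered.
 */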
SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
	UNWIND_HINT_FUNC
	movq	$16, %rcx
1:	rolq	$4, %rbx
	call	pr_nybble
	loop	1b
	movb	$'\n', %al
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(pr_qword)

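/*
 * Emit a four-character label (e.g. 'r', 'i', 'p', ':') one character at a
 * time through the routine in %rsi, then print the value \r as hex via
 * pr_qword.
 */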
.macro print_reg a, b, c, d, r
	movb	$\a, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\b, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\c, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\d, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movq	\r, %rbx
	call	pr_qword
.endm

SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
	/* Each of these is 6 bytes. */
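	/*
	 * vec_err is for exceptions where the CPU pushes an error code;
	 * vec_noerr pushes a dummy 0 first so the stack layout is the same
	 * for both, and the two nops in vec_err keep both variants the same
	 * size. The ". =" directive places each entry at a fixed offset of
	 * N * KEXEC_DEBUG_EXC_HANDLER_SIZE from kexec_debug_exc_vectors.
	 */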
.macro vec_err exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	nop
	nop
	pushq	$\exc
	jmp	exc_handler
.endm

.macro vec_noerr exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	pushq	$0
	pushq	$\exc
	jmp	exc_handler
.endm

	ANNOTATE_NOENDBR
	vec_noerr  0 // #DE
	vec_noerr  1 // #DB
	vec_noerr  2 // #NMI
	vec_noerr  3 // #BP
	vec_noerr  4 // #OF
	vec_noerr  5 // #BR
	vec_noerr  6 // #UD
	vec_noerr  7 // #NM
	vec_err    8 // #DF
	vec_noerr  9
	vec_err   10 // #TS
	vec_err   11 // #NP
	vec_err   12 // #SS
	vec_err   13 // #GP
	vec_err   14 // #PF
	vec_noerr 15
SYM_CODE_END(kexec_debug_exc_vectors)

SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
	/* No need for RET mitigations during kexec */
	VALIDATE_UNRET_END

	pushq	%rax
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	pushq	%rsi

	/* Stack frame */
#define EXC_SS		0x58 /* Architectural... */
#define EXC_RSP		0x50
#define EXC_EFLAGS	0x48
#define EXC_CS		0x40
#define EXC_RIP		0x38
#define EXC_ERRORCODE	0x30 /* Either architectural or zero pushed by handler */
#define EXC_EXCEPTION	0x28 /* Pushed by handler entry point */
#define EXC_RAX		0x20 /* Pushed just above in exc_handler */
#define EXC_RBX		0x18
#define EXC_RCX		0x10
#define EXC_RDX		0x08
#define EXC_RSI		0x00

	/* Set up %rdx/%rsi for debug output */
	pr_setup

	/* rip and exception info */
	print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
	print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
	print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
	print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)

	/* We spilled these to the stack */
	print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
	print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
	print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
	print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
	print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)

	/* Other registers untouched */
	print_reg 'r', 'd', 'i', ':', %rdi
	print_reg 'r', '8', ' ', ':', %r8
	print_reg 'r', '9', ' ', ':', %r9
	print_reg 'r', '1', '0', ':', %r10
	print_reg 'r', '1', '1', ':', %r11
	print_reg 'r', '1', '2', ':', %r12
	print_reg 'r', '1', '3', ':', %r13
	print_reg 'r', '1', '4', ':', %r14
	print_reg 'r', '1', '5', ':', %r15
	print_reg 'c', 'r', '2', ':', %cr2

	/* Only return from INT3 */
	cmpq	$3, EXC_EXCEPTION(%rsp)
	jne	.Ldie

	popq	%rsi
	popq	%rdx
	popq	%rcx
	popq	%rbx
	popq	%rax

	addq	$16, %rsp
	iretq

.Ldie:
	hlt
	jmp	.Ldie

SYM_CODE_END(exc_handler)
619