/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * relocate_kernel.S - put the kernel image in place to boot
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/asm-offsets.h>

/*
 * Must be relocatable PIC code callable as a C function; in particular,
 * it must end in a plain RET rather than a jump to the return thunk.
 */

#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

/*
 * The .text..relocate_kernel and .data..relocate_kernel sections are copied
 * into the control page, and the remainder of the page is used as the stack.
 */

	.section .data..relocate_kernel,"a";
/* Minimal CPU state */
SYM_DATA_LOCAL(saved_rsp, .quad 0)
SYM_DATA_LOCAL(saved_cr0, .quad 0)
SYM_DATA_LOCAL(saved_cr3, .quad 0)
SYM_DATA_LOCAL(saved_cr4, .quad 0)
	/* other data */
SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
SYM_DATA(kexec_debug_8250_port, .word 0)
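/*
 * The global (non-LOCAL) variables above are expected to be filled in
 * from C by the kexec setup code before relocate_kernel() is entered;
 * the saved_* values are written by the code below. Everything here
 * lives in .data..relocate_kernel so that it is copied into the
 * control page together with the code.
 */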

	.balign 16
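/*
 * Minimal GDT for the debug exception handlers. The first (null)
 * descriptor slot doubles as storage for the GDT limit word, which
 * identity_mapped() pushes together with the base address to build a
 * GDTR on the stack.
 */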
SYM_DATA_START_LOCAL(kexec_debug_gdt)
	.word   kexec_debug_gdt_end - kexec_debug_gdt - 1
	.long   0
	.word   0
	.quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
	.quad   0x00af9a000000ffff      /* __KERNEL_CS */
	.quad   0x00cf92000000ffff      /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)

	.balign 8
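/*
 * Room for 16 IDT gates of 16 bytes each. The gates themselves are
 * presumably filled in from C (pointing at kexec_debug_exc_vectors
 * below) before this IDT is loaded.
 */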
SYM_DATA_START(kexec_debug_idt)
	.skip 0x100, 0x00
SYM_DATA_END(kexec_debug_idt)

	.section .text..relocate_kernel,"ax";
	.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR
	/*
	 * %rdi indirection_page
	 * %rsi pa_control_page
	 * %rdx start address
	 * %rcx preserve_context
	 * %r8  host_mem_enc_active
	 */

	/* Save the CPU context, used for jumping back */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushf

	/*
	 * Invalidate the GDT/IDT and zero out the flags: the two zero
	 * quads pushed here act both as a zero-limit descriptor for
	 * LIDT/LGDT and as a zero RFLAGS image for POPFQ.
	 */
	pushq	$0
	pushq	$0

	lidt	(%rsp)
	lgdt	(%rsp)
	addq	$8, %rsp
	popfq

	/* Switch to the identity mapped page tables */
	movq	%cr3, %rax
	movq	kexec_pa_table_page(%rip), %r9
	movq	%r9, %cr3

	/* Leave CR4 in %r13 to enable the right paging mode later. */
	movq	%cr4, %r13

	/* Disable global pages immediately to ensure this mapping is RWX */
	movq	%r13, %r12
	andq	$~(X86_CR4_PGE), %r12
	movq	%r12, %cr4

	/* Save %rsp and CRs. */
	movq	%r13, saved_cr4(%rip)
	movq	%rsp, saved_rsp(%rip)
	movq	%rax, saved_cr3(%rip)
	movq	%cr0, %rax
	movq	%rax, saved_cr0(%rip)

	/* save indirection list for jumping back */
	movq	%rdi, pa_backup_pages_map(%rip)

	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
	movq	%rcx, %r11

	/* setup a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%rsi), %rsp

	/* jump to identity mapped page */
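	/*
	 * The "0:" label provides a PIC reference point: the two
	 * instructions below compute pa_control_page + (identity_mapped -
	 * __relocate_kernel_start), i.e. the physical address of
	 * identity_mapped() within the control page.
	 */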
0:	addq	$identity_mapped - 0b, %rsi
	subq	$__relocate_kernel_start - 0b, %rsi
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(relocate_kernel)

SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi	indirection page
	 * %rdx start address
	 * %r8 host_mem_enc_active
	 * %r9 page table page
	 * %r11 preserve_context
	 * %r13 original CR4 when relocate_kernel() was invoked
	 */

	/* store the start address on the stack */
	pushq	%rdx

	/* Create a GDTR (16-bit limit, 64-bit base) on the stack */
	leaq	kexec_debug_gdt(%rip), %rax
	pushq	%rax
	pushw	(%rax)

	/* Load the GDT, then discard the 10-byte descriptor */
	lgdt	(%rsp)
	addq	$10, %rsp

	/* Test that we can load segments */
	movq	%ds, %rax
	movq	%rax, %ds

	/* Now an IDTR on the stack to load the IDT the kernel created */
	leaq	kexec_debug_idt(%rip), %rsi
	pushq	%rsi
	pushw	$0xff		/* limit 0xff: 16 gates of 16 bytes */
	lidt	(%rsp)
	addq	$10, %rsp

	//int3

	/*
	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
	 * below.
	 */
	movq	%cr4, %rax
	andq	$~(X86_CR4_CET), %rax
	movq	%rax, %cr4

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 *  - 5-level paging, if it was enabled before
	 *  - Machine check exception on TDX guest, if it was enabled before.
	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
	 *
	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
	 * PAE is always set in the original CR4.
	 */
	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
	movq	%r13, %cr4

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/*
	 * If SME is active, there could be old encrypted cache line
	 * entries that will conflict with the now unencrypted memory
	 * used by kexec. Flush the caches before copying the kernel.
	 */
	testq	%r8, %r8
	jz .Lsme_off
	wbinvd
.Lsme_off:

	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code,
	 * a serializing instruction is needed here. Reloading %cr3
	 * serializes and flushes the TLB, and is not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	testq	%r11, %r11	/* preserve_context */
	jnz .Lrelocate

	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	xorl	%esi, %esi
	xorl	%edi, %edi
	xorl	%ebp, %ebp
	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	xorl	%r11d, %r11d
	xorl	%r12d, %r12d
	xorl	%r13d, %r13d
	xorl	%r14d, %r14d
	xorl	%r15d, %r15d

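	/*
	 * The start address pushed on entry to identity_mapped() is
	 * still on top of the stack; this plain RET jumps straight into
	 * the new kernel image.
	 */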
	ANNOTATE_UNRET_SAFE
	ret
	int3

.Lrelocate:
	popq	%rdx

	/* Use the swap page for the callee's stack */
	movq	kexec_pa_swap_page(%rip), %r10
	leaq	PAGE_SIZE(%r10), %rsp

	/* push the existing entry point onto the callee's stack */
	pushq	%rdx

	ANNOTATE_RETPOLINE_SAFE
	call	*%rdx

	/* get the re-entry point of the peer system */
	popq	%rbp
	movq	kexec_pa_swap_page(%rip), %r10
	movq	pa_backup_pages_map(%rip), %rdi
	movq	kexec_pa_table_page(%rip), %rax
	movq	%rax, %cr3

	/* Find start (and end) of this physical mapping of control page */
	leaq	(%rip), %r8
	ANNOTATE_NOENDBR
	andq	$PAGE_MASK, %r8
	lea	PAGE_SIZE(%r8), %rsp
	movl	$1, %r11d	/* Ensure preserve_context flag is set */
	call	swap_pages
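
	/*
	 * Compute the virtual address of virtual_mapped() within the
	 * control page mapping (kexec_va_control_page plus the offset of
	 * virtual_mapped from __relocate_kernel_start) and RET to it.
	 */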
	movq	kexec_va_control_page(%rip), %rax
0:	addq	$virtual_mapped - 0b, %rax
	subq	$__relocate_kernel_start - 0b, %rax
	pushq	%rax
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(identity_mapped)

SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // RET target, above
	movq	saved_rsp(%rip), %rsp
	movq	saved_cr4(%rip), %rax
	movq	%rax, %cr4
	movq	saved_cr3(%rip), %rax
	movq	saved_cr0(%rip), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0

#ifdef CONFIG_KEXEC_JUMP
	/* Saved in save_processor_state. */
	movq	$saved_context, %rax
	lgdt	saved_context_gdt_desc(%rax)
#endif

	/* relocate_kernel() returns the re-entry point for next time */
	movq	%rbp, %rax

	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(virtual_mapped)

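/*
 * Each entry in the indirection list is a physical address whose low
 * bits serve as a type tag, matching the IND_* flags in
 * include/linux/kexec.h: 0x1 destination page, 0x2 indirection page,
 * 0x4 done, 0x8 source page.
 */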
	/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi indirection page
	 * %r11 preserve_context
	 */
	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
	xorl	%edi, %edi
	xorl	%esi, %esi
	jmp	.Lstart		/* Should start with an indirection record */

.Lloop:	/* top: read another word from the indirection page */

	movq	(%rbx), %rcx
	addq	$8,	%rbx
.Lstart:
	testb	$0x1,	%cl   /* is it a destination page? */
	jz	.Lnotdest
	movq	%rcx,	%rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	.Lloop
.Lnotdest:
	testb	$0x2,	%cl   /* is it an indirection page? */
	jz	.Lnotind
	movq	%rcx,	%rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	.Lloop
.Lnotind:
	testb	$0x4,	%cl   /* is it the done indicator? */
	jz	.Lnotdone
	jmp	.Ldone
.Lnotdone:
	testb	$0x8,	%cl   /* is it the source indicator? */
	jz	.Lloop	      /* Ignore it otherwise */
	movq	%rcx,	%rsi  /* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	movq	%rdi, %rdx	/* Save destination page to %rdx */
	movq	%rsi, %rax	/* Save source page to %rax */

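	/*
	 * For preserve_context, rotate the pages through the swap page
	 * (source -> swap, destination -> source, swap -> destination)
	 * so that everything can be restored when jumping back;
	 * otherwise fall through to a plain source-to-destination copy.
	 */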
	testq	%r11, %r11	/* Only actually swap for ::preserve_context */
	jz	.Lnoswap

	/* copy source page to swap page */
	movq	kexec_pa_swap_page(%rip), %rdi
	movl	$512, %ecx
	rep movsq

	/* copy destination page to source page */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movl	$512, %ecx
	rep movsq

	/* copy swap page to destination page */
	movq	%rdx, %rdi
	movq	kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
	movl	$512, %ecx
	rep movsq

	lea	PAGE_SIZE(%rax), %rsi
	jmp	.Lloop
.Ldone:
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(swap_pages)

/*
 * Generic 'print character' routine
 *  - %al: Character to be printed (may clobber %rax)
 *  - %rdx: MMIO address or port.
 */
#define XMTRDY          0x20	/* LSR bit 5: transmit holding register empty */

#define TXR             0       /*  Transmit register (WRITE) */
#define LSR             5       /*  Line Status               */

SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	addw	$LSR, %dx
	xchg	%al, %ah
.Lxmtrdy_loop:
	inb	%dx, %al
	testb	$XMTRDY, %al
	jnz	.Lready
	pause
	jmp .Lxmtrdy_loop

.Lready:
	subw	$LSR, %dx
	xchg	%al, %ah
	outb	%al, %dx
pr_char_null:	/* No-op sink used when no debug console is configured */
	ANNOTATE_NOENDBR

	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250)

SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
	movb	(LSR*4)(%rdx), %ah	/* 32-bit register stride in MMIO mode */
	testb	$XMTRDY, %ah
	jnz	.Lready_mmio
	pause
	jmp .Lxmtrdy_loop_mmio

.Lready_mmio:
	movb	%al, (%rdx)
	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250_mmio32)

/*
 * Load pr_char function pointer into %rsi and load %rdx with whatever
 * that function wants to see there (typically port/MMIO address).
 */
.macro pr_setup
	leaq	pr_char_8250(%rip), %rsi
	movw	kexec_debug_8250_port(%rip), %dx
	testw	%dx, %dx
	jnz	1f

	leaq	pr_char_8250_mmio32(%rip), %rsi
	movq	kexec_debug_8250_mmio32(%rip), %rdx
	testq	%rdx, %rdx
	jnz	1f

	leaq	pr_char_null(%rip), %rsi
1:
.endm

/* Print the nybble in %bl, clobber %rax */
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
	UNWIND_HINT_FUNC
	movb	%bl, %al
	nop
	andb	$0x0f, %al
	addb	$0x30, %al
	cmpb	$0x3a, %al
	jb	1f
	addb	$('a' - '0' - 10), %al
	ANNOTATE_RETPOLINE_SAFE
1:	jmp	*%rsi
SYM_CODE_END(pr_nybble)

/* Print the qword in %rbx (most significant nybble first), then a newline */
SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
	UNWIND_HINT_FUNC
	movq	$16, %rcx
1:	rolq	$4, %rbx
	call	pr_nybble
	loop	1b
	movb	$'\n', %al
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(pr_qword)

/* Print a four-character tag, then the value of \r as hex */
.macro print_reg a, b, c, d, r
	movb	$\a, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\b, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\c, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\d, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movq	\r, %rbx
	call	pr_qword
.endm

SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
	/*
	 * Each of these stubs is KEXEC_DEBUG_EXC_HANDLER_SIZE (6) bytes.
	 * For exceptions that push an architectural error code, vec_err
	 * pads with two NOPs in place of the "pushq $0" that vec_noerr
	 * uses, so all stubs stay the same size.
	 */
.macro vec_err exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	nop
	nop
	pushq	$\exc
	jmp	exc_handler
.endm

.macro vec_noerr exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	pushq	$0
	pushq	$\exc
	jmp	exc_handler
.endm

	ANNOTATE_NOENDBR
	vec_noerr  0 // #DE
	vec_noerr  1 // #DB
	vec_noerr  2 // #NMI
	vec_noerr  3 // #BP
	vec_noerr  4 // #OF
	vec_noerr  5 // #BR
	vec_noerr  6 // #UD
	vec_noerr  7 // #NM
	vec_err    8 // #DF
	vec_noerr  9 // reserved
	vec_err   10 // #TS
	vec_err   11 // #NP
	vec_err   12 // #SS
	vec_err   13 // #GP
	vec_err   14 // #PF
	vec_noerr 15 // reserved
SYM_CODE_END(kexec_debug_exc_vectors)

SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
	/* No need for RET mitigations during kexec */
	VALIDATE_UNRET_END

	pushq	%rax
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	pushq	%rsi

	/* Stack frame offsets, relative to %rsp after the pushes above */
#define EXC_SS		0x58 /* Architectural... */
#define EXC_RSP		0x50
#define EXC_EFLAGS	0x48
#define EXC_CS		0x40
#define EXC_RIP		0x38
#define EXC_ERRORCODE	0x30 /* Either architectural or zero pushed by handler */
#define EXC_EXCEPTION	0x28 /* Pushed by handler entry point */
#define EXC_RAX		0x20 /* Pushed just above in exc_handler */
#define EXC_RBX		0x18
#define EXC_RCX		0x10
#define EXC_RDX		0x08
#define EXC_RSI		0x00

	/* Set up %rdx/%rsi for debug output */
	pr_setup

	/* rip and exception info */
	print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
	print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
	print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
	print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)

	/* We spilled these to the stack */
	print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
	print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
	print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
	print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
	print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)

	/* Other registers untouched */
	print_reg 'r', 'd', 'i', ':', %rdi
	print_reg 'r', '8', ' ', ':', %r8
	print_reg 'r', '9', ' ', ':', %r9
	print_reg 'r', '1', '0', ':', %r10
	print_reg 'r', '1', '1', ':', %r11
	print_reg 'r', '1', '2', ':', %r12
	print_reg 'r', '1', '3', ':', %r13
	print_reg 'r', '1', '4', ':', %r14
	print_reg 'r', '1', '5', ':', %r15
	print_reg 'c', 'r', '2', ':', %cr2

	/* Only return from INT3 */
	cmpq	$3, EXC_EXCEPTION(%rsp)
	jne	.Ldie

	popq	%rsi
	popq	%rdx
	popq	%rcx
	popq	%rbx
	popq	%rax

	/* Drop the exception number and error code, then return */
	addq	$16, %rsp
	iretq

.Ldie:
	hlt
	jmp	.Ldie

SYM_CODE_END(exc_handler)
607