/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * relocate_kernel.S - put the kernel image in place to boot
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

/*
 * Syntax/target: x86-64, GNU as AT&T syntax (source, destination operand
 * order), preprocessed by the C preprocessor before assembly.
 */

#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/asm-offsets.h>

/*
 * Must be relocatable PIC code callable as a C function, in particular
 * there must be a plain RET and not jump to return thunk.
 */

#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

/*
 * The .text..relocate_kernel and .data..relocate_kernel sections are copied
 * into the control page, and the remainder of the page is used as the stack.
 */

	.section .data..relocate_kernel,"a";
/*
 * Minimal CPU state.
 * Written by relocate_kernel() and read back by virtual_mapped() when the
 * original context is restored.
 */
SYM_DATA_LOCAL(saved_rsp, .quad 0)
SYM_DATA_LOCAL(saved_cr0, .quad 0)
SYM_DATA_LOCAL(saved_cr3, .quad 0)
SYM_DATA_LOCAL(saved_cr4, .quad 0)
	/*
	 * other data
	 * The non-local symbols below are only read in this file; presumably
	 * they are filled in by the caller before relocate_kernel() runs
	 * (TODO confirm against the C side).
	 */
SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
	/* Indirection list head, stored by relocate_kernel() for the jump back */
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
	/* Debug UART: I/O port is tried first, then the MMIO address (see pr_setup) */
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
SYM_DATA(kexec_debug_8250_port, .word 0)

	.balign 16
/*
 * GDT loaded by identity_mapped().  The first 8-byte slot is not a usable
 * descriptor; its first word holds the table limit, which identity_mapped()
 * reads with "pushw (%rax)" when it builds the GDTR on the stack.
 */
SYM_DATA_START_LOCAL(kexec_debug_gdt)
	.word   kexec_debug_gdt_end - kexec_debug_gdt - 1
	.long   0
	.word   0
	.quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
	.quad   0x00af9a000000ffff      /* __KERNEL_CS */
	.quad   0x00cf92000000ffff      /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)

	.balign 8
/*
 * 0x100 bytes of zero-initialised IDT storage.  identity_mapped() loads it
 * with a limit of 0xff; its comment says the entries are created by the
 * kernel beforehand.
 */
SYM_DATA_START(kexec_debug_idt)
	.skip 0x100, 0x00
SYM_DATA_END(kexec_debug_idt)

	.section .text..relocate_kernel,"ax";
	.code64
/*
 * relocate_kernel - entry point, called through the virtual mapping.
 *
 * Saves the caller's callee-saved registers and flags on the current stack,
 * loads null IDT/GDT descriptors, switches to the identity-mapped page
 * tables, records %rsp/CR0/CR3/CR4 for a later return, then jumps to the
 * copy of identity_mapped() that lives in the physical control page.
 */
SYM_CODE_START_NOALIGN(relocate_kernel)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR
	/*
	 * %rdi indirection_page
	 * %rsi pa_control_page
	 * %rdx start address
	 * %rcx flags: RELOC_KERNEL_*
	 */

	/*
	 * Save the CPU context, used for jumping back.
	 * virtual_mapped() pops these in the reverse order.
	 */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushf

	/*
	 * Invalidate GDT/IDT, zero out flags.
	 * Both LIDT and LGDT read their descriptor from the 16 zero bytes just
	 * pushed; one quadword is then dropped and the other is popped into
	 * the flags register.
	 */
	pushq	$0
	pushq	$0

	lidt	(%rsp)
	lgdt	(%rsp)
	addq	$8, %rsp
	popfq

	/*
	 * Switch to the identity mapped page tables.
	 * The old CR3 stays in %rax until it is stored to saved_cr3 below;
	 * %r9 keeps the new table address for identity_mapped().
	 */
	movq	%cr3, %rax
	movq	kexec_pa_table_page(%rip), %r9
	movq	%r9, %cr3

	/* Leave CR4 in %r13 to enable the right paging mode later. */
	movq	%cr4, %r13

	/*
	 * Disable global pages immediately to ensure this mapping is RWX.
	 * Disable LASS before jumping to the identity mapped page.
	 */
	movq	%r13, %r12
	andq	$~(X86_CR4_PGE | X86_CR4_LASS), %r12
	movq	%r12, %cr4

	/* Save %rsp and CRs. (saved_cr4 gets the unmodified value from %r13) */
	movq	%r13, saved_cr4(%rip)
	movq	%rsp, saved_rsp(%rip)
	movq	%rax, saved_cr3(%rip)
	movq	%cr0, %rax
	movq	%rax, saved_cr0(%rip)

	/* save indirection list for jumping back */
	movq	%rdi, pa_backup_pages_map(%rip)

	/* Save the flags to %r11 as swap_pages clobbers %rcx. */
	movq	%rcx, %r11

	/* setup a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%rsi), %rsp

	/*
	 * jump to identity mapped page:
	 * %rsi = pa_control_page + (identity_mapped - __relocate_kernel_start)
	 */
0:	addq	$identity_mapped - 0b, %rsi
	subq	$__relocate_kernel_start - 0b, %rsi
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(relocate_kernel)

/*
 * identity_mapped - runs from the physical control page.
 *
 * Installs the debug GDT/IDT, forces CR0/CR4 into a known state, copies
 * (or swaps) the pages described by the indirection list, and then either
 *  - zeroes the general purpose registers and RETs to the start address
 *    (RELOC_KERNEL_PRESERVE_CONTEXT clear), or
 *  - calls the start address on the swap page stack and, when it returns,
 *    swaps the pages back and continues in virtual_mapped()
 *    (RELOC_KERNEL_PRESERVE_CONTEXT set).
 */
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi	indirection page
	 * %rdx start address
	 * %r9 page table page
	 * %r11 flags: RELOC_KERNEL_*
	 * %r13 original CR4 when relocate_kernel() was invoked
	 */

	/*
	 * Set return address to 0 if not preserving context. The purgatory
	 * shipped in kexec-tools will unconditionally look for the return
	 * address on the stack and set a kexec_jump_back_entry= command
	 * line option if it's non-zero. There's no other way that it can
	 * tell a preserve-context (kjump) kexec from a normal one.
	 */
	pushq	$0
	/* store the start address on the stack */
	pushq	%rdx

	/*
	 * Create a GDTR (16 bits limit, 64 bits addr) on stack:
	 * base = address of kexec_debug_gdt, limit = its first word.
	 */
	leaq	kexec_debug_gdt(%rip), %rax
	pushq	%rax
	pushw	(%rax)

	/* Load the GDT, put the stack back (8 + 2 bytes were pushed) */
	lgdt	(%rsp)
	addq	$10, %rsp

	/* Test that we can load segments */
	movq	%ds, %rax
	movq	%rax, %ds

	/*
	 * Now an IDTR on the stack to load the IDT the kernel created
	 * (base = kexec_debug_idt, limit = 0xff, matching its 0x100 bytes).
	 */
	leaq	kexec_debug_idt(%rip), %rsi
	pushq	%rsi
	pushw	$0xff
	lidt	(%rsp)
	addq	$10, %rsp

	//int3

	/*
	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
	 * below.
	 */
	movq	%cr4, %rax
	andq	$~(X86_CR4_CET), %rax
	movq	%rax, %cr4

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 *  - 5-level paging, if it was enabled before
	 *  - Machine check exception on TDX guest, if it was enabled before.
	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
	 *
	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
	 * PAE is always set in the original CR4.
	 */
	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
	movq	%r13, %cr4

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/*
	 * If the memory cache is in incoherent state, e.g., due to
	 * memory encryption, do WBINVD to flush cache.
	 *
	 * If SME is active, there could be old encrypted cache line
	 * entries that will conflict with the now unencrypted memory
	 * used by kexec. Flush the caches before copying the kernel.
	 *
	 * Note SME sets this flag to true when the platform supports
	 * SME, so the WBINVD is performed even if SME is not activated
	 * by the kernel. This does no harm.
	 */
	testb	$RELOC_KERNEL_CACHE_INCOHERENT, %r11b
	jz .Lnowbinvd
	wbinvd
.Lnowbinvd:

	/* %rdi = indirection page, %r11 = flags; %r11 survives the call */
	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB by reloading %cr3 here, it's handy,
	 * and not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
	jnz .Lrelocate

	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	xorl	%esi, %esi
	xorl	%edi, %edi
	xorl	%ebp, %ebp
	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	xorl	%r11d, %r11d
	xorl	%r12d, %r12d
	xorl	%r13d, %r13d
	xorl	%r14d, %r14d
	xorl	%r15d, %r15d

	/*
	 * RET pops the start address pushed at the top of this routine and
	 * leaves the zero "return address" on top of the stack.
	 */
	ANNOTATE_UNRET_SAFE
	ret
	int3

.Lrelocate:
	/* Recover the start address pushed at the top of this routine */
	popq	%rdx

	/* Use the swap page for the callee's stack */
	movq	kexec_pa_swap_page(%rip), %r10
	leaq	PAGE_SIZE(%r10), %rsp

	/* push the existing entry point onto the callee's stack */
	pushq	%rdx

	ANNOTATE_RETPOLINE_SAFE
	call	*%rdx

	/*
	 * get the re-entry point of the peer system
	 * NOTE(review): presumably the callee replaced the stack slot pushed
	 * above with its own re-entry address - confirm against the peer.
	 */
	popq	%rbp
	movq	kexec_pa_swap_page(%rip), %r10
	movq	pa_backup_pages_map(%rip), %rdi
	movq	kexec_pa_table_page(%rip), %rax
	movq	%rax, %cr3

	/* Find start (and end) of this physical mapping of control page */
	leaq	(%rip), %r8
	ANNOTATE_NOENDBR
	andq	$PAGE_MASK, %r8
	lea	PAGE_SIZE(%r8), %rsp
	/*
	 * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
	 * swap_pages() can swap pages correctly.  Note all other
	 * RELOC_KERNEL_* flags passed to relocate_kernel() are not
	 * restored.
	 */
	movl	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
	call	swap_pages

	/*
	 * Continue at virtual_mapped() through the virtual mapping:
	 * %rax = kexec_va_control_page + (virtual_mapped - __relocate_kernel_start),
	 * reached with a push/RET pair.
	 */
	movq	kexec_va_control_page(%rip), %rax
0:	addq	$virtual_mapped - 0b, %rax
	subq	$__relocate_kernel_start - 0b, %rax
	pushq	%rax
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(identity_mapped)

/*
 * virtual_mapped - tail of the preserve-context path.
 *
 * Restores the stack pointer and control registers saved by
 * relocate_kernel(), pops the registers it pushed, and returns to the
 * original caller with the peer's re-entry point (held in %rbp by
 * identity_mapped()) in %rax.
 */
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // RET target, above
	movq	saved_rsp(%rip), %rsp
	movq	saved_cr4(%rip), %rax
	movq	%rax, %cr4
	movq	saved_cr3(%rip), %rax
	movq	saved_cr0(%rip), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0

#ifdef CONFIG_KEXEC_JUMP
	/* Saved in save_processor_state. */
	movq	$saved_context, %rax
	lgdt	saved_context_gdt_desc(%rax)
#endif

	/* relocate_kernel() returns the re-entry point for next time */
	movq	%rbp, %rax

	/* Mirror image of the pushes at the top of relocate_kernel() */
	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(virtual_mapped)

	/* Do the copies */
/*
 * swap_pages - walk the indirection list and copy (or swap) pages.
 *
 * Each list entry is a page-aligned address whose low bits select its type:
 *   0x1 destination page, 0x2 indirection page, 0x4 done, 0x8 source page.
 * Entries with none of these bits set are skipped.
 *
 * For every source entry one page (512 quadwords, 4096 bytes) is copied
 * from the source to the current destination.  With
 * RELOC_KERNEL_PRESERVE_CONTEXT set the two pages are exchanged instead,
 * using kexec_pa_swap_page as the temporary.  After REP MOVSQ, %rdi
 * points just past the page written, so consecutive source entries land
 * in consecutive destination pages (assuming the direction flag is clear).
 *
 * In:    %rdi first indirection entry, %r11 flags
 * Clobb: %rax, %rbx, %rcx, %rdx, %rsi, %rdi, flags
 */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi indirection page
	 * %r11 flags: RELOC_KERNEL_*
	 */
	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
	xorl	%edi, %edi
	xorl	%esi, %esi
	jmp	.Lstart		/* Should start with an indirection record */

.Lloop:	/* top, read another word for the indirection page */

	movq	(%rbx), %rcx
	addq	$8,	%rbx
.Lstart:
	testb	$0x1,	%cl   /* is it a destination page? */
	jz	.Lnotdest
	movq	%rcx,	%rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	.Lloop
.Lnotdest:
	testb	$0x2,	%cl   /* is it an indirection page? */
	jz	.Lnotind
	movq	%rcx,   %rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	.Lloop
.Lnotind:
	testb	$0x4,	%cl   /* is it the done indicator? */
	jz	.Lnotdone
	jmp	.Ldone
.Lnotdone:
	testb	$0x8,	%cl   /* is it the source indicator? */
	jz	.Lloop	      /* Ignore it otherwise */
	movq	%rcx,   %rsi  /* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	movq	%rdi, %rdx    /* Save destination page to %rdx */
	movq	%rsi, %rax    /* Save source page to %rax */

	/* Only actually swap for ::preserve_context */
	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
	jz	.Lnoswap

	/* copy source page to swap page */
	movq	kexec_pa_swap_page(%rip), %rdi
	movl	$512,   %ecx
	rep movsq

	/* copy destination page to source page */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movl	$512,   %ecx
	rep movsq

	/* copy swap page to destination page */
	movq	%rdx, %rdi
	movq	kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
	/* Plain copy (no swap) or the final leg of the three-way swap */
	movl	$512,   %ecx
	rep movsq

	lea	PAGE_SIZE(%rax), %rsi
	jmp	.Lloop
.Ldone:
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(swap_pages)

/*
 * Generic 'print character' routine
 *  - %al: Character to be printed (may clobber %rax)
 *  - %rdx: MMIO address or port.
 */
#define XMTRDY          0x20

#define TXR             0       /*  Transmit register (WRITE) */
#define LSR             5       /*  Line Status               */

/*
 * pr_char_8250 - emit %al on a port-I/O UART whose base port is in %dx.
 *
 * The character is parked in %ah while the line status register is polled
 * until XMTRDY is set; %dx is returned to the base port before the write.
 * Clobb: %ah, flags
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	addw	$LSR, %dx
	xchg	%al, %ah
.Lxmtrdy_loop:
	inb	%dx, %al
	testb	$XMTRDY, %al
	jnz	.Lready
	pause
	jmp .Lxmtrdy_loop

.Lready:
	subw	$LSR, %dx
	xchg	%al, %ah
	outb	%al, %dx
/*
 * pr_char_null shares the RET below: pr_setup selects it as a do-nothing
 * print routine when neither a port nor an MMIO address is configured.
 */
pr_char_null:
	ANNOTATE_NOENDBR

	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250)

/*
 * pr_char_8250_mmio32 - emit %al on a memory-mapped UART based at %rdx.
 *
 * The status byte is read at offset LSR*4 and the character is written at
 * offset 0.
 * Clobb: %ah, flags
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
	movb	(LSR*4)(%rdx), %ah
	testb	$XMTRDY, %ah
	jnz	.Lready_mmio
	pause
	jmp .Lxmtrdy_loop_mmio

.Lready_mmio:
	movb	%al, (%rdx)
	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250_mmio32)

/*
 * Load pr_char function pointer into %rsi and load %rdx with whatever
 * that function wants to see there (typically port/MMIO address).
 *
 * Selection order: I/O port if kexec_debug_8250_port is non-zero, else MMIO
 * if kexec_debug_8250_mmio32 is non-zero, else pr_char_null.
 * Clobb: %rsi, %rdx, flags
 */
.macro pr_setup
	leaq	pr_char_8250(%rip), %rsi
	movw	kexec_debug_8250_port(%rip), %dx
	testw	%dx, %dx
	jnz	1f

	leaq	pr_char_8250_mmio32(%rip), %rsi
	movq	kexec_debug_8250_mmio32(%rip), %rdx
	testq	%rdx, %rdx
	jnz	1f

	leaq	pr_char_null(%rip), %rsi
1:
.endm

/*
 * Print the nybble in %bl, clobber %rax
 *
 * The low four bits of %bl become '0'-'9' or 'a'-'f'; the routine then
 * tail-jumps to the print routine in %rsi, whose RET returns to our caller.
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
	UNWIND_HINT_FUNC
	movb	%bl, %al
	nop
	andb	$0x0f, %al
	addb 	$0x30, %al
	cmpb	$0x3a, %al		/* below '9' + 1: already a decimal digit */
	jb	1f
	addb	$('a' - '0' - 10), %al
	ANNOTATE_RETPOLINE_SAFE
1:	jmp	*%rsi
SYM_CODE_END(pr_nybble)

/*
 * pr_qword - print %rbx as 16 hex digits followed by a newline.
 *
 * Each iteration rotates the next most significant nybble into the low
 * bits of %rbx before printing it; after 16 rotations %rbx holds its
 * original value again.  %rcx is the loop counter, which the print routines
 * in this file leave untouched.
 * In:    %rbx value, %rsi print routine, %rdx its port/MMIO argument
 * Clobb: %rax, %rcx, flags
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
	UNWIND_HINT_FUNC
	movq	$16, %rcx
1:	rolq	$4, %rbx
	call	pr_nybble
	loop	1b
	movb	$'\n', %al
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(pr_qword)

/*
 * print_reg a, b, c, d, r - print the four characters \a \b \c \d through
 * the routine in %rsi, then the 64-bit operand \r in hex via pr_qword.
 * Clobb: %rax, %rbx, %rcx, flags
 */
.macro print_reg a, b, c, d, r
	movb	$\a, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\b, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\c, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\d, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movq	\r, %rbx
	call	pr_qword
.endm

/*
 * kexec_debug_exc_vectors - entry stubs for exception vectors 0-15.
 *
 * Stub \exc is placed at kexec_debug_exc_vectors +
 * \exc * KEXEC_DEBUG_EXC_HANDLER_SIZE.  Every stub leaves the same frame for
 * exc_handler: an error code with the vector number pushed on top of it.
 */
SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
	/* Each of these is 6 bytes. */
/*
 * Vector that already has an error code on the stack.  The two NOPs
 * presumably take the place of the "pushq $0" in vec_noerr so that both stub
 * flavours have the same length - TODO confirm.
 */
.macro vec_err exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	nop
	nop
	pushq	$\exc
	jmp	exc_handler
.endm

/* Vector without an error code: push a zero in its place. */
.macro vec_noerr exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	pushq	$0
	pushq	$\exc
	jmp	exc_handler
.endm

	ANNOTATE_NOENDBR
	vec_noerr  0 // #DE
	vec_noerr  1 // #DB
	vec_noerr  2 // #NMI
	vec_noerr  3 // #BP
	vec_noerr  4 // #OF
	vec_noerr  5 // #BR
	vec_noerr  6 // #UD
	vec_noerr  7 // #NM
	vec_err    8 // #DF
	vec_noerr  9
	vec_err   10 // #TS
	vec_err   11 // #NP
	vec_err   12 // #SS
	vec_err   13 // #GP
	vec_err   14 // #PF
	vec_noerr 15
SYM_CODE_END(kexec_debug_exc_vectors)

/*
 * exc_handler - common exception handler for the debug vectors.
 *
 * Spills the registers the print helpers clobber, dumps the exception
 * frame and the general purpose registers over the debug UART, then
 * returns with IRETQ for vector 3 only; every other vector ends in a
 * HLT loop.
 */
SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
	/* No need for RET mitigations during kexec */
	VALIDATE_UNRET_END

	pushq	%rax
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	pushq	%rsi

	/* Stack frame (offsets from %rsp after the five pushes above) */
#define EXC_SS			0x58 /* Architectural... */
#define EXC_RSP			0x50
#define EXC_EFLAGS		0x48
#define EXC_CS			0x40
#define EXC_RIP			0x38
#define EXC_ERRORCODE		0x30 /* Either architectural or zero pushed by handler */
#define EXC_EXCEPTION		0x28 /* Pushed by handler entry point */
#define EXC_RAX			0x20 /* Pushed just above in exc_handler */
#define EXC_RBX			0x18
#define EXC_RCX			0x10
#define EXC_RDX			0x08
#define EXC_RSI			0x00

	/* Set up %rdx/%rsi for debug output */
	pr_setup

	/* rip and exception info */
	print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
	print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
	print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
	print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)

	/* We spilled these to the stack */
	print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
	print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
	print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
	print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
	print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)

	/* Other registers untouched */
	print_reg 'r', 'd', 'i', ':', %rdi
	print_reg 'r', '8', ' ', ':', %r8
	print_reg 'r', '9', ' ', ':', %r9
	print_reg 'r', '1', '0', ':', %r10
	print_reg 'r', '1', '1', ':', %r11
	print_reg 'r', '1', '2', ':', %r12
	print_reg 'r', '1', '3', ':', %r13
	print_reg 'r', '1', '4', ':', %r14
	print_reg 'r', '1', '5', ':', %r15
	print_reg 'c', 'r', '2', ':', %cr2

	/* Only return from INT3 */
	cmpq	$3, EXC_EXCEPTION(%rsp)
	jne	.Ldie

	popq	%rsi
	popq	%rdx
	popq	%rcx
	popq	%rbx
	popq	%rax

	/* Drop the vector number and error code pushed by the entry stub */
	addq	$16, %rsp
	iretq

.Ldie:
	hlt
	jmp	.Ldie

SYM_CODE_END(exc_handler)