/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - ENTRY/END:		Define functions in the symbol table.
 * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
 * - idtentry:		Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */

.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * When dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. In this time
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_OFF
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_ON
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
#endif

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
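 *
 * Purely as an illustration, a user-space caller typically reaches this
 * entry point with a sequence like the following (example values; "msg"
 * is a placeholder symbol, not something defined in this file):
 *
 *	movl	$1, %eax		# __NR_write
 *	movl	$1, %edi		# fd = stdout
 *	leaq	msg(%rip), %rsi		# buf
 *	movl	$14, %edx		# count
 *	syscall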
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	/*
	 * Re-enable interrupts.
	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
	 * must execute atomically in the face of possible interrupt-driven
	 * task preemption. We must enable interrupts only after we're done
	 * with using rsp_scratch:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	tracesys
entry_SYSCALL_64_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx
	call	*sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has incompletely filled pt_regs.
 */
	LOCKDEP_SYS_EXIT
	/*
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)

	/*
	 * We must check ti flags with interrupts (or at least preemption)
	 * off because we must *never* return to userspace without
	 * processing exit work that is enqueued if we're preempted here.
	 * In particular, returning to userspace with any of the one-shot
	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
	 * very bad.
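	 * (For example, if a flag such as TIF_NOTIFY_RESUME were set by an
	 * interrupt that lands between the flag test below and SYSRET, the
	 * work would be skipped; keeping interrupts off here closes that
	 * window.)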
	 */
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */

	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	movq	RSP(%rsp), %rsp
	/*
	 * 64-bit SYSRET restores rip from rcx,
	 * rflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * Restoration of rflags re-enables interrupts.
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized.  This means that we should
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector.  (All
	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
	 * from happening by reloading SS in __switch_to.  (Actually
	 * detecting the failure in 64-bit userspace is tricky but can be
	 * done.)
	 */
	USERGS_SYSRET64

	/* Do syscall entry tracing */
tracesys:
	movq	%rsp, %rdi
	movl	$AUDIT_ARCH_X86_64, %esi
	call	syscall_trace_enter_phase1
	test	%rax, %rax
	jnz	tracesys_phase2			/* if needed, run the slow path */
	RESTORE_C_REGS_EXCEPT_RAX		/* else restore clobbered regs */
	movq	ORIG_RAX(%rsp), %rax
	jmp	entry_SYSCALL_64_fastpath	/* and return to the fast path */

tracesys_phase2:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	movl	$AUDIT_ARCH_X86_64, %esi
	movq	%rax, %rdx
	call	syscall_trace_enter_phase2

	/*
	 * Reload registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter_phase2() returned
	 * the value it wants us to use in the table lookup.
	 */
	RESTORE_C_REGS_EXCEPT_RAX
	RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx			/* fixup for C */
	call	*sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	/* Use IRET because user could have changed pt_regs->foo */

/*
 * Syscall return path ending with IRET.
 * Has correct iret frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
	TRACE_IRQS_OFF
	movl	$_TIF_ALLWORK_MASK, %edi
	/* edi:	mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl	TI_flags(%rcx), %edx
	andl	%edi, %edx
	jnz	int_careful
	andl	$~TS_COMPAT, TI_status(%rcx)
	jmp	syscall_return

	/*
	 * Either reschedule or signal or syscall exit tracking needed.
	 * First do a reschedule test.
	 * edx:	work, edi: workmask
	 */
int_careful:
	bt	$TIF_NEED_RESCHED, %edx
	jnc	int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%rdi
	SCHEDULE_USER
	popq	%rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	int_with_check

	/* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	/* Check for syscall exit trace */
	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
	jz	int_signal
	pushq	%rdi
	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
	call	syscall_trace_leave
	popq	%rdi
	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
	jmp	int_restore_rest

int_signal:
	testl	$_TIF_DO_NOTIFY_MASK, %edx
	jz	1f
	movq	%rsp, %rdi			/* &ptregs -> arg1 */
	xorl	%esi, %esi			/* oldset -> arg2 */
	call	do_notify_resume
1:	movl	$_TIF_WORK_MASK, %edi
int_restore_rest:
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	int_with_check

syscall_return:
	/* The IRETQ could re-enable interrupts: */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11
	cmpq	%rcx, %r11			/* RCX == RIP */
	jne	opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif

	/* Change top 16 bits to be the sign-extension of 47th bit */
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	opportunistic_sysret_failed

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	opportunistic_sysret_failed

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.  This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions.  For example, single-stepping this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq	%r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWAPGS
	jmp	restore_c_regs_and_iret
END(entry_SYSCALL_64)


	.macro FORK_LIKE func
ENTRY(stub_\func)
	SAVE_EXTRA_REGS 8
	jmp	sys_\func
END(stub_\func)
	.endm

	FORK_LIKE  clone
	FORK_LIKE  fork
	FORK_LIKE  vfork

ENTRY(stub_execve)
	call	sys_execve
return_from_execve:
	testl	%eax, %eax
	jz	1f
	/* exec failed, can use fast SYSRET code path in this case */
	ret
1:
	/* must use IRET code path (pt_regs->cs may have changed) */
	addq	$8, %rsp
	ZERO_EXTRA_REGS
	movq	%rax, RAX(%rsp)
	jmp	int_ret_from_sys_call
END(stub_execve)
/*
 * Remaining execve stubs are only 7 bytes long.
 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
 */
	.align	8
GLOBAL(stub_execveat)
	call	sys_execveat
	jmp	return_from_execve
END(stub_execveat)

#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
	.align	8
GLOBAL(stub_x32_execve)
GLOBAL(stub32_execve)
	call	compat_sys_execve
	jmp	return_from_execve
END(stub32_execve)
END(stub_x32_execve)
	.align	8
GLOBAL(stub_x32_execveat)
GLOBAL(stub32_execveat)
	call	compat_sys_execveat
	jmp	return_from_execve
END(stub32_execveat)
END(stub_x32_execveat)
#endif

/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	/*
	 * SAVE_EXTRA_REGS result is not normally needed:
	 * sigreturn overwrites all pt_regs->GPREGS.
	 * But sigreturn can fail (!), and there is no easy way to detect that.
	 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
	 * we SAVE_EXTRA_REGS here.
	 */
	SAVE_EXTRA_REGS 8
	call	sys_rt_sigreturn
return_from_stub:
	addq	$8, %rsp
	RESTORE_EXTRA_REGS
	movq	%rax, RAX(%rsp)
	jmp	int_ret_from_sys_call
END(stub_rt_sigreturn)

#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
	SAVE_EXTRA_REGS 8
	call	sys32_x32_rt_sigreturn
	jmp	return_from_stub
END(stub_x32_rt_sigreturn)
#endif

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)

	LOCK ; btr $TIF_FORK, TI_flags(%r8)

	pushq	$0x0002
	popfq					/* reset kernel eflags */

	call	schedule_tail			/* rdi: 'prev' task parameter */

	RESTORE_EXTRA_REGS

	testb	$3, CS(%rsp)			/* from kernel_thread? */

	/*
	 * By the time we get here, we have no idea whether our pt_regs,
	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
	 * the slow path, or one of the 32-bit compat paths.
	 * Use IRET code path to return, since it can safely handle
	 * all of the above.
	 */
	jnz	int_ret_from_sys_call

	/*
	 * We came from kernel_thread
	 * nb: we depend on RESTORE_EXTRA_REGS above
	 */
	movq	%rbp, %rdi
	call	*%rbx
	movl	$0, RAX(%rsp)
	RESTORE_EXTRA_REGS
	jmp	int_ret_from_sys_call
END(ret_from_fork)

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
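 *
 * As an illustration, with FIRST_EXTERNAL_VECTOR == 0x20 (its usual value)
 * the first stub expands to roughly:
 *
 *	pushq	$0x5f			# ~0x20 + 0x80, stays in a signed byte
 *	jmp	common_interrupt
 *	.align	8
 *
 * and common_interrupt's "addq $-0x80, (%rsp)" below turns the pushed
 * value back into ~vector (here -0x21) before do_IRQ sees it.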
 */
	.align	8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	cld
	/*
	 * Since nothing in interrupt handling code touches r12...r15 members
	 * of "struct pt_regs", and since interrupts can nest, we can save
	 * four stack slots and simultaneously provide
	 * an unwind-friendly stack layout by saving "truncated" pt_regs
	 * exactly up to rbp slot, without these members.
	 */
	ALLOC_PT_GPREGS_ON_STACK -RBP
	SAVE_C_REGS -RBP
	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
	SAVE_EXTRA_REGS_RBP -RBP

	leaq	-RBP(%rsp), %rdi		/* arg1 for \func (pointer to pt_regs) */

	testb	$3, CS-RBP(%rsp)
	jz	1f
	SWAPGS
1:
	/*
	 * Save previous stack pointer, optionally switch to interrupt stack.
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
	movq	%rsp, %rsi
	incl	PER_CPU_VAR(irq_count)
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rsi
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF

	call	\func
	.endm

	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
	interrupt do_IRQ
	/* 0(%rsp): old RSP */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl	PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq	%rsi
	/* return code expects complete pt_regs - adjust rsp accordingly: */
	leaq	-RBP(%rsi), %rsp

	testb	$3, CS(%rsp)
	jz	retint_kernel
	/* Interrupt came from user space */
retint_user:
	GET_THREAD_INFO(%rcx)

	/* %rcx: thread info. Interrupts are off. */
retint_with_reschedule:
	movl	$_TIF_WORK_MASK, %edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl	TI_flags(%rcx), %edx
	andl	%edi, %edx
	jnz	retint_careful

retint_swapgs:					/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	SWAPGS
	jmp	restore_c_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
	jnc	1f
0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	1f
	call	preempt_schedule_irq
	jmp	0b
1:
#endif
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exception and from syscalls, merge.
 */
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN

ENTRY(native_iret)
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

.global native_irq_return_iret
native_irq_return_iret:
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in do_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	pushq	%rax
	pushq	%rdi
	SWAPGS
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* RAX */
	movq	(2*8)(%rsp), %rax		/* RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* CS */
	movq	%rax, (2*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(6*8)(%rsp), %rax		/* SS */
	movq	%rax, (5*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* RSP */
	movq	%rax, (4*8)(%rdi)
	andl	$0xffff0000, %eax
	popq	%rdi
	orq	PER_CPU_VAR(espfix_stack), %rax
	SWAPGS
	movq	%rax, %rsp
	popq	%rax
	jmp	native_irq_return_iret
#endif

	/* edi: workmask, edx: work */
retint_careful:
	bt	$TIF_NEED_RESCHED, %edx
	jnc	retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq	%rdi
	SCHEDULE_USER
	popq	%rdi
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp	retint_check

retint_signal:
	testl	$_TIF_DO_NOTIFY_MASK, %edx
	jz	retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq	$-1, ORIG_RAX(%rsp)
	xorl	%esi, %esi			/* oldset */
	movq	%rsp, %rdi			/* &pt_regs */
	call	do_notify_resume
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp	retint_with_reschedule

END(common_interrupt)

/*
 * APIC interrupts.
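 *
 * For example, "apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt
 * smp_apic_timer_interrupt" (used below) produces ENTRY(apic_timer_interrupt),
 * which pushes ~LOCAL_TIMER_VECTOR, runs the 'interrupt' macro to build
 * pt_regs and call smp_apic_timer_interrupt, then exits via ret_from_intr.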
 */
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
	ASM_CLAC
	pushq	$~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp	ret_from_intr
END(\sym)
.endm

#ifdef CONFIG_TRACING
#define trace(sym) trace_##sym
#define smp_trace(sym) smp_trace_##sym

.macro trace_apicinterrupt num sym
apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#else
.macro trace_apicinterrupt num sym do_sym
.endm
#endif

.macro apicinterrupt num sym do_sym
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
.endm

#ifdef CONFIG_SMP
apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR		irq_move_cleanup_interrupt	smp_irq_move_cleanup_interrupt
apicinterrupt3 REBOOT_VECTOR			reboot_interrupt		smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt3 UV_BAU_MESSAGE			uv_bau_message_intr1		uv_bau_message_interrupt
#endif

apicinterrupt LOCAL_TIMER_VECTOR		apic_timer_interrupt		smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR		x86_platform_ipi		smp_x86_platform_ipi

#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR		kvm_posted_intr_ipi		smp_kvm_posted_intr_ipi
apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR	kvm_posted_intr_wakeup_ipi	smp_kvm_posted_intr_wakeup_ipi
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR		threshold_interrupt		smp_threshold_interrupt
#endif

#ifdef CONFIG_X86_MCE_AMD
apicinterrupt DEFERRED_ERROR_VECTOR		deferred_error_interrupt	smp_deferred_error_interrupt
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR		thermal_interrupt		smp_thermal_interrupt
#endif

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR	call_function_single_interrupt	smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR		call_function_interrupt		smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR			reschedule_interrupt		smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR			error_interrupt			smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
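 *
 * For example, "idtentry general_protection do_general_protection
 * has_error_code=1" (used below) emits ENTRY(general_protection), which
 * builds pt_regs via error_entry, passes the pt_regs pointer in %rdi and
 * the error code in %rsi to do_general_protection(), and returns through
 * error_exit.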
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
	/* Sanity check */
	.if \shift_ist != -1 && \paranoid == 0
	.error "using shift_ist requires paranoid=1"
	.endif

	ASM_CLAC
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	.ifeq \has_error_code
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	.endif

	ALLOC_PT_GPREGS_ON_STACK

	.if \paranoid
	.if \paranoid == 1
	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
	jnz	1f
	.endif
	call	paranoid_entry
	.else
	call	error_entry
	.endif
	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

	.if \paranoid
	.if \shift_ist != -1
	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
	.else
	TRACE_IRQS_OFF
	.endif
	.endif

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	.if \shift_ist != -1
	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	call	\do_sym

	.if \shift_ist != -1
	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp	paranoid_exit
	.else
	jmp	error_exit
	.endif

	.if \paranoid == 1
	/*
	 * Paranoid entry from userspace.  Switch stacks and treat it
	 * as a normal entry.  This means that paranoid handlers
	 * run in real process context if user_mode(regs).
	 */
1:
	call	error_entry


	movq	%rsp, %rdi			/* pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	call	\do_sym

	jmp	error_exit			/* %ebx: no swapgs flag */
	.endif
END(\sym)
.endm

#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
.macro trace_idtentry sym do_sym has_error_code:req
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif

idtentry divide_error			do_divide_error			has_error_code=0
idtentry overflow			do_overflow			has_error_code=0
idtentry bounds				do_bounds			has_error_code=0
idtentry invalid_op			do_invalid_op			has_error_code=0
idtentry device_not_available		do_device_not_available		has_error_code=0
idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
idtentry segment_not_present		do_segment_not_present		has_error_code=1
idtentry spurious_interrupt_bug		do_spurious_interrupt_bug	has_error_code=0
idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
idtentry alignment_check		do_alignment_check		has_error_code=1
idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0


	/*
	 * Reload gs selector with exception handling
	 * edi:  new selector
	 */
ENTRY(native_load_gs_index)
	pushfq
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl	%edi, %gs
2:	mfence					/* workaround */
	SWAPGS
	popfq
	ret
END(native_load_gs_index)

	_ASM_EXTABLE(gs_change, bad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS					/* switch back to user gs */
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
	.previous

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
	pushq	%rbp
	mov	%rsp, %rbp
	incl	PER_CPU_VAR(irq_count)
	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
	push	%rbp				/* frame pointer backlink */
	call	__do_softirq
	leaveq
	decl	PER_CPU_VAR(irq_count)
	ret
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct pt_regs *) */

/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
11:	incl	PER_CPU_VAR(irq_count)
	movq	%rsp, %rbp
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rbp				/* frame pointer backlink */
	call	xen_evtchn_do_upcall
	popq	%rsp
	decl	PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
	call	xen_maybe_preempt_hcall
#endif
	jmp	error_exit
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/*
	 * All segments match their saved values => Category 2 (Bad IRET).
	 */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	pushq	%r11
	pushq	%rcx
	jmp	general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$-1				/* orig_ax = -1 => not a system call */
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	jmp	error_exit
END(xen_failsafe_callback)

apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */

idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment		do_stack_segment	has_error_code=1

#ifdef CONFIG_XEN
idtentry xen_debug		do_debug		has_error_code=0
idtentry xen_int3		do_int3			has_error_code=0
idtentry xen_stack_segment	do_stack_segment	has_error_code=1
#endif

idtentry general_protection	do_general_protection	has_error_code=1
trace_idtentry page_fault	do_page_fault		has_error_code=1

#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault	do_async_page_fault	has_error_code=1
#endif

#ifdef CONFIG_X86_MCE
idtentry machine_check		has_error_code=0	paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(paranoid_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:	ret
END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason
 * to try to handle preemption here.
 *
 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
 */
ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
	jmp	paranoid_exit_restore
paranoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN
END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Return: EBX=0: came from user mode; EBX=1: otherwise
 */
ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	error_kernelspace

	/* We entered from user mode */
	SWAPGS

error_entry_done:
	TRACE_IRQS_OFF
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.
	 * B stepping K8s sometimes report a truncated RIP for IRET
	 * exceptions returning to compat mode. Check for these here too.
	 */
error_kernelspace:
	incl	%ebx
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	error_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	bstep_iret
	cmpq	$gs_change, RIP+8(%rsp)
	jne	error_entry_done

	/*
	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.  We'll fix up the exception and land in
	 * gs_change's error handler with kernel gsbase.
	 */
	SWAPGS
	jmp	error_entry_done

bstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

error_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user gsbase.
	 * Switch to kernel gsbase:
	 */
	SWAPGS

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET and clear EBX so that
	 * error_exit knows that we will be returning to user mode.
	 */
	mov	%rsp, %rdi
	call	fixup_bad_iret
	mov	%rax, %rsp
	decl	%ebx
	jmp	error_entry_done
END(error_entry)


/*
 * On entry, EBX is a "return to kernel mode" flag:
 *   1: already in kernel mode, don't need SWAPGS
 *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
 */
ENTRY(error_exit)
	movl	%ebx, %eax
	RESTORE_EXTRA_REGS
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	%eax, %eax
	jnz	retint_kernel
	jmp	retint_user
END(error_exit)

/* Runs on exception stack */
ENTRY(nmi)
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *   Check a special location on the stack that contains
	 *   a variable that is set when NMIs are executing.
	 *   The interrupted task's stack is also checked to see if it
	 *   is an NMI stack.
	 *   If the variable is not set and the stack is not the NMI
	 *   stack then:
	 *     o Set the special variable on the stack
	 *     o Copy the interrupt frame into an "outermost" location on the
	 *       stack
	 *     o Copy the interrupt frame into an "iret" location on the stack
	 *     o Continue processing the NMI
	 *   If the variable is set or the previous stack is the NMI stack:
	 *     o Modify the "iret" location to jump to the repeat_nmi
	 *     o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction.  Similarly, IRET to user mode
	 * can fault.
	 * We therefore handle NMIs from user space like other IST entries.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode.  We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 */

	SWAPGS
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	pushq	5*8(%rdx)			/* pt_regs->ss */
	pushq	4*8(%rdx)			/* pt_regs->rsp */
	pushq	3*8(%rdx)			/* pt_regs->flags */
	pushq	2*8(%rdx)			/* pt_regs->cs */
	pushq	1*8(%rdx)			/* pt_regs->rip */
	pushq	$-1				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	(%rdx)				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	%rax				/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	pushq	%rbx				/* pt_regs->rbx */
	pushq	%rbp				/* pt_regs->rbp */
	pushq	%r12				/* pt_regs->r12 */
	pushq	%r13				/* pt_regs->r13 */
	pushq	%r14				/* pt_regs->r14 */
	pushq	%r15				/* pt_regs->r15 */

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.  Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 */
	SWAPGS
	jmp	restore_c_regs_and_iret

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                              |
	 * | original Return RSP                                      |
	 * | original RFLAGS                                          |
	 * | original CS                                              |
	 * | original RIP                                             |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                     |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                 |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame         |
	 * | iret Return RSP  } on each loop iteration; overwritten   |
	 * | iret RFLAGS      } by a nested NMI to force another      |
	 * | iret CS          } iteration if needed.                  |
	 * | iret RIP         }                                       |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;        |
	 * | outermost Return RSP  } will not be changed before       |
	 * | outermost RFLAGS      } NMI processing is done.          |
	 * | outermost CS          } Copied to "iret" frame on each   |
	 * | outermost RIP         } iteration.                       |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                  |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware.  Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI.  We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI.  That's okay; the outer NMI handler is
	 * about to call do_nmi anyway, so we can just resume
	 * the outer NMI.
	 */

	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing".  If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.  This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET.  We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets.  We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP.  We set DF before we clear
	 * "NMI executing".
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi			/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	INTERRUPT_RETURN

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away.  Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0				/* SS */
	pushq	%rsp				/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)			/* Fix up RSP */
	pushfq					/* RFLAGS */
	pushq	$__KERNEL_CS			/* CS */
	pushq	$1f				/* RIP */
	INTERRUPT_RETURN			/* continues at repeat_nmi below */
1:
#endif

repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration.  paranoid_entry will load the kernel
	 * gsbase if needed before we call do_nmi.  "NMI executing"
	 * is zero.
	 */
	movq	$1, 10*8(%rsp)			/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context, even with
	 * normal interrupts enabled.  An NMI should not be setting
	 * NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS

	/* Point RSP at the "iret" frame. */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

	/*
	 * Clear "NMI executing".  Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths.  On a native kernel, we
	 * could just inspect RIP, but, on paravirt kernels,
	 * INTERRUPT_RETURN can translate into a jump into a
	 * hypercall page.
	 */
	std
	movq	$0, 5*8(%rsp)			/* clear "NMI executing" */

	/*
	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
	 * stack in a single instruction.  We are returning to kernel
	 * mode, so this cannot result in a fault.
	 */
	INTERRUPT_RETURN
END(nmi)

ENTRY(ignore_sysret)
	mov	$-ENOSYS, %eax
	sysret
END(ignore_sysret)