/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/static_call_types.h>
#include <linux/ptrace.h>
#include <linux/syscalls.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
#include <linux/context_tracking.h>
#include <linux/livepatch.h>
#include <linux/resume_user_mode.h>
#include <linux/tick.h>
#include <linux/kmsan.h>

#include <asm/entry-common.h>
#include <asm/syscall.h>

/*
 * Define dummy _TIF work flags if not defined by the architecture or for
 * disabled functionality.
 */
#ifndef _TIF_PATCH_PENDING
# define _TIF_PATCH_PENDING		(0)
#endif

#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 ARCH_SYSCALL_WORK_ENTER)
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
				 ARCH_SYSCALL_WORK_EXIT)

/*
 * TIF flags handled in exit_to_user_mode_loop()
 */
#ifndef ARCH_EXIT_TO_USER_MODE_WORK
# define ARCH_EXIT_TO_USER_MODE_WORK	(0)
#endif

#define EXIT_TO_USER_MODE_WORK						\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
	 ARCH_EXIT_TO_USER_MODE_WORK)

/**
 * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
 * @regs:	Pointer to current's pt_regs
 *
 * Defaults to an empty implementation. Can be replaced by architecture
 * specific code.
 *
 * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
 * section. Use __always_inline so the compiler cannot push it out of line
 * and make it instrumentable.
 */
static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs);

#ifndef arch_enter_from_user_mode
static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
#endif
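
/*
 * Example (illustrative sketch, not part of this header): an architecture
 * overrides the hook above from its asm/entry-common.h by providing the
 * inline and defining a macro of the same name before this header is
 * included. The sanity check shown is hypothetical.
 *
 *	static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
 *	{
 *		WARN_ON_ONCE(!user_mode(regs));
 *	}
 *	#define arch_enter_from_user_mode arch_enter_from_user_mode
 */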

/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct and interrupts are still
 * disabled. The subsequent functions can be instrumented.
 *
 * This is invoked when there is architecture specific functionality to be
 * done between establishing state and enabling interrupts. The caller must
 * enable interrupts before invoking syscall_enter_from_user_mode_work().
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
	arch_enter_from_user_mode(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(__ct_state() != CT_STATE_USER);
	user_exit_irqoff();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	instrumentation_end();
}

/**
 * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This handles lockdep, RCU (context tracking) and tracing state, i.e.
 * the functionality provided by enter_from_user_mode().
 *
 * This is invoked when there is extra architecture specific functionality
 * to be done between establishing state and handling user mode entry work.
 */
void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);

long syscall_trace_enter(struct pt_regs *regs, long syscall,
			 unsigned long work);

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
 * architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing() and
 *     trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is a combination of syscall_enter_from_user_mode_prepare() and
 * syscall_enter_from_user_mode_work().
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}
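
/*
 * Example (illustrative sketch, not part of this header): a typical
 * architecture C-level syscall entry built on the combined helper.
 * arch_syscall_entry(), regs->orig_r0 and invoke_syscall() are
 * hypothetical names.
 *
 *	__visible noinstr void arch_syscall_entry(struct pt_regs *regs)
 *	{
 *		long nr = syscall_enter_from_user_mode(regs, regs->orig_r0);
 *
 *		if (nr != -1)
 *			invoke_syscall(regs, nr);
 *
 *		syscall_exit_to_user_mode(regs);
 *	}
 */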

/**
 * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Defaults to local_irq_enable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_enable_exit_to_user(unsigned long ti_work);

#ifndef local_irq_enable_exit_to_user
static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
{
	local_irq_enable();
}
#endif

/**
 * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
 *
 * Defaults to local_irq_disable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_disable_exit_to_user(void);

#ifndef local_irq_disable_exit_to_user
static inline void local_irq_disable_exit_to_user(void)
{
	local_irq_disable();
}
#endif

/**
 * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
 *				 to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_loop() with interrupts enabled.
 *
 * Defaults to NOOP. Can be supplied by architecture specific code.
 */
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work);

#ifndef arch_exit_to_user_mode_work
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode_prepare - Architecture specific preparation for
 *				    exit to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_prepare() with interrupts disabled as the
 * last function before return. Defaults to NOOP.
 */
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work);

#ifndef arch_exit_to_user_mode_prepare
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode - Architecture specific final work before
 *			    exit to user mode.
 *
 * Invoked from exit_to_user_mode() with interrupts disabled as the last
 * function before return. Defaults to NOOP.
 *
 * This needs to be __always_inline because it is non-instrumentable code
 * invoked after context tracking switched to user mode.
 *
 * An architecture implementation must not do anything complex, no locking
 * etc. The main purpose is for speculation mitigations.
 */
static __always_inline void arch_exit_to_user_mode(void);

#ifndef arch_exit_to_user_mode
static __always_inline void arch_exit_to_user_mode(void) { }
#endif
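
/*
 * Example (hypothetical, for illustration only): an architecture could
 * override the hook above to issue a speculation barrier on the way out
 * to user space. mitigate_spectre_return() is a made-up name.
 *
 *	static __always_inline void arch_exit_to_user_mode(void)
 *	{
 *		mitigate_spectre_return();
 *	}
 *	#define arch_exit_to_user_mode arch_exit_to_user_mode
 */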

/**
 * arch_do_signal_or_restart - Architecture specific signal delivery function
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from exit_to_user_mode_loop().
 */
void arch_do_signal_or_restart(struct pt_regs *regs);

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs:	Pointer to pt_regs on entry stack
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 */
unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
				     unsigned long ti_work);

/**
 * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
 * @regs:	Pointer to pt_regs on entry stack
 *
 * 1) check that interrupts are disabled
 * 2) call tick_nohz_user_enter_prepare()
 * 3) call exit_to_user_mode_loop() if any flags from
 *    EXIT_TO_USER_MODE_WORK are set
 * 4) check that interrupts are still disabled
 */
static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work;

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	ti_work = read_thread_flags();
	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that kernel state is sane for a return to userspace */
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.: arch_exit_to_user_mode()
 * 4) Tell lockdep that interrupts are enabled
 *
 * Invoked from architecture specific code when syscall_exit_to_user_mode()
 * is not suitable as the last step before returning to userspace. Must be
 * invoked with interrupts disabled and the caller must be
 * non-instrumentable.
 * The caller has to invoke syscall_exit_to_user_mode_work() before this.
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare();
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}
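
/*
 * Example (illustrative sketch): the split exit sequence for an
 * architecture that must run its own non-instrumentable code between the
 * work handling and the final transition. arch_syscall_exit() and
 * arch_restore_special_state() are hypothetical names.
 *
 *	__visible noinstr void arch_syscall_exit(struct pt_regs *regs)
 *	{
 *		instrumentation_begin();
 *		syscall_exit_to_user_mode_work(regs);
 *		instrumentation_end();
 *
 *		arch_restore_special_state(regs);
 *		exit_to_user_mode();
 *	}
 */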

/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 * @work:	Current thread syscall work
 *
 * Do one-time syscall specific work.
 */
void syscall_exit_work(struct pt_regs *regs, unsigned long work);

/**
 * syscall_exit_to_user_mode_work - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Same as steps 1 and 2 of syscall_exit_to_user_mode() but without calling
 * exit_to_user_mode() to perform the final transition to user mode.
 *
 * Calling convention is the same as for syscall_exit_to_user_mode() and it
 * returns with all work handled and interrupts disabled. The caller must
 * invoke exit_to_user_mode() before actually switching to user mode to
 * make the final state transitions. Interrupts must stay disabled between
 * return from this function and the invocation of exit_to_user_mode().
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and
 * return-from-fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1,2) and
 * exit_to_user_mode(). This function is preferred unless there is a
 * compelling architectural reason to use the separate functions.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	exit_to_user_mode();
}

/**
 * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from architecture specific entry code with interrupts disabled.
 * Can only be called when the interrupt entry came from user mode. The
 * calling code must be non-instrumentable. When the function returns all
 * state is correct and the subsequent functions can be instrumented.
 *
 * The function establishes state (lockdep, RCU (context tracking), tracing).
 */
void irqentry_enter_from_user_mode(struct pt_regs *regs);

/**
 * irqentry_exit_to_user_mode - Interrupt exit work
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts disabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific interrupt
 * handling code.
 *
 * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
 * Interrupt exit does not invoke #1, which is the syscall specific one-time
 * work.
 */
void irqentry_exit_to_user_mode(struct pt_regs *regs);
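
/*
 * Example (illustrative sketch): an interrupt vector which can only be
 * entered from user mode pairs the two helpers above around the
 * instrumentable handler. arch_do_user_irq() and handle_user_irq() are
 * hypothetical names.
 *
 *	__visible noinstr void arch_do_user_irq(struct pt_regs *regs)
 *	{
 *		irqentry_enter_from_user_mode(regs);
 *
 *		instrumentation_begin();
 *		handle_user_irq(regs);
 *		instrumentation_end();
 *
 *		irqentry_exit_to_user_mode(regs);
 *	}
 */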

#ifndef irqentry_state
/**
 * struct irqentry_state - Opaque object for exception state storage
 * @exit_rcu:	Used exclusively in the irqentry_*() calls; signals whether the
 *		exit path has to invoke ct_irq_exit().
 * @lockdep:	Used exclusively in the irqentry_nmi_*() calls; ensures that
 *		lockdep state is restored correctly on exit from nmi.
 *
 * This opaque object is filled in by the irqentry_*_enter() functions and
 * must be passed back into the corresponding irqentry_*_exit() functions
 * when the exception is complete.
 *
 * Callers of irqentry_*_[enter|exit]() must consider this structure opaque
 * and all members private. Descriptions of the members are provided to aid in
 * the maintenance of the irqentry_*() functions.
 */
typedef struct irqentry_state {
	union {
		bool	exit_rcu;
		bool	lockdep;
	};
} irqentry_state_t;
#endif

/**
 * irqentry_enter - Handle state tracking on ordinary interrupt entries
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * As a precondition, this requires that the entry came from user mode,
 * idle, or a kernel context in which RCU is watching.
 *
 * For kernel mode entries RCU handling is done conditionally. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then ct_irq_enter() has to be
 * invoked on entry and ct_irq_exit() on exit.
 *
 * Avoiding the ct_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking ct_irq_enter() without undoing it.
 *
 * For user mode entries irqentry_enter_from_user_mode() is invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: An opaque object that must be passed to irqentry_exit().
 */
irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);

/**
 * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
 *
 * Conditional reschedule with additional sanity checks.
 */
void raw_irqentry_exit_cond_resched(void);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
#define irqentry_exit_cond_resched_dynamic_disabled	NULL
DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#define irqentry_exit_cond_resched()	static_call(irqentry_exit_cond_resched)()
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void);
#define irqentry_exit_cond_resched()	dynamic_irqentry_exit_cond_resched()
#endif
#else /* CONFIG_PREEMPT_DYNAMIC */
#define irqentry_exit_cond_resched()	raw_irqentry_exit_cond_resched()
#endif /* CONFIG_PREEMPT_DYNAMIC */

/**
 * irqentry_exit - Handle return from exception that used irqentry_enter()
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @state:	Return value from matching call to irqentry_enter()
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to irqentry_enter().
 */
void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
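
/*
 * Example (illustrative sketch): the common shape of an exception or
 * interrupt handler wrapper built on irqentry_enter()/irqentry_exit(),
 * usable for both kernel and user mode entries. arch_do_irq() and
 * handle_arch_irq() are hypothetical names.
 *
 *	__visible noinstr void arch_do_irq(struct pt_regs *regs)
 *	{
 *		irqentry_state_t state = irqentry_enter(regs);
 *
 *		instrumentation_begin();
 *		handle_arch_irq(regs);
 *		instrumentation_end();
 *
 *		irqentry_exit(regs, state);
 *	}
 */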

/**
 * irqentry_nmi_enter - Handle NMI entry
 * @regs:	Pointer to current's pt_regs
 *
 * Similar to irqentry_enter() but taking care of the NMI constraints.
 */
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);

/**
 * irqentry_nmi_exit - Handle return from NMI handling
 * @regs:	Pointer to pt_regs (NMI entry regs)
 * @irq_state:	Return value from matching call to irqentry_nmi_enter()
 *
 * Last action before returning to the low level assembly code.
 *
 * Counterpart to irqentry_nmi_enter().
 */
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);

#endif