/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>

#include <asm/entry-common.h>
#include <asm/syscall.h>

/* Architectures without uprobe support do not define _TIF_UPROBE. */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP)

/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 * @regs:	Pointer to the register state at syscall entry
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif

bool syscall_user_dispatch(struct pt_regs *regs);
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);

/*
 * Record the syscall and its first four arguments in the audit log, but
 * only when auditing is active for the current task.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

/*
 * Handle all SYSCALL_WORK_ENTER work items for @work. Returns -1L when
 * the syscall has to be skipped, otherwise the (possibly modified)
 * syscall number. The ordering of the individual items below is ABI and
 * must not be changed.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking enter_from_user_mode(), enabling interrupts and
 * extra architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, work);

	return syscall;
}

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled.
 * The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has been already reported in syscall_enter_from_user_mode().
 */
static __always_inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 * @regs:	Pointer to the register state at syscall exit
 * @step:	Indicates a single-step exit rather than a normal syscall exit
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif

/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current pt_regs
 * @work:	Current thread syscall work
 *
 * Do one-time syscall specific work.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_syscall_exit(regs, syscall_get_return_value(current, regs));

	/*
	 * Report to ptrace for either a single-step trap or plain syscall
	 * exit tracing; @step tells the tracer which one it was.
	 */
	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_ptrace_report_syscall_exit(regs, step);
}

/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* Catch syscalls which return to user space with IRQs still off */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	local_irq_disable();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#endif