/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>

#include <asm/entry-common.h>
#include <asm/syscall.h>

#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |	\
				 ARCH_SYSCALL_WORK_ENTER)
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
				 ARCH_SYSCALL_WORK_EXIT)

/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back to
 * ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif

bool syscall_user_dispatch(struct pt_regs *regs);
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);

/*
 * Hand the syscall number and its first four arguments to the audit
 * subsystem, but only when an audit context is active for this task.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

/*
 * Slow path of syscall entry: runs the SYSCALL_WORK_ENTER work items in
 * their required order. Returns the (possibly modified) syscall number,
 * or -1L when the syscall has to be skipped.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking enter_from_user_mode(), enabling interrupts and
 * extra architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	/* Fast path: no entry work pending, keep the syscall number as is */
	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, work);

	return syscall;
}

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static __always_inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back to
 * ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif

/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current pt_regs
 * @work:	Current thread syscall work
 *
 * Do one-time syscall specific work.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_syscall_exit(regs, syscall_get_return_value(current, regs));

	/* Report to ptrace for single-stepping and/or syscall tracing */
	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_ptrace_report_syscall_exit(regs, step);
}

/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* Catch syscalls which return to user space with IRQs still disabled */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	local_irq_disable_exit_to_user();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#endif