xref: /linux/include/linux/entry-common.h (revision e2683c8868d03382da7e1ce8453b543a043066d1)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef __LINUX_ENTRYCOMMON_H
3 #define __LINUX_ENTRYCOMMON_H
4 
5 #include <linux/audit.h>
6 #include <linux/irq-entry-common.h>
7 #include <linux/livepatch.h>
8 #include <linux/ptrace.h>
9 #include <linux/resume_user_mode.h>
10 #include <linux/seccomp.h>
11 #include <linux/sched.h>
12 
13 #include <asm/entry-common.h>
14 #include <asm/syscall.h>
15 
/* Architectures without uprobe support don't define _TIF_UPROBE; use 0 so the flag tests compile away. */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif
19 
/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode(), i.e. the
 * work items checked by syscall_trace_enter() on the syscall entry path.
 */
#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode(), i.e. the
 * work items checked by syscall_exit_work() on the syscall exit path.
 */
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP)
38 
/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 * @regs: Pointer to the register state at syscall entry
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif
57 
/* Out-of-line helpers invoked from syscall_trace_enter() and syscall_exit_work() below. */
bool syscall_user_dispatch(struct pt_regs *regs);
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);
61 
62 static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
63 {
64 	if (unlikely(audit_context())) {
65 		unsigned long args[6];
66 
67 		syscall_get_arguments(current, regs, args);
68 		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
69 	}
70 }
71 
/*
 * Handle all SYSCALL_WORK_ENTER work items for the current syscall.
 *
 * Returns the (possibly modified) syscall number to invoke, or -1L when
 * the syscall must be skipped (user dispatch, ptrace abort/emulation or
 * seccomp denial). The ordering of the handlers below is ABI relevant
 * and must not be changed.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch.  This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace: a nonzero report or SYSEMU emulation skips the syscall */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	/* A nonzero handler status takes precedence over the syscall number */
	return ret ? : syscall;
}
118 
119 /**
120  * syscall_enter_from_user_mode_work - Check and handle work before invoking
121  *				       a syscall
122  * @regs:	Pointer to currents pt_regs
123  * @syscall:	The syscall number
124  *
125  * Invoked from architecture specific syscall entry code with interrupts
126  * enabled after invoking enter_from_user_mode(), enabling interrupts and
127  * extra architecture specific work.
128  *
129  * Returns: The original or a modified syscall number
130  *
131  * If the returned syscall number is -1 then the syscall should be
132  * skipped. In this case the caller may invoke syscall_set_error() or
133  * syscall_set_return_value() first.  If neither of those are called and -1
134  * is returned, then the syscall will fail with ENOSYS.
135  *
136  * It handles the following work items:
137  *
138  *  1) syscall_work flag dependent invocations of
139  *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
140  *  2) Invocation of audit_syscall_entry()
141  */
142 static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
143 {
144 	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
145 
146 	if (work & SYSCALL_WORK_ENTER)
147 		syscall = syscall_trace_enter(regs, work);
148 
149 	return syscall;
150 }
151 
/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to currents pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	/* Establish kernel state (RCU, context tracking) before anything instrumentable */
	enter_from_user_mode(regs);

	instrumentation_begin();
	/* Interrupts must be enabled before the entry work runs */
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}
183 
184 /*
185  * If SYSCALL_EMU is set, then the only reason to report is when
186  * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
187  * instruction has been already reported in syscall_enter_from_user_mode().
188  */
189 static __always_inline bool report_single_step(unsigned long work)
190 {
191 	if (work & SYSCALL_WORK_SYSCALL_EMU)
192 		return false;
193 
194 	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
195 }
196 
/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 * @regs: Pointer to the register state at syscall exit
 * @step: Indicates a single-step exit rather than a normal syscall exit
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif
216 
/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current pt_regs
 * @work:	Current thread syscall work
 *
 * Do one-time syscall specific work: audit, syscall exit tracepoint and
 * the ptrace exit/single-step report.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			/* Clear the one-shot dispatch flag and skip all tracers */
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_syscall_exit(regs, syscall_get_return_value(current, regs));

	/* Report to ptrace when tracing or when a single-step trap is pending */
	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_ptrace_report_syscall_exit(regs, step);
}
250 
/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to currents pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	/* Sanity check: context tracking must still consider this kernel context */
	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		/* Catch syscalls which returned with interrupts disabled */
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}
281 
/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to currents pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *      - audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	/* Step 1: one-time syscall exit work, with interrupts enabled */
	syscall_exit_to_user_mode_work(regs);
	/* Step 2: disable interrupts and run the exit-to-user preparation */
	local_irq_disable();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Step 3: final non-instrumentable transition to user mode */
	exit_to_user_mode();
}
322 
323 #endif
324