/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
	unsigned long	s_granted;
	unsigned long	s_expired;
	unsigned long	s_revoked;
	unsigned long	s_yielded;
	unsigned long	s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */
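
/*
 * Illustrative usage: rseq_stat_inc(rseq_stats.exit) increments the
 * per-CPU counter and compiles into nothing when CONFIG_RSEQ_STATS=n.
 */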

#ifdef CONFIG_RSEQ
#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
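
/*
 * Note: rseq_inline expands to nothing when this header is built into the
 * slow path translation unit (RSEQ_BUILD_SLOW_PATH), so the helpers marked
 * with it are emitted out of line there; everywhere else they are
 * force-inlined into the entry code fast path.
 */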
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
	return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
	if (!rseq_slice_extension_enabled())
		return false;

	if (likely(!current->rseq.slice.state.granted))
		return false;

	return __rseq_arm_slice_extension_timer();
}

static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
		rseq_stat_inc(rseq_stats.s_revoked);
	t->rseq.slice.state.granted = false;
}

/*
 * Open coded, so it can be invoked within a user access region.
 *
 * This clears the user space state of the time slice extension field only
 * when the task has registered the optimized RSEQ_ABI_V2. Some legacy
 * registrations, e.g. TCMalloc, have conflicting non-ABI fields in struct
 * rseq, which would be overwritten by an unconditional write.
 */
#define rseq_slice_clear_user(rseq, efault)				\
do {									\
	if (rseq_slice_extension_enabled())				\
		unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);	\
} while (0)
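
/*
 * Illustrative usage (see rseq_set_ids_get_csaddr() below) - must be
 * invoked from within a user access region:
 *
 *	scoped_user_rw_access(rseq, efault) {
 *		...
 *		rseq_slice_clear_user(rseq, efault);
 *	}
 */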

static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl usr_ctrl;
	union rseq_slice_state state;
	struct rseq __user *rseq;

	if (!rseq_slice_extension_enabled())
		return false;

	/* If not enabled or not a return from interrupt, nothing to do. */
	state = curr->rseq.slice.state;
	state.enabled &= curr->rseq.event.user_irq;
	if (likely(!state.state))
		return false;

	rseq = curr->rseq.usrptr;
	scoped_user_rw_access(rseq, efault) {

		/*
		 * Quick check for conditions where a grant is not possible
		 * or needs to be revoked:
		 *
		 *  1) Any TIF bit which needs to do extra work aside from
		 *     rescheduling prevents a grant.
		 *
		 *  2) A previous rescheduling request resulted in a slice
		 *     extension grant.
		 */
		if (unlikely(work_pending || state.granted)) {
			/* Clear user control unconditionally. No point in checking. */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			rseq_slice_clear_grant(curr);
			return false;
		}

		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
		if (likely(!usr_ctrl.request))
			return false;

		/* Grant the slice extension */
		usr_ctrl.request = 0;
		usr_ctrl.granted = 1;
		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
	}

	rseq_stat_inc(rseq_stats.s_granted);

	curr->rseq.slice.state.granted = true;
	/* Store expiry time for arming the timer on the way out */
	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
	/*
	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
	 * several ways:
	 *
	 * 1)
	 *	CPU0			CPU1
	 *	clear_tsk()
	 *				set_tsk()
	 *	clear_preempt()
	 *				Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched() -> Folds correctly
	 * 2)
	 *	CPU0			CPU1
	 *				set_tsk()
	 *	clear_tsk()
	 *	clear_preempt()
	 *				Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched() <- NOOP as TIF_NEED_RESCHED is false
	 *
	 * #1 is no different from a regular remote reschedule as it sets
	 *    the previously unset bit and then raises the IPI, which folds
	 *    it into the preempt counter.
	 *
	 * #2 is obviously incorrect from a scheduler POV, but it is no more
	 *    incorrect than the code below, which clears the reschedule
	 *    request with the safety net of the timer.
	 *
	 * The important part is that the clearing is protected against the
	 * scheduler IPI and also against any other interrupt which might
	 * end up waking up a task and setting the bits in the middle of
	 * the operation:
	 *
	 *	clear_tsk()
	 *	---> Interrupt
	 *		wakeup_on_this_cpu()
	 *		set_tsk()
	 *		set_preempt()
	 *	clear_preempt()
	 *
	 * which would be inconsistent state.
	 */
	scoped_guard(irq) {
		clear_tsk_need_resched(curr);
		clear_preempt_need_resched();
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}
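
/*
 * @mask is expected to select all work bits which require more than a
 * reschedule; any such pending work makes __rseq_grant_slice_extension()
 * deny or revoke the grant.
 */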
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
{
	if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
		hrtimer_rearm_deferred_tif(ti_work);
		return true;
	}
	return false;
}

#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#define rseq_slice_clear_user(rseq, efault) do { } while (0)
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true if the section was valid and either fixup or clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It remains clear when the failure was an unresolved
 * page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence, unresolved page
 * faults in task context are fatal too.
 */
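
/*
 * For reference, the layout of the user space descriptor (struct rseq_cs)
 * validated below:
 *
 *	start_ip           - address of the first instruction of the
 *			     critical section
 *	post_commit_offset - length of the section; IPs within
 *			     [start_ip, start_ip + post_commit_offset) are
 *			     considered inside
 *	abort_ip           - restart target, which must be directly preceded
 *			     by the 32-bit signature matching t->rseq.sig
 */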

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
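		/* Unsigned compare: ip < start_ip wraps and takes the clear path too */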
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 *   - Must be in user space as x86 IRET would happily return to
		 *     the kernel.
		 *   - The four bytes preceding the instruction at @abort_ip must
		 *     contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited write abilities creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address, when @csaddr != NULL. This allows the ID update and the read
 * to share the same uaccess region, sparing a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read away. This spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It remains clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence, unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	scoped_user_rw_access(rseq, efault) {
		/* Validate the R/O fields for debug and optimized mode */
		if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) {
			u32 cpu_id, uval;

			unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
			if (cpu_id != t->rseq.ids.cpu_id)
				goto die;
			unsafe_get_user(uval, &rseq->cpu_id, efault);
			if (uval != cpu_id)
				goto die;
			unsafe_get_user(uval, &rseq->node_id, efault);
			if (uval != t->rseq.ids.node_id)
				goto die;
			unsafe_get_user(uval, &rseq->mm_cid, efault);
			if (uval != t->rseq.ids.mm_cid)
				goto die;
		}

		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(ids->node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

		/* RSEQ ABI V2 only operations */
		if (rseq_v2(t))
			rseq_slice_clear_user(rseq, efault);
	}

	rseq_slice_clear_grant(t);
	/* Cache the new values */
	t->rseq.ids = *ids;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;

die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code, this makes
	 * it possible to skip the critical section check when the entry did
	 * not come from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one-time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *	- Four 32-bit stores only if CPU ID / MM CID need to be updated
 *	- One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *     - One 64-bit load to retrieve the start IP
 *     - One 64-bit load to retrieve the offset for calculating the end
 *     - One 64-bit load to retrieve the abort IP
 *     - One 32-bit load to retrieve the signature
 *     - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled
	 */
	guard(pagefault)();
	/*
	 * This optimization is only valid when the task registered for the
	 * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the original
	 * RSEQ implementation behaviour which unconditionally updated the IDs.
	 * rseq_sched_switch_event() ensures that legacy registrations always
	 * have both sched_switch and ids_changed set, which is compatible with
	 * the historical TIF_NOTIFY_RESUME behaviour.
	 */
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed, rseq_event::user_irq must be true.
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		scoped_user_rw_access(rseq, efault) {
			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

			/* RSEQ ABI V2 only operations */
			if (rseq_v2(t))
				rseq_slice_clear_user(rseq, efault);
		}

		rseq_slice_clear_grant(t);

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	unsigned int cpu = task_cpu(t);
	struct rseq_ids ids = {
		.cpu_id	 = cpu,
		.mm_cid	 = task_mm_cid(t),
		.node_id = cpu_to_node(cpu),
	};

	return rseq_update_usr(t, regs, &ids);
efault:
	return false;
}
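
/*
 * Returns true if the fast path update failed and the rseq slow path via
 * TIF_NOTIFY_RESUME has to take over, false when user space is in a
 * consistent state.
 */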
static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor had the flag
	 * enforced by the rseq syscall or execve, then there is nothing
	 * to do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing-to-do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely(t->rseq.event.sched_switch)) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (unlikely(test_tif_rseq(ti_work))) {
		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
			current->rseq.event.slowpath = true;
			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
			return true;
		}
		clear_tif_rseq();
	}
	/*
	 * Arm the slice extension timer if nothing to do anymore and the
	 * task really goes out to user space.
	 */
	return rseq_arm_slice_extension_timer();
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */