/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
	unsigned long	s_granted;
	unsigned long	s_expired;
	unsigned long	s_revoked;
	unsigned long	s_yielded;
	unsigned long	s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif
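/*
 * Usage sketch, matching the users further down in this header:
 *
 *	rseq_stat_inc(rseq_stats.exit);
 */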

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

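/*
 * The translation unit which provides the out of line slow path defines
 * RSEQ_BUILD_SLOW_PATH before including this header. It gets regular out
 * of line instances of the helpers below (and of the debug functions
 * further down); everything else gets them force-inlined into the exit to
 * user mode path.
 */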
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
	return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
	if (!rseq_slice_extension_enabled())
		return false;

	if (likely(!current->rseq.slice.state.granted))
		return false;

	return __rseq_arm_slice_extension_timer();
}

static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
		rseq_stat_inc(rseq_stats.s_revoked);
	t->rseq.slice.state.granted = false;
}

static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl usr_ctrl;
	union rseq_slice_state state;
	struct rseq __user *rseq;

	if (!rseq_slice_extension_enabled())
		return false;

	/* If not enabled or not a return from interrupt, nothing to do. */
	state = curr->rseq.slice.state;
	state.enabled &= curr->rseq.event.user_irq;
	if (likely(!state.state))
		return false;

	rseq = curr->rseq.usrptr;
	scoped_user_rw_access(rseq, efault) {

		/*
		 * Quick check for conditions where a grant is not possible
		 * or needs to be revoked.
		 *
		 *  1) Any TIF bit which needs to do extra work aside from
		 *     rescheduling prevents a grant.
		 *
		 *  2) A previous rescheduling request resulted in a slice
		 *     extension grant.
		 */
		if (unlikely(work_pending || state.granted)) {
			/* Clear user control unconditionally. No point in checking. */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			rseq_slice_clear_grant(curr);
			return false;
		}

		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
		if (likely(!(usr_ctrl.request)))
			return false;

		/* Grant the slice extension */
		usr_ctrl.request = 0;
		usr_ctrl.granted = 1;
		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
	}

	rseq_stat_inc(rseq_stats.s_granted);

	curr->rseq.slice.state.granted = true;
	/* Store expiry time for arming the timer on the way out */
	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
	/*
	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
	 * several ways:
	 *
	 * 1)
	 *	CPU0			CPU1
	 *	clear_tsk()
	 *				set_tsk()
	 *	clear_preempt()
	 *				Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched() -> Folds correctly
	 * 2)
	 *	CPU0			CPU1
	 *				set_tsk()
	 *	clear_tsk()
	 *	clear_preempt()
	 *				Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched() <- NOOP as TIF_NEED_RESCHED is false
	 *
	 * #1 is no different from a regular remote reschedule: it sets the
	 *    previously unset bit and then raises the IPI, which folds it
	 *    into the preempt counter.
	 *
	 * #2 is obviously incorrect from a scheduler POV, but it is no more
	 *    incorrect than the code below clearing the reschedule request,
	 *    with the timer as a safety net.
	 *
	 * The important part is that the clearing is protected against the
	 * scheduler IPI and also against any other interrupt which might
	 * end up waking up a task and setting the bits in the middle of
	 * the operation:
	 *
	 *	clear_tsk()
	 *	---> Interrupt
	 *		wakeup_on_this_cpu()
	 *		set_tsk()
	 *		set_preempt()
	 *	clear_preempt()
	 *
	 * which would result in inconsistent state.
	 */
	scoped_guard(irq) {
		clear_tsk_need_resched(curr);
		clear_preempt_need_resched();
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}

static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
{
	if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
		hrtimer_rearm_deferred_tif(ti_work);
		return true;
	}
	return false;
}
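
/*
 * User space side sketch (illustrative only; the precise protocol is
 * defined by the uapi struct rseq::slice_ctrl, not by this header):
 *
 *	rseq->slice_ctrl.request = 1;	// ask for a slice extension
 *	critical_work();
 *	rseq->slice_ctrl.request = 0;
 *	if (rseq->slice_ctrl.granted)
 *		yield();		// hypothetical: hand the granted
 *					// extension back promptly
 */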

#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true if the section was valid and either the fixup or the
 * clear was done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It stays clear when the failure was an unresolved
 * page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */

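/*
 * Layout of the user space critical section descriptor referenced by
 * TLS::rseq::rseq_cs, as consumed by the checks below (see the uapi
 * definition of struct rseq_cs):
 *
 *	version, flags		- checked as one 64-bit word, must be 0
 *	start_ip		- first instruction of the critical section
 *	post_commit_offset	- section length; the section spans
 *				  [start_ip, start_ip + post_commit_offset)
 *	abort_ip		- resume address on abort, preceded by the
 *				  32-bit signature
 */
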
#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * On debug kernels, validate that user space did not mess with the IDs
 * if the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 *   - Must be in user space as x86 IRET would happily return to
		 *     the kernel.
		 *   - The four bytes preceding the instruction at @abort_ip must
		 *     contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited write abilities creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address, when @csaddr != NULL. This allows the ID update and the read
 * to share the same uaccess region, which spares a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out, which spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It stays clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

		/* Open coded, so it's in the same user access region */
		if (rseq_slice_extension_enabled()) {
			/* Unconditionally clear it, no point in conditionals */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	rseq_slice_clear_grant(t);
	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}


/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry did
	 * not come from a user space interrupt, unless debug mode is
	 * enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *	- Four 32-bit stores only if CPU ID / MM CID need to be updated
 *	- One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *     - One 64-bit load to retrieve the start IP
 *     - One 64-bit load to retrieve the offset for calculating the end
 *     - One 64-bit load to retrieve the abort IP
 *     - One 32-bit load to retrieve the signature
 *     - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled.
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If the IDs have not changed, rseq_event::user_irq must be
		 * true. See rseq_sched_switch_event().
		 */
		u64 csaddr;

		scoped_user_rw_access(rseq, efault) {
			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

			/* Open coded, so it's in the same user access region */
			if (rseq_slice_extension_enabled()) {
				/* Unconditionally clear it, no point in conditionals */
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			}
		}

		rseq_slice_clear_grant(t);

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id = task_cpu(t),
		.mm_cid = task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
efault:
	return false;
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor had the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing to do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely(t->rseq.event.sched_switch)) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (unlikely(test_tif_rseq(ti_work))) {
		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
			current->rseq.event.slowpath = true;
			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
			return true;
		}
		clear_tif_rseq();
	}
	/*
	 * Arm the slice extension timer if there is nothing left to do and
	 * the task really goes out to user space.
	 */
	return rseq_arm_slice_extension_timer();
}
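
/*
 * Usage sketch (illustrative, derived from the comments above rather than
 * copied from the actual entry code):
 *
 *	if (rseq_exit_to_user_mode_restart(regs, ti_work)) {
 *		// More work was flagged, e.g. TIF_NOTIFY_RESUME for the
 *		// rseq slow path. Go back into exit_to_user_mode_loop().
 *	}
 */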

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */