xref: /linux/include/linux/rseq_entry.h (revision 61706251492eff650e91c58507bc77e1a12c7fbb)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _LINUX_RSEQ_ENTRY_H
3 #define _LINUX_RSEQ_ENTRY_H
4 
5 /* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
6 #ifdef CONFIG_RSEQ_STATS
7 #include <linux/percpu.h>
8 
/*
 * Event counters for RSEQ statistics. One instance exists per CPU and is
 * updated via rseq_stat_inc().
 *
 * Counters without a description are not updated in this header.
 */
struct rseq_stats {
	unsigned long	exit;		/* Exit to user mode helper invoked */
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;	/* Inlined exit path saw a sched_switch event */
	unsigned long	ids;		/* CPU ID, node ID and MM CID written to user space */
	unsigned long	cs;		/* Critical section descriptor evaluated */
	unsigned long	clear;		/* IP outside of the section, rseq_cs cleared */
	unsigned long	fixup;		/* IP inside of the section, IP set to abort IP */
	unsigned long	s_granted;	/* Slice extension granted */
	unsigned long	s_expired;
	unsigned long	s_revoked;	/* Granted slice extension cleared by the kernel */
	unsigned long	s_yielded;
	unsigned long	s_aborted;
};
24 
25 DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
26 
27 /*
28  * Slow path has interrupts and preemption enabled, but the fast path
29  * runs with interrupts disabled so there is no point in having the
30  * preemption checks implied in __this_cpu_inc() for every operation.
31  */
32 #ifdef RSEQ_BUILD_SLOW_PATH
33 #define rseq_stat_inc(which)	this_cpu_inc((which))
34 #else
35 #define rseq_stat_inc(which)	raw_cpu_inc((which))
36 #endif
37 
38 #else /* CONFIG_RSEQ_STATS */
/* Statistics disabled: Expands to an empty statement, @x is not evaluated */
#define rseq_stat_inc(x)	do { } while (0)
40 #endif /* !CONFIG_RSEQ_STATS */
41 
42 #ifdef CONFIG_RSEQ
43 #include <linux/jump_label.h>
44 #include <linux/rseq.h>
45 #include <linux/sched/signal.h>
46 #include <linux/uaccess.h>
47 
48 #include <linux/tracepoint-defs.h>
49 
50 #ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
/*
 * Not defined in this header. The inline wrappers in this header invoke
 * them only when the related tracepoint is enabled.
 */
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);
56 
rseq_trace_update(struct task_struct * t,struct rseq_ids * ids)57 static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
58 {
59 	if (tracepoint_enabled(rseq_update) && ids)
60 		__rseq_trace_update(t);
61 }
62 
rseq_trace_ip_fixup(unsigned long ip,unsigned long start_ip,unsigned long offset,unsigned long abort_ip)63 static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
64 				       unsigned long offset, unsigned long abort_ip)
65 {
66 	if (tracepoint_enabled(rseq_ip_fixup))
67 		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
68 }
69 
#else /* CONFIG_TRACEPOINTS */
/* Tracepoints disabled: The trace helpers are empty stubs */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */
75 
/* Static key which selects the debug code paths in this header */
DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
77 
/*
 * Inline qualifier for the functions which are shared between the slow
 * path build and the fast path: Empty when RSEQ_BUILD_SLOW_PATH is
 * defined, __always_inline otherwise.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif
83 
84 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
85 DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);
86 
rseq_slice_extension_enabled(void)87 static __always_inline bool rseq_slice_extension_enabled(void)
88 {
89 	return static_branch_likely(&rseq_slice_extension_key);
90 }
91 
92 extern unsigned int rseq_slice_ext_nsecs;
93 bool __rseq_arm_slice_extension_timer(void);
94 
rseq_arm_slice_extension_timer(void)95 static __always_inline bool rseq_arm_slice_extension_timer(void)
96 {
97 	if (!rseq_slice_extension_enabled())
98 		return false;
99 
100 	if (likely(!current->rseq.slice.state.granted))
101 		return false;
102 
103 	return __rseq_arm_slice_extension_timer();
104 }
105 
rseq_slice_clear_grant(struct task_struct * t)106 static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
107 {
108 	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
109 		rseq_stat_inc(rseq_stats.s_revoked);
110 	t->rseq.slice.state.granted = false;
111 }
112 
rseq_grant_slice_extension(bool work_pending)113 static __always_inline bool rseq_grant_slice_extension(bool work_pending)
114 {
115 	struct task_struct *curr = current;
116 	struct rseq_slice_ctrl usr_ctrl;
117 	union rseq_slice_state state;
118 	struct rseq __user *rseq;
119 
120 	if (!rseq_slice_extension_enabled())
121 		return false;
122 
123 	/* If not enabled or not a return from interrupt, nothing to do. */
124 	state = curr->rseq.slice.state;
125 	state.enabled &= curr->rseq.event.user_irq;
126 	if (likely(!state.state))
127 		return false;
128 
129 	rseq = curr->rseq.usrptr;
130 	scoped_user_rw_access(rseq, efault) {
131 
132 		/*
133 		 * Quick check conditions where a grant is not possible or
134 		 * needs to be revoked.
135 		 *
136 		 *  1) Any TIF bit which needs to do extra work aside of
137 		 *     rescheduling prevents a grant.
138 		 *
139 		 *  2) A previous rescheduling request resulted in a slice
140 		 *     extension grant.
141 		 */
142 		if (unlikely(work_pending || state.granted)) {
143 			/* Clear user control unconditionally. No point for checking */
144 			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
145 			rseq_slice_clear_grant(curr);
146 			return false;
147 		}
148 
149 		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
150 		if (likely(!(usr_ctrl.request)))
151 			return false;
152 
153 		/* Grant the slice extention */
154 		usr_ctrl.request = 0;
155 		usr_ctrl.granted = 1;
156 		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
157 	}
158 
159 	rseq_stat_inc(rseq_stats.s_granted);
160 
161 	curr->rseq.slice.state.granted = true;
162 	/* Store expiry time for arming the timer on the way out */
163 	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
164 	/*
165 	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
166 	 * several ways:
167 	 *
168 	 * 1)
169 	 *	CPU0			CPU1
170 	 *	clear_tsk()
171 	 *				set_tsk()
172 	 *	clear_preempt()
173 	 *				Raise scheduler IPI on CPU0
174 	 *	--> IPI
175 	 *	    fold_need_resched() -> Folds correctly
176 	 * 2)
177 	 *	CPU0			CPU1
178 	 *				set_tsk()
179 	 *	clear_tsk()
180 	 *	clear_preempt()
181 	 *				Raise scheduler IPI on CPU0
182 	 *	--> IPI
183 	 *	    fold_need_resched() <- NOOP as TIF_NEED_RESCHED is false
184 	 *
185 	 * #1 is not any different from a regular remote reschedule as it
186 	 *    sets the previously not set bit and then raises the IPI which
187 	 *    folds it into the preempt counter
188 	 *
189 	 * #2 is obviously incorrect from a scheduler POV, but it's not
190 	 *    differently incorrect than the code below clearing the
191 	 *    reschedule request with the safety net of the timer.
192 	 *
193 	 * The important part is that the clearing is protected against the
194 	 * scheduler IPI and also against any other interrupt which might
195 	 * end up waking up a task and setting the bits in the middle of
196 	 * the operation:
197 	 *
198 	 *	clear_tsk()
199 	 *	---> Interrupt
200 	 *		wakeup_on_this_cpu()
201 	 *		set_tsk()
202 	 *		set_preempt()
203 	 *	clear_preempt()
204 	 *
205 	 * which would be inconsistent state.
206 	 */
207 	scoped_guard(irq) {
208 		clear_tsk_need_resched(curr);
209 		clear_preempt_need_resched();
210 	}
211 	return true;
212 
213 efault:
214 	force_sig(SIGSEGV);
215 	return false;
216 }
217 
218 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
rseq_slice_extension_enabled(void)219 static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
rseq_arm_slice_extension_timer(void)220 static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
rseq_slice_clear_grant(struct task_struct * t)221 static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
rseq_grant_slice_extension(bool work_pending)222 static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
223 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
224 
/*
 * Debug variants. This header provides the definitions only when
 * RSEQ_BUILD_SLOW_PATH is defined.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);
227 
rseq_note_user_irq_entry(void)228 static __always_inline void rseq_note_user_irq_entry(void)
229 {
230 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
231 		current->rseq.event.user_irq = true;
232 }
233 
/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true, if the section was valid and either fixup or clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It's clear when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
258  */
259 
260 #ifdef RSEQ_BUILD_SLOW_PATH
261 /*
262  * The debug version is put out of line, but kept here so the code stays
263  * together.
264  *
265  * @csaddr has already been checked by the caller to be in user space
266  */
rseq_debug_update_user_cs(struct task_struct * t,struct pt_regs * regs,unsigned long csaddr)267 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
268 			       unsigned long csaddr)
269 {
270 	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
271 	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
272 	unsigned long ip = instruction_pointer(regs);
273 	u64 __user *uc_head = (u64 __user *) ucs;
274 	u32 usig, __user *uc_sig;
275 
276 	scoped_user_rw_access(ucs, efault) {
277 		/*
278 		 * Evaluate the user pile and exit if one of the conditions
279 		 * is not fulfilled.
280 		 */
281 		unsafe_get_user(start_ip, &ucs->start_ip, efault);
282 		if (unlikely(start_ip >= tasksize))
283 			goto die;
284 		/* If outside, just clear the critical section. */
285 		if (ip < start_ip)
286 			goto clear;
287 
288 		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
289 		cs_end = start_ip + offset;
290 		/* Check for overflow and wraparound */
291 		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
292 			goto die;
293 
294 		/* If not inside, clear it. */
295 		if (ip >= cs_end)
296 			goto clear;
297 
298 		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
299 		/* Ensure it's "valid" */
300 		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
301 			goto die;
302 		/* Validate that the abort IP is not in the critical section */
303 		if (unlikely(abort_ip - start_ip < offset))
304 			goto die;
305 
306 		/*
307 		 * Check version and flags for 0. No point in emitting
308 		 * deprecated warnings before dying. That could be done in
309 		 * the slow path eventually, but *shrug*.
310 		 */
311 		unsafe_get_user(head, uc_head, efault);
312 		if (unlikely(head))
313 			goto die;
314 
315 		/* abort_ip - 4 is >= 0. See abort_ip check above */
316 		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
317 		unsafe_get_user(usig, uc_sig, efault);
318 		if (unlikely(usig != t->rseq.sig))
319 			goto die;
320 
321 		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
322 		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
323 			/* If not in interrupt from user context, let it die */
324 			if (unlikely(!t->rseq.event.user_irq))
325 				goto die;
326 		}
327 		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
328 		instruction_pointer_set(regs, (unsigned long)abort_ip);
329 		rseq_stat_inc(rseq_stats.fixup);
330 		break;
331 	clear:
332 		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
333 		rseq_stat_inc(rseq_stats.clear);
334 		abort_ip = 0ULL;
335 	}
336 
337 	if (unlikely(abort_ip))
338 		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
339 	return true;
340 die:
341 	t->rseq.event.fatal = true;
342 efault:
343 	return false;
344 }
345 
346 /*
347  * On debug kernels validate that user space did not mess with it if the
348  * debug branch is enabled.
349  */
rseq_debug_validate_ids(struct task_struct * t)350 bool rseq_debug_validate_ids(struct task_struct *t)
351 {
352 	struct rseq __user *rseq = t->rseq.usrptr;
353 	u32 cpu_id, uval, node_id;
354 
355 	/*
356 	 * On the first exit after registering the rseq region CPU ID is
357 	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
358 	 */
359 	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
360 		  cpu_to_node(t->rseq.ids.cpu_id) : 0;
361 
362 	scoped_user_read_access(rseq, efault) {
363 		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
364 		if (cpu_id != t->rseq.ids.cpu_id)
365 			goto die;
366 		unsafe_get_user(uval, &rseq->cpu_id, efault);
367 		if (uval != cpu_id)
368 			goto die;
369 		unsafe_get_user(uval, &rseq->node_id, efault);
370 		if (uval != node_id)
371 			goto die;
372 		unsafe_get_user(uval, &rseq->mm_cid, efault);
373 		if (uval != t->rseq.ids.mm_cid)
374 			goto die;
375 	}
376 	return true;
377 die:
378 	t->rseq.event.fatal = true;
379 efault:
380 	return false;
381 }
382 
383 #endif /* RSEQ_BUILD_SLOW_PATH */
384 
385 /*
386  * This only ensures that abort_ip is in the user address space and
387  * validates that it is preceded by the signature.
388  *
389  * No other sanity checks are done here, that's what the debug code is for.
390  */
391 static rseq_inline bool
rseq_update_user_cs(struct task_struct * t,struct pt_regs * regs,unsigned long csaddr)392 rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
393 {
394 	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
395 	unsigned long ip = instruction_pointer(regs);
396 	unsigned long tasksize = TASK_SIZE;
397 	u64 start_ip, abort_ip, offset;
398 	u32 usig, __user *uc_sig;
399 
400 	rseq_stat_inc(rseq_stats.cs);
401 
402 	if (unlikely(csaddr >= tasksize)) {
403 		t->rseq.event.fatal = true;
404 		return false;
405 	}
406 
407 	if (static_branch_unlikely(&rseq_debug_enabled))
408 		return rseq_debug_update_user_cs(t, regs, csaddr);
409 
410 	scoped_user_rw_access(ucs, efault) {
411 		unsafe_get_user(start_ip, &ucs->start_ip, efault);
412 		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
413 		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
414 
415 		/*
416 		 * No sanity checks. If user space screwed it up, it can
417 		 * keep the pieces. That's what debug code is for.
418 		 *
419 		 * If outside, just clear the critical section.
420 		 */
421 		if (ip - start_ip >= offset)
422 			goto clear;
423 
424 		/*
425 		 * Two requirements for @abort_ip:
426 		 *   - Must be in user space as x86 IRET would happily return to
427 		 *     the kernel.
428 		 *   - The four bytes preceding the instruction at @abort_ip must
429 		 *     contain the signature.
430 		 *
431 		 * The latter protects against the following attack vector:
432 		 *
433 		 * An attacker with limited abilities to write, creates a critical
434 		 * section descriptor, sets the abort IP to a library function or
435 		 * some other ROP gadget and stores the address of the descriptor
436 		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
437 		 * protection.
438 		 */
439 		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
440 			goto die;
441 
442 		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
443 		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
444 		unsafe_get_user(usig, uc_sig, efault);
445 		if (unlikely(usig != t->rseq.sig))
446 			goto die;
447 
448 		/* Invalidate the critical section */
449 		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
450 		/* Update the instruction pointer */
451 		instruction_pointer_set(regs, (unsigned long)abort_ip);
452 		rseq_stat_inc(rseq_stats.fixup);
453 		break;
454 	clear:
455 		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
456 		rseq_stat_inc(rseq_stats.clear);
457 		abort_ip = 0ULL;
458 	}
459 
460 	if (unlikely(abort_ip))
461 		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
462 	return true;
463 die:
464 	t->rseq.event.fatal = true;
465 efault:
466 	return false;
467 }
468 
469 /*
470  * Updates CPU ID, Node ID and MM CID and reads the critical section
471  * address, when @csaddr != NULL. This allows to put the ID update and the
472  * read under the same uaccess region to spare a separate begin/end.
473  *
474  * As this is either invoked from a C wrapper with @csaddr = NULL or from
475  * the fast path code with a valid pointer, a clever compiler should be
476  * able to optimize the read out. Spares a duplicate implementation.
477  *
478  * Returns true, if the operation was successful, false otherwise.
479  *
480  * In the failure case task::rseq_event::fatal is set when invalid data
481  * was found on debug kernels. It's clear when the failure was an unresolved page
482  * fault.
483  *
484  * If inlined into the exit to user path with interrupts disabled, the
485  * caller has to protect against page faults with pagefault_disable().
486  *
487  * In preemptible task context this would be counterproductive as the page
488  * faults could not be fully resolved. As a consequence unresolved page
489  * faults in task context are fatal too.
490  */
491 static rseq_inline
rseq_set_ids_get_csaddr(struct task_struct * t,struct rseq_ids * ids,u32 node_id,u64 * csaddr)492 bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
493 			     u32 node_id, u64 *csaddr)
494 {
495 	struct rseq __user *rseq = t->rseq.usrptr;
496 
497 	if (static_branch_unlikely(&rseq_debug_enabled)) {
498 		if (!rseq_debug_validate_ids(t))
499 			return false;
500 	}
501 
502 	scoped_user_rw_access(rseq, efault) {
503 		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
504 		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
505 		unsafe_put_user(node_id, &rseq->node_id, efault);
506 		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
507 		if (csaddr)
508 			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
509 
510 		/* Open coded, so it's in the same user access region */
511 		if (rseq_slice_extension_enabled()) {
512 			/* Unconditionally clear it, no point in conditionals */
513 			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
514 		}
515 	}
516 
517 	rseq_slice_clear_grant(t);
518 	/* Cache the new values */
519 	t->rseq.ids.cpu_cid = ids->cpu_cid;
520 	rseq_stat_inc(rseq_stats.ids);
521 	rseq_trace_update(t, ids);
522 	return true;
523 efault:
524 	return false;
525 }
526 
527 /*
528  * Update user space with new IDs and conditionally check whether the task
529  * is in a critical section.
530  */
rseq_update_usr(struct task_struct * t,struct pt_regs * regs,struct rseq_ids * ids,u32 node_id)531 static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
532 					struct rseq_ids *ids, u32 node_id)
533 {
534 	u64 csaddr;
535 
536 	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
537 		return false;
538 
539 	/*
540 	 * On architectures which utilize the generic entry code this
541 	 * allows to skip the critical section when the entry was not from
542 	 * a user space interrupt, unless debug mode is enabled.
543 	 */
544 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
545 		if (!static_branch_unlikely(&rseq_debug_enabled)) {
546 			if (likely(!t->rseq.event.user_irq))
547 				return true;
548 		}
549 	}
550 	if (likely(!csaddr))
551 		return true;
552 	/* Sigh, this really needs to do work */
553 	return rseq_update_user_cs(t, regs, csaddr);
554 }
555 
556 /*
557  * If you want to use this then convert your architecture to the generic
558  * entry code. I'm tired of building workarounds for people who can't be
559  * bothered to make the maintenance of generic infrastructure less
560  * burdensome. Just sucking everything into the architecture code and
561  * thereby making others chase the horrible hacks and keep them working is
562  * neither acceptable nor sustainable.
563  */
564 #ifdef CONFIG_GENERIC_ENTRY
565 
566 /*
567  * This is inlined into the exit path because:
568  *
569  * 1) It's a one time comparison in the fast path when there is no event to
570  *    handle
571  *
572  * 2) The access to the user space rseq memory (TLS) is unlikely to fault
573  *    so the straight inline operation is:
574  *
575  *	- Four 32-bit stores only if CPU ID/ MM CID need to be updated
576  *	- One 64-bit load to retrieve the critical section address
577  *
578  * 3) In the unlikely case that the critical section address is != NULL:
579  *
580  *     - One 64-bit load to retrieve the start IP
581  *     - One 64-bit load to retrieve the offset for calculating the end
582  *     - One 64-bit load to retrieve the abort IP
583  *     - One 64-bit load to retrieve the signature
584  *     - One store to clear the critical section address
585  *
586  * The non-debug case implements only the minimal required checking. It
587  * provides protection against a rogue abort IP in kernel space, which
588  * would be exploitable at least on x86, and also against a rogue CS
589  * descriptor by checking the signature at the abort IP. Any fallout from
590  * invalid critical section descriptors is a user space problem. The debug
591  * case provides the full set of checks and terminates the task if a
592  * condition is not met.
593  *
594  * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
595  * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
596  * slow path there will handle the failure.
597  */
rseq_exit_user_update(struct pt_regs * regs,struct task_struct * t)598 static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
599 {
600 	/*
601 	 * Page faults need to be disabled as this is called with
602 	 * interrupts disabled
603 	 */
604 	guard(pagefault)();
605 	if (likely(!t->rseq.event.ids_changed)) {
606 		struct rseq __user *rseq = t->rseq.usrptr;
607 		/*
608 		 * If IDs have not changed rseq_event::user_irq must be true
609 		 * See rseq_sched_switch_event().
610 		 */
611 		u64 csaddr;
612 
613 		scoped_user_rw_access(rseq, efault) {
614 			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);
615 
616 			/* Open coded, so it's in the same user access region */
617 			if (rseq_slice_extension_enabled()) {
618 				/* Unconditionally clear it, no point in conditionals */
619 				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
620 			}
621 		}
622 
623 		rseq_slice_clear_grant(t);
624 
625 		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
626 			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
627 				return false;
628 		}
629 		return true;
630 	}
631 
632 	struct rseq_ids ids = {
633 		.cpu_id = task_cpu(t),
634 		.mm_cid = task_mm_cid(t),
635 	};
636 	u32 node_id = cpu_to_node(ids.cpu_id);
637 
638 	return rseq_update_usr(t, regs, &ids, node_id);
639 efault:
640 	return false;
641 }
642 
__rseq_exit_to_user_mode_restart(struct pt_regs * regs)643 static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
644 {
645 	struct task_struct *t = current;
646 
647 	/*
648 	 * If the task did not go through schedule or got the flag enforced
649 	 * by the rseq syscall or execve, then nothing to do here.
650 	 *
651 	 * CPU ID and MM CID can only change when going through a context
652 	 * switch.
653 	 *
654 	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
655 	 * only when rseq_event::has_rseq is true. That conditional is
656 	 * required to avoid setting the TIF bit if RSEQ is not registered
657 	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
658 	 * unregistered by a task so it's sufficient to check for the
659 	 * sched_switch bit alone.
660 	 *
661 	 * A sane compiler requires three instructions for the nothing to do
662 	 * case including clearing the events, but your mileage might vary.
663 	 */
664 	if (unlikely((t->rseq.event.sched_switch))) {
665 		rseq_stat_inc(rseq_stats.fastpath);
666 
667 		if (unlikely(!rseq_exit_user_update(regs, t)))
668 			return true;
669 	}
670 	/* Clear state so next entry starts from a clean slate */
671 	t->rseq.event.events = 0;
672 	return false;
673 }
674 
675 /* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
676 #ifdef CONFIG_HAVE_GENERIC_TIF_BITS
test_tif_rseq(unsigned long ti_work)677 static __always_inline bool test_tif_rseq(unsigned long ti_work)
678 {
679 	return ti_work & _TIF_RSEQ;
680 }
681 
clear_tif_rseq(void)682 static __always_inline void clear_tif_rseq(void)
683 {
684 	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
685 	clear_thread_flag(TIF_RSEQ);
686 }
687 #else
test_tif_rseq(unsigned long ti_work)688 static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
clear_tif_rseq(void)689 static __always_inline void clear_tif_rseq(void) { }
690 #endif
691 
692 static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs * regs,unsigned long ti_work)693 rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
694 {
695 	if (unlikely(test_tif_rseq(ti_work))) {
696 		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
697 			current->rseq.event.slowpath = true;
698 			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
699 			return true;
700 		}
701 		clear_tif_rseq();
702 	}
703 	/*
704 	 * Arm the slice extension timer if nothing to do anymore and the
705 	 * task really goes out to user space.
706 	 */
707 	return rseq_arm_slice_extension_timer();
708 }
709 
710 #else /* CONFIG_GENERIC_ENTRY */
/* No generic entry code: There is no inlined fast path, never restart */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
715 #endif /* !CONFIG_GENERIC_ENTRY */
716 
rseq_syscall_exit_to_user_mode(void)717 static __always_inline void rseq_syscall_exit_to_user_mode(void)
718 {
719 	struct rseq_event *ev = &current->rseq.event;
720 
721 	rseq_stat_inc(rseq_stats.exit);
722 
723 	/* Needed to remove the store for the !lockdep case */
724 	if (IS_ENABLED(CONFIG_LOCKDEP)) {
725 		WARN_ON_ONCE(ev->sched_switch);
726 		ev->events = 0;
727 	}
728 }
729 
rseq_irqentry_exit_to_user_mode(void)730 static __always_inline void rseq_irqentry_exit_to_user_mode(void)
731 {
732 	struct rseq_event *ev = &current->rseq.event;
733 
734 	rseq_stat_inc(rseq_stats.exit);
735 
736 	lockdep_assert_once(!ev->sched_switch);
737 
738 	/*
739 	 * Ensure that event (especially user_irq) is cleared when the
740 	 * interrupt did not result in a schedule and therefore the
741 	 * rseq processing could not clear it.
742 	 */
743 	ev->events = 0;
744 }
745 
746 /* Required to keep ARM64 working */
rseq_exit_to_user_mode_legacy(void)747 static __always_inline void rseq_exit_to_user_mode_legacy(void)
748 {
749 	struct rseq_event *ev = &current->rseq.event;
750 
751 	rseq_stat_inc(rseq_stats.exit);
752 
753 	if (static_branch_unlikely(&rseq_debug_enabled))
754 		WARN_ON_ONCE(ev->sched_switch);
755 
756 	/*
757 	 * Ensure that event (especially user_irq) is cleared when the
758 	 * interrupt did not result in a schedule and therefore the
759 	 * rseq processing did not clear it.
760 	 */
761 	ev->events = 0;
762 }
763 
764 void __rseq_debug_syscall_return(struct pt_regs *regs);
765 
rseq_debug_syscall_return(struct pt_regs * regs)766 static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
767 {
768 	if (static_branch_unlikely(&rseq_debug_enabled))
769 		__rseq_debug_syscall_return(regs);
770 }
771 #else /* CONFIG_RSEQ */
/*
 * RSEQ disabled: All entry/exit helpers are empty stubs. The helpers with
 * a return value always return false.
 */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
782 #endif /* !CONFIG_RSEQ */
783 
784 #endif /* _LINUX_RSEQ_ENTRY_H */
785