/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
        unsigned long   exit;
        unsigned long   signal;
        unsigned long   slowpath;
        unsigned long   fastpath;
        unsigned long   ids;
        unsigned long   cs;
        unsigned long   clear;
        unsigned long   fixup;
        unsigned long   s_granted;
        unsigned long   s_expired;
        unsigned long   s_revoked;
        unsigned long   s_yielded;
        unsigned long   s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)    this_cpu_inc((which))
#else
#define rseq_stat_inc(which)    raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)        do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
                           unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
        if (tracepoint_enabled(rseq_update) && ids)
                __rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
                                       unsigned long offset, unsigned long abort_ip)
{
        if (tracepoint_enabled(rseq_ip_fixup))
                __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
                                       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline     __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
        return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
        if (!rseq_slice_extension_enabled())
                return false;

        if (likely(!current->rseq.slice.state.granted))
                return false;

        return __rseq_arm_slice_extension_timer();
}

static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
        if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
                rseq_stat_inc(rseq_stats.s_revoked);
        t->rseq.slice.state.granted = false;
}

/*
 * Open coded, so it can be invoked within a user access region.
 *
 * This clears the user space state of the time slice extension field only
 * when the task has registered the optimized RSEQ_ABI_V2. Some legacy
 * registrations, e.g. TCMalloc, have conflicting non-ABI fields in struct
 * rseq, which would be overwritten by an unconditional write.
 */
#define rseq_slice_clear_user(rseq, efault)                                     \
        do {                                                                    \
                if (rseq_slice_extension_enabled())                             \
                        unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);     \
        } while (0)

static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
        struct task_struct *curr = current;
        struct rseq_slice_ctrl usr_ctrl;
        union rseq_slice_state state;
        struct rseq __user *rseq;

        if (!rseq_slice_extension_enabled())
                return false;

        /* If not enabled or not a return from interrupt, nothing to do. */
        state = curr->rseq.slice.state;
        state.enabled &= curr->rseq.event.user_irq;
        if (likely(!state.state))
                return false;

        rseq = curr->rseq.usrptr;
        scoped_user_rw_access(rseq, efault) {

                /*
                 * Quick check for conditions where a grant is not possible
                 * or needs to be revoked:
                 *
                 * 1) Any TIF bit which needs to do extra work aside of
                 *    rescheduling prevents a grant.
                 *
                 * 2) A previous rescheduling request resulted in a slice
                 *    extension grant.
                 */
                if (unlikely(work_pending || state.granted)) {
                        /* Clear user control unconditionally. No point in checking. */
                        unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
                        rseq_slice_clear_grant(curr);
                        return false;
                }

                unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
                if (likely(!usr_ctrl.request))
                        return false;

                /* Grant the slice extension */
                usr_ctrl.request = 0;
                usr_ctrl.granted = 1;
                unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
        }

        rseq_stat_inc(rseq_stats.s_granted);

        curr->rseq.slice.state.granted = true;
        /* Store the expiry time for arming the timer on the way out */
        curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
        /*
         * This is racy against a remote CPU setting TIF_NEED_RESCHED in
         * several ways:
         *
         * 1)
         *      CPU0                            CPU1
         *      clear_tsk()
         *                                      set_tsk()
         *      clear_preempt()
         *                                      Raise scheduler IPI on CPU0
         *      --> IPI
         *          fold_need_resched()         -> Folds correctly
         * 2)
         *      CPU0                            CPU1
         *                                      set_tsk()
         *      clear_tsk()
         *      clear_preempt()
         *                                      Raise scheduler IPI on CPU0
         *      --> IPI
         *          fold_need_resched()         <- NOOP as TIF_NEED_RESCHED is false
         *
         * #1 is not any different from a regular remote reschedule as it
         * sets the previously unset bit and then raises the IPI, which
         * folds it into the preempt counter.
         *
         * #2 is obviously incorrect from a scheduler POV, but it is no more
         * incorrect than the code below clearing the reschedule request
         * with the safety net of the timer.
         *
         * The important part is that the clearing is protected against the
         * scheduler IPI and also against any other interrupt which might
         * end up waking up a task and setting the bits in the middle of
         * the operation:
         *
         *      clear_tsk()
         *      ---> Interrupt
         *              wakeup_on_this_cpu()
         *              set_tsk()
         *              set_preempt()
         *      clear_preempt()
         *
         * which would result in inconsistent state.
         */
        scoped_guard(irq) {
                clear_tsk_need_resched(curr);
                clear_preempt_need_resched();
        }
        return true;

efault:
        force_sig(SIGSEGV);
        return false;
}

static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
{
        if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
                hrtimer_rearm_deferred_tif(ti_work);
                return true;
        }
        return false;
}
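
/*
 * Illustrative summary, derived from the code above and not an ABI
 * specification: user space requests an extension by setting
 * rseq::slice_ctrl.request before entering its critical region. When an
 * interrupt hits user space and ends with a reschedule request, and no other
 * exit work is pending, __rseq_grant_slice_extension() converts the request
 * into a grant (slice_ctrl.granted), clears TIF_NEED_RESCHED and defers the
 * preemption, bounded by the rseq_slice_ext_nsecs expiry armed via
 * rseq_arm_slice_extension_timer() on the way out. A pending grant is
 * revoked on the next ID update or exit-work round via
 * rseq_slice_clear_grant()/rseq_slice_clear_user().
 */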

#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#define rseq_slice_clear_user(rseq, efault)     do { } while (0)
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);

static __always_inline void rseq_note_user_irq_entry(void)
{
        if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
                current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true if the section was valid and either a fixup or a clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It stays false when the failure was an unresolved
 * page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence, unresolved page
 * faults in task context are fatal too.
 */
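
/*
 * For reference, a user space critical section descriptor as defined by the
 * rseq UAPI (see include/uapi/linux/rseq.h) roughly looks like this:
 *
 *      struct rseq_cs {
 *              __u32 version;              // must be 0
 *              __u32 flags;                // must be 0
 *              __u64 start_ip;             // first instruction of the section
 *              __u64 post_commit_offset;   // section length in bytes
 *              __u64 abort_ip;             // abort handler, preceded by the
 *                                          // 4-byte signature from registration
 *      };
 *
 * TLS::rseq::rseq_cs holds the address of such a descriptor while user space
 * executes the section. The IP is considered inside the section when
 * ip - start_ip < post_commit_offset.
 */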

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
                               unsigned long csaddr)
{
        struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
        u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
        unsigned long ip = instruction_pointer(regs);
        u64 __user *uc_head = (u64 __user *)ucs;
        u32 usig, __user *uc_sig;

        scoped_user_rw_access(ucs, efault) {
                /*
                 * Evaluate the user supplied data and exit if one of the
                 * conditions is not fulfilled.
                 */
                unsafe_get_user(start_ip, &ucs->start_ip, efault);
                if (unlikely(start_ip >= tasksize))
                        goto die;
                /* If outside, just clear the critical section. */
                if (ip < start_ip)
                        goto clear;

                unsafe_get_user(offset, &ucs->post_commit_offset, efault);
                cs_end = start_ip + offset;
                /* Check for overflow and wraparound */
                if (unlikely(cs_end >= tasksize || cs_end < start_ip))
                        goto die;

                /* If not inside, clear it. */
                if (ip >= cs_end)
                        goto clear;

                unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
                /* Ensure it's "valid" */
                if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
                        goto die;
                /* Validate that the abort IP is not in the critical section */
                if (unlikely(abort_ip - start_ip < offset))
                        goto die;

                /*
                 * Check version and flags for 0. No point in emitting
                 * deprecation warnings before dying. That could be done in
                 * the slow path eventually, but *shrug*.
                 */
                unsafe_get_user(head, uc_head, efault);
                if (unlikely(head))
                        goto die;

                /* abort_ip - 4 is >= 0. See the abort_ip check above. */
                uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
                unsafe_get_user(usig, uc_sig, efault);
                if (unlikely(usig != t->rseq.sig))
                        goto die;

                /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
                if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
                        /* If not in an interrupt from user context, let it die */
                        if (unlikely(!t->rseq.event.user_irq))
                                goto die;
                }
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                instruction_pointer_set(regs, (unsigned long)abort_ip);
                rseq_stat_inc(rseq_stats.fixup);
                break;
        clear:
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                rseq_stat_inc(rseq_stats.clear);
                abort_ip = 0ULL;
        }

        if (unlikely(abort_ip))
                rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
        return true;
die:
        t->rseq.event.fatal = true;
efault:
        return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here; that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
        struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
        unsigned long ip = instruction_pointer(regs);
        unsigned long tasksize = TASK_SIZE;
        u64 start_ip, abort_ip, offset;
        u32 usig, __user *uc_sig;

        rseq_stat_inc(rseq_stats.cs);

        if (unlikely(csaddr >= tasksize)) {
                t->rseq.event.fatal = true;
                return false;
        }

        if (static_branch_unlikely(&rseq_debug_enabled))
                return rseq_debug_update_user_cs(t, regs, csaddr);

        scoped_user_rw_access(ucs, efault) {
                unsafe_get_user(start_ip, &ucs->start_ip, efault);
                unsafe_get_user(offset, &ucs->post_commit_offset, efault);
                unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

                /*
                 * No sanity checks. If user space screwed it up, it can
                 * keep the pieces. That's what the debug code is for.
                 *
                 * If outside, just clear the critical section.
                 */
                if (ip - start_ip >= offset)
                        goto clear;

                /*
                 * Two requirements for @abort_ip:
                 *
                 *  - It must be in user space, as x86 IRET would happily
                 *    return to the kernel.
                 *
                 *  - The four bytes preceding the instruction at @abort_ip
                 *    must contain the signature.
                 *
                 * The latter protects against the following attack vector:
                 *
                 * An attacker with limited abilities to write creates a
                 * critical section descriptor, sets the abort IP to a
                 * library function or some other ROP gadget and stores the
                 * address of the descriptor in TLS::rseq::rseq_cs. An RSEQ
                 * abort would then evade ROP protection.
                 */
                if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
                        goto die;

                /* The address is guaranteed to be >= 0 and < TASK_SIZE */
                uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
                unsafe_get_user(usig, uc_sig, efault);
                if (unlikely(usig != t->rseq.sig))
                        goto die;

                /* Invalidate the critical section */
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                /* Update the instruction pointer */
                instruction_pointer_set(regs, (unsigned long)abort_ip);
                rseq_stat_inc(rseq_stats.fixup);
                break;
        clear:
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                rseq_stat_inc(rseq_stats.clear);
                abort_ip = 0ULL;
        }

        if (unlikely(abort_ip))
                rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
        return true;
die:
        t->rseq.event.fatal = true;
efault:
        return false;
}

/*
 * Updates the CPU ID, the node ID and the MM CID, and reads the critical
 * section address when @csaddr != NULL. This allows the ID update and the
 * read to share the same uaccess region, sparing a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out. Spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data was
 * found on debug kernels. It stays false when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence, unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr)
{
        struct rseq __user *rseq = t->rseq.usrptr;

        scoped_user_rw_access(rseq, efault) {
                /* Validate the R/O fields for debug and optimized mode */
                if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) {
                        u32 cpu_id, uval;

                        unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
                        if (cpu_id != t->rseq.ids.cpu_id)
                                goto die;
                        unsafe_get_user(uval, &rseq->cpu_id, efault);
                        if (uval != cpu_id)
                                goto die;
                        unsafe_get_user(uval, &rseq->node_id, efault);
                        if (uval != t->rseq.ids.node_id)
                                goto die;
                        unsafe_get_user(uval, &rseq->mm_cid, efault);
                        if (uval != t->rseq.ids.mm_cid)
                                goto die;
                }

                unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
                unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
                unsafe_put_user(ids->node_id, &rseq->node_id, efault);
                unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
                if (csaddr)
                        unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

                /* RSEQ ABI V2 only operations */
                if (rseq_v2(t))
                        rseq_slice_clear_user(rseq, efault);
        }

        rseq_slice_clear_grant(t);
        /* Cache the new values */
        t->rseq.ids = *ids;
        rseq_stat_inc(rseq_stats.ids);
        rseq_trace_update(t, ids);
        return true;

die:
        t->rseq.event.fatal = true;
efault:
        return false;
}
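
/*
 * For orientation, a simplified sketch of the user visible fields updated
 * or read above, as laid out in the UAPI struct rseq
 * (include/uapi/linux/rseq.h). The slice_ctrl member is an assumption of
 * the extended ABI v2 registration used by the slice extension code and is
 * not part of the original UAPI layout:
 *
 *      struct rseq {
 *              __u32 cpu_id_start;     // current CPU, always a valid index
 *              __u32 cpu_id;           // current CPU or "uninitialized"
 *              __u64 rseq_cs;          // address of struct rseq_cs, or 0
 *              __u32 flags;
 *              __u32 node_id;          // NUMA node of the current CPU
 *              __u32 mm_cid;           // concurrency ID within the MM
 *              ...                     // slice_ctrl etc. (ABI v2 only)
 *      };
 */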

/*
 * Update user space with the new IDs and conditionally check whether the
 * task is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
                                        struct rseq_ids *ids)
{
        u64 csaddr;

        if (!rseq_set_ids_get_csaddr(t, ids, &csaddr))
                return false;

        /*
         * On architectures which utilize the generic entry code this
         * allows skipping the critical section handling when the entry
         * was not from a user space interrupt, unless debug mode is
         * enabled.
         */
        if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
                if (!static_branch_unlikely(&rseq_debug_enabled)) {
                        if (likely(!t->rseq.event.user_irq))
                                return true;
                }
        }
        if (likely(!csaddr))
                return true;
        /* Sigh, this really needs to do work */
        return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *     - Four 32-bit stores only if CPU ID / MM CID need to be updated
 *     - One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *     - One 64-bit load to retrieve the start IP
 *     - One 64-bit load to retrieve the offset for calculating the end
 *     - One 64-bit load to retrieve the abort IP
 *     - One 64-bit load to retrieve the signature
 *     - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
        /*
         * Page faults need to be disabled as this is called with
         * interrupts disabled.
         */
        guard(pagefault)();
        /*
         * This optimization is only valid when the task registered for the
         * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the
         * original RSEQ implementation behaviour, which unconditionally
         * updated the IDs. rseq_sched_switch_event() ensures that legacy
         * registrations always have both sched_switch and ids_changed set,
         * which is compatible with the historical TIF_NOTIFY_RESUME
         * behaviour.
         */
        if (likely(!t->rseq.event.ids_changed)) {
                struct rseq __user *rseq = t->rseq.usrptr;
                /*
                 * If the IDs have not changed, rseq_event::user_irq must be
                 * true. See rseq_sched_switch_event().
                 */
                u64 csaddr;

                scoped_user_rw_access(rseq, efault) {
                        unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

                        /* RSEQ ABI V2 only operations */
                        if (rseq_v2(t))
                                rseq_slice_clear_user(rseq, efault);
                }

                rseq_slice_clear_grant(t);

                if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
                        if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
                                return false;
                }
                return true;
        }

        struct rseq_ids ids = {
                .cpu_id         = task_cpu(t),
                .mm_cid         = task_mm_cid(t),
                .node_id        = cpu_to_node(ids.cpu_id),
        };

        return rseq_update_usr(t, regs, &ids);
efault:
        return false;
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
        struct task_struct *t = current;

        /*
         * If the task neither went through schedule nor had the flag
         * enforced by the rseq syscall or execve, then there is nothing to
         * do here.
         *
         * CPU ID and MM CID can only change when going through a context
         * switch.
         *
         * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
         * only when rseq_event::has_rseq is true. That conditional is
         * required to avoid setting the TIF bit if RSEQ is not registered
         * for a task. rseq_event::sched_switch is cleared when RSEQ is
         * unregistered by a task, so it's sufficient to check for the
         * sched_switch bit alone.
         *
         * A sane compiler requires three instructions for the nothing to do
         * case including clearing the events, but your mileage might vary.
         */
        if (unlikely(t->rseq.event.sched_switch)) {
                rseq_stat_inc(rseq_stats.fastpath);

                if (unlikely(!rseq_exit_user_update(regs, t)))
                        return true;
        }
        /* Clear the state so the next entry starts from a clean slate */
        t->rseq.event.events = 0;
        return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
        return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
        static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
        clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        if (unlikely(test_tif_rseq(ti_work))) {
                if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
                        current->rseq.event.slowpath = true;
                        set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
                        return true;
                }
                clear_tif_rseq();
        }
        /*
         * Arm the slice extension timer if there is nothing left to do and
         * the task really goes out to user space.
         */
        return rseq_arm_slice_extension_timer();
}
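
/*
 * Illustrative only, not the authoritative generic entry code: the exit to
 * user mode work loop is expected to invoke this after the regular TIF work
 * has been handled, roughly along the lines of:
 *
 *      ti_work = read_thread_flags();
 *      if (rseq_exit_to_user_mode_restart(regs, ti_work))
 *              goto work_loop;
 *
 * A true return value tells the caller to go around the exit work loop once
 * more, e.g. so the rseq slow path can handle a failed fixup via
 * TIF_NOTIFY_RESUME.
 */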

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
        struct rseq_event *ev = &current->rseq.event;

        rseq_stat_inc(rseq_stats.exit);

        /* Needed to remove the store for the !lockdep case */
        if (IS_ENABLED(CONFIG_LOCKDEP)) {
                WARN_ON_ONCE(ev->sched_switch);
                ev->events = 0;
        }
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
        struct rseq_event *ev = &current->rseq.event;

        rseq_stat_inc(rseq_stats.exit);

        lockdep_assert_once(!ev->sched_switch);

        /*
         * Ensure that the event state (especially user_irq) is cleared when
         * the interrupt did not result in a schedule and therefore the
         * rseq processing could not clear it.
         */
        ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
        struct rseq_event *ev = &current->rseq.event;

        rseq_stat_inc(rseq_stats.exit);

        if (static_branch_unlikely(&rseq_debug_enabled))
                WARN_ON_ONCE(ev->sched_switch);

        /*
         * Ensure that the event state (especially user_irq) is cleared when
         * the interrupt did not result in a schedule and therefore the
         * rseq processing did not clear it.
         */
        ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
        if (static_branch_unlikely(&rseq_debug_enabled))
                __rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */