/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
	unsigned long	s_granted;
	unsigned long	s_expired;
	unsigned long	s_revoked;
	unsigned long	s_yielded;
	unsigned long	s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif
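
/*
 * Usage sketch, matching the call sites further down in this file. The
 * argument is the member of the per CPU rseq_stats structure to
 * increment:
 *
 *	rseq_stat_inc(rseq_stats.ids);
 */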

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x) do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
	return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
	if (!rseq_slice_extension_enabled())
		return false;

	if (likely(!current->rseq.slice.state.granted))
		return false;

	return __rseq_arm_slice_extension_timer();
}

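/*
 * Revoke a granted slice extension in the kernel's bookkeeping and
 * account the revocation (s_revoked). This only clears the kernel side
 * state; the user visible rseq::slice_ctrl word is cleared separately
 * at the call sites within the same user access section.
 */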
static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
		rseq_stat_inc(rseq_stats.s_revoked);
	t->rseq.slice.state.granted = false;
}

static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl usr_ctrl;
	union rseq_slice_state state;
	struct rseq __user *rseq;

	if (!rseq_slice_extension_enabled())
		return false;

	/* If not enabled or not a return from interrupt, nothing to do. */
	state = curr->rseq.slice.state;
	state.enabled &= curr->rseq.event.user_irq;
	if (likely(!state.state))
		return false;

	rseq = curr->rseq.usrptr;
	scoped_user_rw_access(rseq, efault) {

		/*
		 * Quick check for conditions where a grant is not possible
		 * or needs to be revoked.
		 *
		 * 1) Any TIF bit which needs to do extra work aside from
		 *    rescheduling prevents a grant.
		 *
		 * 2) A previous rescheduling request resulted in a slice
		 *    extension grant.
		 */
		if (unlikely(work_pending || state.granted)) {
			/* Clear user control unconditionally. No point in checking it. */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			rseq_slice_clear_grant(curr);
			return false;
		}

		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
		if (likely(!(usr_ctrl.request)))
			return false;

		/* Grant the slice extension */
		usr_ctrl.request = 0;
		usr_ctrl.granted = 1;
		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
	}

	rseq_stat_inc(rseq_stats.s_granted);

	curr->rseq.slice.state.granted = true;
	/* Store expiry time for arming the timer on the way out */
	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
	/*
	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
	 * several ways:
	 *
	 * 1)
	 *	CPU0				CPU1
	 *	clear_tsk()
	 *					set_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()		-> Folds correctly
	 * 2)
	 *	CPU0				CPU1
	 *					set_tsk()
	 *	clear_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()		<- NOOP as TIF_NEED_RESCHED is false
	 *
	 * #1 is not any different from a regular remote reschedule as it
	 * sets the previously unset bit and then raises the IPI, which
	 * folds it into the preempt counter.
	 *
	 * #2 is obviously incorrect from a scheduler POV, but it is no more
	 * incorrect than the code below, which clears the reschedule
	 * request with the safety net of the timer.
	 *
	 * The important part is that the clearing is protected against the
	 * scheduler IPI and also against any other interrupt which might
	 * end up waking up a task and setting the bits in the middle of
	 * the operation:
	 *
	 *	clear_tsk()
	 *	---> Interrupt
	 *		wakeup_on_this_cpu()
	 *		set_tsk()
	 *		set_preempt()
	 *	clear_preempt()
	 *
	 * which would be inconsistent state.
	 */
	scoped_guard(irq) {
		clear_tsk_need_resched(curr);
		clear_preempt_need_resched();
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}

static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
{
	if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
		hrtimer_rearm_deferred_tif(ti_work);
		return true;
	}
	return false;
}

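/*
 * Illustrative usage sketch for the exit to user work loop. The actual
 * call site and the mask of TIF bits which rule out a grant are defined
 * by the (generic) entry code, not here:
 *
 *	if (ti_work & _TIF_NEED_RESCHED) {
 *		if (!rseq_grant_slice_extension(ti_work, ~_TIF_NEED_RESCHED))
 *			schedule();
 *	}
 *
 * @mask selects the pending work bits which prevent a grant, i.e.
 * everything except the reschedule request itself.
 */
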
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

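/*
 * Invoked from the interrupt entry path when the interrupt hit user
 * mode. The recorded event is consumed on exit to user mode to decide
 * whether a critical section check is required and whether a slice
 * extension may be granted. Only meaningful with the generic interrupt
 * entry code (CONFIG_GENERIC_IRQ_ENTRY), hence the IS_ENABLED() check.
 */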
static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true if the section was valid and either the fixup or the clear
 * was done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It is false when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */

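/*
 * For reference, the layout of the user space critical section
 * descriptor (struct rseq_cs in uapi/linux/rseq.h) which both the debug
 * and the fast path variant below operate on:
 *
 *	u32 version;			must be 0
 *	u32 flags;			must be 0
 *	u64 start_ip;
 *	u64 post_commit_offset;
 *	u64 abort_ip;
 *
 * The debug variant reads version and flags as one 64-bit word (@uc_head).
 */
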
#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See the abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * On debug kernels validate that user space did not mess with it if the
 * debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region the CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and the node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what the debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 *
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited write abilities creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address when @csaddr != NULL. This allows putting the ID update and the
 * read under the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out, which spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It is false when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

		/* Open coded, so it's in the same user access region */
		if (rseq_slice_extension_enabled()) {
			/* Unconditionally clear it, no point in conditionals */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	rseq_slice_clear_grant(t);
	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry was
	 * not from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *    - Four 32-bit stores only if CPU ID / MM CID need to be updated
 *    - One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *    - One 64-bit load to retrieve the start IP
 *    - One 64-bit load to retrieve the offset for calculating the end
 *    - One 64-bit load to retrieve the abort IP
 *    - One 32-bit load to retrieve the signature
 *    - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled.
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed, rseq_event::user_irq must be true.
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		scoped_user_rw_access(rseq, efault) {
			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

			/* Open coded, so it's in the same user access region */
			if (rseq_slice_extension_enabled()) {
				/* Unconditionally clear it, no point in conditionals */
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			}
		}

		rseq_slice_clear_grant(t);

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id	= task_cpu(t),
		.mm_cid	= task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
efault:
	return false;
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor got the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing to do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely(t->rseq.event.sched_switch)) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so the next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (unlikely(test_tif_rseq(ti_work))) {
		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
			current->rseq.event.slowpath = true;
			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
			return true;
		}
		clear_tif_rseq();
	}
	/*
	 * Arm the slice extension timer if nothing to do anymore and the
	 * task really goes out to user space.
	 */
	return rseq_arm_slice_extension_timer();
}
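
/*
 * Expected to be invoked from the generic entry code right before the
 * return to user space. Illustrative sketch only; the exact integration
 * is entry code specific:
 *
 *	ti_work = read_thread_flags();
 *	if (rseq_exit_to_user_mode_restart(regs, ti_work))
 *		ti_work = exit_to_user_mode_loop(regs, ti_work);
 *
 * A true return value means new exit work (e.g. TIF_NOTIFY_RESUME) was
 * raised and has to be handled before the actual return to user space.
 */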

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */