1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _LINUX_RSEQ_ENTRY_H
3 #define _LINUX_RSEQ_ENTRY_H
4
5 /* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
6 #ifdef CONFIG_RSEQ_STATS
7 #include <linux/percpu.h>
8
/*
 * Per-CPU RSEQ statistics counters, available when CONFIG_RSEQ_STATS=y.
 * Incremented via rseq_stat_inc() from the exit-to-user and critical
 * section handling paths below.
 */
struct rseq_stats {
	unsigned long	exit;		/* Exit-to-user invocations */
	unsigned long	signal;		/* NOTE(review): not incremented in this view; presumably signal delivery fixups */
	unsigned long	slowpath;	/* NOTE(review): not incremented in this view; presumably slow path handling */
	unsigned long	fastpath;	/* Fast path handling of a pending sched_switch event */
	unsigned long	ids;		/* CPU/node/MM CID updates written to user space */
	unsigned long	cs;		/* Critical section evaluations */
	unsigned long	clear;		/* Critical section address cleared (IP outside section) */
	unsigned long	fixup;		/* Instruction pointer fixups to the abort IP */
	unsigned long	s_granted;	/* Slice extensions granted */
	unsigned long	s_expired;	/* NOTE(review): not incremented in this view; presumably grants which timed out */
	unsigned long	s_revoked;	/* Slice extension grants revoked */
	unsigned long	s_yielded;	/* NOTE(review): not incremented in this view; presumably grants ended by yield */
	unsigned long	s_aborted;	/* NOTE(review): not incremented in this view */
};
24
25 DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
26
27 /*
28 * Slow path has interrupts and preemption enabled, but the fast path
29 * runs with interrupts disabled so there is no point in having the
30 * preemption checks implied in __this_cpu_inc() for every operation.
31 */
32 #ifdef RSEQ_BUILD_SLOW_PATH
33 #define rseq_stat_inc(which) this_cpu_inc((which))
34 #else
35 #define rseq_stat_inc(which) raw_cpu_inc((which))
36 #endif
37
38 #else /* CONFIG_RSEQ_STATS */
39 #define rseq_stat_inc(x) do { } while (0)
40 #endif /* !CONFIG_RSEQ_STATS */
41
42 #ifdef CONFIG_RSEQ
43 #include <linux/jump_label.h>
44 #include <linux/rseq.h>
45 #include <linux/sched/signal.h>
46 #include <linux/uaccess.h>
47
48 #include <linux/tracepoint-defs.h>
49
50 #ifdef CONFIG_TRACEPOINTS
51 DECLARE_TRACEPOINT(rseq_update);
52 DECLARE_TRACEPOINT(rseq_ip_fixup);
53 void __rseq_trace_update(struct task_struct *t);
54 void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
55 unsigned long offset, unsigned long abort_ip);
56
rseq_trace_update(struct task_struct * t,struct rseq_ids * ids)57 static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
58 {
59 if (tracepoint_enabled(rseq_update) && ids)
60 __rseq_trace_update(t);
61 }
62
rseq_trace_ip_fixup(unsigned long ip,unsigned long start_ip,unsigned long offset,unsigned long abort_ip)63 static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
64 unsigned long offset, unsigned long abort_ip)
65 {
66 if (tracepoint_enabled(rseq_ip_fixup))
67 __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
68 }
69
#else /* CONFIG_TRACEPOINTS */
/* Stubs for CONFIG_TRACEPOINTS=n kernels */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */
75
76 DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
77
78 #ifdef RSEQ_BUILD_SLOW_PATH
79 #define rseq_inline
80 #else
81 #define rseq_inline __always_inline
82 #endif
83
84 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
85 DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);
86
/* True when the slice extension mechanism is enabled (static key, default true). */
static __always_inline bool rseq_slice_extension_enabled(void)
{
	return static_branch_likely(&rseq_slice_extension_key);
}
91
92 extern unsigned int rseq_slice_ext_nsecs;
93 bool __rseq_arm_slice_extension_timer(void);
94
rseq_arm_slice_extension_timer(void)95 static __always_inline bool rseq_arm_slice_extension_timer(void)
96 {
97 if (!rseq_slice_extension_enabled())
98 return false;
99
100 if (likely(!current->rseq.slice.state.granted))
101 return false;
102
103 return __rseq_arm_slice_extension_timer();
104 }
105
rseq_slice_clear_grant(struct task_struct * t)106 static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
107 {
108 if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
109 rseq_stat_inc(rseq_stats.s_revoked);
110 t->rseq.slice.state.granted = false;
111 }
112
/*
 * Evaluate a slice extension request from user space on return from an
 * interrupt and grant it when possible by clearing the reschedule
 * request so the task can briefly continue in user space.
 *
 * @work_pending: True when TIF work other than rescheduling is pending,
 *		  which prevents a grant.
 *
 * Returns true when a grant was given and TIF_NEED_RESCHED was cleared,
 * false otherwise. Sends SIGSEGV when the user space rseq memory is
 * inaccessible.
 */
static __always_inline bool rseq_grant_slice_extension(bool work_pending)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl usr_ctrl;
	union rseq_slice_state state;
	struct rseq __user *rseq;

	if (!rseq_slice_extension_enabled())
		return false;

	/* If not enabled or not a return from interrupt, nothing to do. */
	state = curr->rseq.slice.state;
	state.enabled &= curr->rseq.event.user_irq;
	if (likely(!state.state))
		return false;

	rseq = curr->rseq.usrptr;
	scoped_user_rw_access(rseq, efault) {

		/*
		 * Quick check conditions where a grant is not possible or
		 * needs to be revoked.
		 *
		 * 1) Any TIF bit which needs to do extra work aside of
		 *    rescheduling prevents a grant.
		 *
		 * 2) A previous rescheduling request resulted in a slice
		 *    extension grant.
		 */
		if (unlikely(work_pending || state.granted)) {
			/* Clear user control unconditionally. No point in checking it. */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			rseq_slice_clear_grant(curr);
			return false;
		}

		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
		if (likely(!(usr_ctrl.request)))
			return false;

		/* Grant the slice extension */
		usr_ctrl.request = 0;
		usr_ctrl.granted = 1;
		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
	}

	rseq_stat_inc(rseq_stats.s_granted);

	curr->rseq.slice.state.granted = true;
	/* Store expiry time for arming the timer on the way out */
	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
	/*
	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
	 * several ways:
	 *
	 * 1)
	 *	CPU0				CPU1
	 *	clear_tsk()
	 *					set_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()	-> Folds correctly
	 * 2)
	 *	CPU0				CPU1
	 *					set_tsk()
	 *	clear_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()	<- NOOP as TIF_NEED_RESCHED is false
	 *
	 * #1 is not any different from a regular remote reschedule as it
	 * sets the previously not set bit and then raises the IPI which
	 * folds it into the preempt counter
	 *
	 * #2 is obviously incorrect from a scheduler POV, but it's not
	 * differently incorrect than the code below clearing the
	 * reschedule request with the safety net of the timer.
	 *
	 * The important part is that the clearing is protected against the
	 * scheduler IPI and also against any other interrupt which might
	 * end up waking up a task and setting the bits in the middle of
	 * the operation:
	 *
	 *	clear_tsk()
	 *	---> Interrupt
	 *		wakeup_on_this_cpu()
	 *			set_tsk()
	 *			set_preempt()
	 *	clear_preempt()
	 *
	 * which would be inconsistent state.
	 */
	scoped_guard(irq) {
		clear_tsk_need_resched(curr);
		clear_preempt_need_resched();
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}
217
218 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
/* Stubs for CONFIG_RSEQ_SLICE_EXTENSION=n kernels */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
223 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
224
225 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
226 bool rseq_debug_validate_ids(struct task_struct *t);
227
rseq_note_user_irq_entry(void)228 static __always_inline void rseq_note_user_irq_entry(void)
229 {
230 if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
231 current->rseq.event.user_irq = true;
232 }
233
234 /*
235 * Check whether there is a valid critical section and whether the
236 * instruction pointer in @regs is inside the critical section.
237 *
238 * - If the critical section is invalid, terminate the task.
239 *
240 * - If valid and the instruction pointer is inside, set it to the abort IP.
241 *
242 * - If valid and the instruction pointer is outside, clear the critical
243 * section address.
244 *
245 * Returns true, if the section was valid and either fixup or clear was
246 * done, false otherwise.
247 *
 * In the failure case task::rseq_event::fatal is set when an invalid
249 * section was found. It's clear when the failure was an unresolved page
250 * fault.
251 *
252 * If inlined into the exit to user path with interrupts disabled, the
253 * caller has to protect against page faults with pagefault_disable().
254 *
255 * In preemptible task context this would be counterproductive as the page
256 * faults could not be fully resolved. As a consequence unresolved page
257 * faults in task context are fatal too.
258 */
259
260 #ifdef RSEQ_BUILD_SLOW_PATH
261 /*
262 * The debug version is put out of line, but kept here so the code stays
263 * together.
264 *
265 * @csaddr has already been checked by the caller to be in user space
266 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	/* The version/flags head is the first 64 bits of struct rseq_cs */
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid": in user space and with room for the signature */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		/* Invalidate the critical section and move the IP to the abort target */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		/* abort_ip == 0 suppresses the fixup tracepoint below */
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}
345
346 /*
347 * On debug kernels validate that user space did not mess with it if the
348 * debug branch is enabled.
349 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	/* Compare all user space visible IDs against the kernel cached values */
	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	/* User space modified the ID fields behind the kernel's back */
	t->rseq.event.fatal = true;
efault:
	return false;
}
382
383 #endif /* RSEQ_BUILD_SLOW_PATH */
384
385 /*
386 * This only ensures that abort_ip is in the user address space and
387 * validates that it is preceded by the signature.
388 *
389 * No other sanity checks are done here, that's what the debug code is for.
390 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	/* A descriptor address in kernel space is fatal */
	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	/* Debug mode takes the fully checked variant instead */
	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited abilities to write, creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		/* abort_ip == 0 suppresses the fixup tracepoint below */
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}
468
469 /*
470 * Updates CPU ID, Node ID and MM CID and reads the critical section
471 * address, when @csaddr != NULL. This allows to put the ID update and the
472 * read under the same uaccess region to spare a separate begin/end.
473 *
474 * As this is either invoked from a C wrapper with @csaddr = NULL or from
475 * the fast path code with a valid pointer, a clever compiler should be
476 * able to optimize the read out. Spares a duplicate implementation.
477 *
478 * Returns true, if the operation was successful, false otherwise.
479 *
480 * In the failure case task::rseq_event::fatal is set when invalid data
481 * was found on debug kernels. It's clear when the failure was an unresolved page
482 * fault.
483 *
484 * If inlined into the exit to user path with interrupts disabled, the
485 * caller has to protect against page faults with pagefault_disable().
486 *
487 * In preemptible task context this would be counterproductive as the page
488 * faults could not be fully resolved. As a consequence unresolved page
489 * faults in task context are fatal too.
490 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	/* On debug kernels verify user space did not modify the ID fields */
	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

		/* Open coded, so it's in the same user access region */
		if (rseq_slice_extension_enabled()) {
			/* Unconditionally clear it, no point in conditionals */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	rseq_slice_clear_grant(t);
	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}
526
527 /*
528 * Update user space with new IDs and conditionally check whether the task
529 * is in a critical section.
530 */
rseq_update_usr(struct task_struct * t,struct pt_regs * regs,struct rseq_ids * ids,u32 node_id)531 static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
532 struct rseq_ids *ids, u32 node_id)
533 {
534 u64 csaddr;
535
536 if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
537 return false;
538
539 /*
540 * On architectures which utilize the generic entry code this
541 * allows to skip the critical section when the entry was not from
542 * a user space interrupt, unless debug mode is enabled.
543 */
544 if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
545 if (!static_branch_unlikely(&rseq_debug_enabled)) {
546 if (likely(!t->rseq.event.user_irq))
547 return true;
548 }
549 }
550 if (likely(!csaddr))
551 return true;
552 /* Sigh, this really needs to do work */
553 return rseq_update_user_cs(t, regs, csaddr);
554 }
555
556 /*
557 * If you want to use this then convert your architecture to the generic
558 * entry code. I'm tired of building workarounds for people who can't be
559 * bothered to make the maintenance of generic infrastructure less
560 * burdensome. Just sucking everything into the architecture code and
561 * thereby making others chase the horrible hacks and keep them working is
562 * neither acceptable nor sustainable.
563 */
564 #ifdef CONFIG_GENERIC_ENTRY
565
566 /*
567 * This is inlined into the exit path because:
568 *
569 * 1) It's a one time comparison in the fast path when there is no event to
570 * handle
571 *
572 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
573 * so the straight inline operation is:
574 *
575 * - Four 32-bit stores only if CPU ID/ MM CID need to be updated
576 * - One 64-bit load to retrieve the critical section address
577 *
578 * 3) In the unlikely case that the critical section address is != NULL:
579 *
580 * - One 64-bit load to retrieve the start IP
581 * - One 64-bit load to retrieve the offset for calculating the end
582 * - One 64-bit load to retrieve the abort IP
583 * - One 64-bit load to retrieve the signature
584 * - One store to clear the critical section address
585 *
586 * The non-debug case implements only the minimal required checking. It
587 * provides protection against a rogue abort IP in kernel space, which
588 * would be exploitable at least on x86, and also against a rogue CS
589 * descriptor by checking the signature at the abort IP. Any fallout from
590 * invalid critical section descriptors is a user space problem. The debug
591 * case provides the full set of checks and terminates the task if a
592 * condition is not met.
593 *
594 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
595 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
596 * slow path there will handle the failure.
597 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled
	 */
	guard(pagefault)();
	/* Common case: only the critical section needs to be checked */
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed rseq_event::user_irq must be true
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		scoped_user_rw_access(rseq, efault) {
			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

			/* Open coded, so it's in the same user access region */
			if (rseq_slice_extension_enabled()) {
				/* Unconditionally clear it, no point in conditionals */
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			}
		}

		rseq_slice_clear_grant(t);

		/* Debug mode always evaluates the (possibly zero) csaddr */
		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	/* IDs changed: write the fresh values and handle the critical section */
	struct rseq_ids ids = {
		.cpu_id = task_cpu(t),
		.mm_cid = task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
efault:
	return false;
}
642
/*
 * Fast path rseq handling on exit to user space. Returns true when the
 * update failed and the caller has to take the slow path, false when
 * everything was handled (including the nothing-to-do case).
 */
static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task did not go through schedule or got the flag enforced
	 * by the rseq syscall or execve, then nothing to do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing to do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely((t->rseq.event.sched_switch))) {
		rseq_stat_inc(rseq_stats.fastpath);

		/* On failure keep the events intact for the slow path */
		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}
674
675 /* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
676 #ifdef CONFIG_HAVE_GENERIC_TIF_BITS
test_tif_rseq(unsigned long ti_work)677 static __always_inline bool test_tif_rseq(unsigned long ti_work)
678 {
679 return ti_work & _TIF_RSEQ;
680 }
681
static __always_inline void clear_tif_rseq(void)
{
	/*
	 * TIF_RSEQ must be distinct from TIF_NOTIFY_RESUME because the
	 * failure path sets TIF_NOTIFY_RESUME to force the slow path.
	 */
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
687 #else
/* Without generic TIF bits the check is unconditional and there is nothing to clear */
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
690 #endif
691
/*
 * Handle pending rseq work on exit to user space. Returns true when the
 * caller has to loop back into exit_to_user_mode_loop(), where the rseq
 * slow path handles the failure.
 */
static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (unlikely(test_tif_rseq(ti_work))) {
		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
			/* Fast path failed. Force the slow path via NOTIFY_RESUME */
			current->rseq.event.slowpath = true;
			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
			return true;
		}
		clear_tif_rseq();
	}
	/*
	 * Arm the slice extension timer if nothing to do anymore and the
	 * task really goes out to user space.
	 */
	return rseq_arm_slice_extension_timer();
}
709
710 #else /* CONFIG_GENERIC_ENTRY */
/* No fast path without the generic entry code: always report nothing to do */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
715 #endif /* !CONFIG_GENERIC_ENTRY */
716
rseq_syscall_exit_to_user_mode(void)717 static __always_inline void rseq_syscall_exit_to_user_mode(void)
718 {
719 struct rseq_event *ev = ¤t->rseq.event;
720
721 rseq_stat_inc(rseq_stats.exit);
722
723 /* Needed to remove the store for the !lockdep case */
724 if (IS_ENABLED(CONFIG_LOCKDEP)) {
725 WARN_ON_ONCE(ev->sched_switch);
726 ev->events = 0;
727 }
728 }
729
rseq_irqentry_exit_to_user_mode(void)730 static __always_inline void rseq_irqentry_exit_to_user_mode(void)
731 {
732 struct rseq_event *ev = ¤t->rseq.event;
733
734 rseq_stat_inc(rseq_stats.exit);
735
736 lockdep_assert_once(!ev->sched_switch);
737
738 /*
739 * Ensure that event (especially user_irq) is cleared when the
740 * interrupt did not result in a schedule and therefore the
741 * rseq processing could not clear it.
742 */
743 ev->events = 0;
744 }
745
746 /* Required to keep ARM64 working */
rseq_exit_to_user_mode_legacy(void)747 static __always_inline void rseq_exit_to_user_mode_legacy(void)
748 {
749 struct rseq_event *ev = ¤t->rseq.event;
750
751 rseq_stat_inc(rseq_stats.exit);
752
753 if (static_branch_unlikely(&rseq_debug_enabled))
754 WARN_ON_ONCE(ev->sched_switch);
755
756 /*
757 * Ensure that event (especially user_irq) is cleared when the
758 * interrupt did not result in a schedule and therefore the
759 * rseq processing did not clear it.
760 */
761 ev->events = 0;
762 }
763
764 void __rseq_debug_syscall_return(struct pt_regs *regs);
765
/* Run the out-of-line debug checks on syscall return when debug mode is enabled */
static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
771 #else /* CONFIG_RSEQ */
/* Stubs for CONFIG_RSEQ=n kernels */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
782 #endif /* !CONFIG_RSEQ */
783
784 #endif /* _LINUX_RSEQ_ENTRY_H */
785