1 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
2 #ifndef _LINUX_RSEQ_H
3 #define _LINUX_RSEQ_H
4
5 #ifdef CONFIG_RSEQ
6 #include <linux/sched.h>
7
8 #include <uapi/linux/rseq.h>
9
10 void __rseq_handle_slowpath(struct pt_regs *regs);
11
rseq_v2(struct task_struct * t)12 static __always_inline bool rseq_v2(struct task_struct *t)
13 {
14 return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1);
15 }
16
17 /* Invoked from resume_user_mode_work() */
rseq_handle_slowpath(struct pt_regs * regs)18 static inline void rseq_handle_slowpath(struct pt_regs *regs)
19 {
20 if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
21 if (current->rseq.event.slowpath)
22 __rseq_handle_slowpath(regs);
23 } else {
24 if (current->rseq.event.sched_switch && current->rseq.event.has_rseq)
25 __rseq_handle_slowpath(regs);
26 }
27 }
28
29 void __rseq_signal_deliver(int sig, struct pt_regs *regs);
30
31 /*
32 * Invoked from signal delivery to fixup based on the register context before
33 * switching to the signal delivery context.
34 */
rseq_signal_deliver(struct ksignal * ksig,struct pt_regs * regs)35 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
36 {
37 if (rseq_v2(current)) {
38 /* has_rseq is implied in rseq_v2() */
39 if (current->rseq.event.user_irq)
40 __rseq_signal_deliver(ksig->sig, regs);
41 } else {
42 if (current->rseq.event.has_rseq)
43 __rseq_signal_deliver(ksig->sig, regs);
44 }
45 }
46
rseq_raise_notify_resume(struct task_struct * t)47 static inline void rseq_raise_notify_resume(struct task_struct *t)
48 {
49 set_tsk_thread_flag(t, TIF_RSEQ);
50 }
51
52 /* Invoked from context switch to force evaluation on exit to user */
rseq_sched_switch_event(struct task_struct * t)53 static __always_inline void rseq_sched_switch_event(struct task_struct *t)
54 {
55 struct rseq_event *ev = &t->rseq.event;
56
57 /*
58 * Only apply the user_irq optimization for RSEQ ABI V2 registrations.
59 * Legacy users like TCMalloc rely on the original ABI V1 behaviour
60 * which updates IDs on every context swtich.
61 */
62 if (rseq_v2(t)) {
63 /*
64 * Avoid a boat load of conditionals by using simple logic to
65 * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be
66 * raised.
67 *
68 * It's required when the CPU or MM CID has changed or the entry
69 * was via interrupt from user space. ev->has_rseq does not have
70 * to be evaluated here because rseq_v2() implies has_rseq.
71 */
72 bool raise = ev->user_irq | ev->ids_changed;
73
74 if (raise) {
75 ev->sched_switch = true;
76 rseq_raise_notify_resume(t);
77 }
78 } else {
79 if (ev->has_rseq) {
80 t->rseq.event.ids_changed = true;
81 t->rseq.event.sched_switch = true;
82 rseq_raise_notify_resume(t);
83 }
84 }
85 }
86
87 /*
88 * Invoked from __set_task_cpu() when a task migrates or from
89 * mm_cid_schedin() when the CID changes to enforce an IDs update.
90 *
91 * This does not raise TIF_NOTIFY_RESUME as that happens in
92 * rseq_sched_switch_event().
93 */
rseq_sched_set_ids_changed(struct task_struct * t)94 static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
95 {
96 t->rseq.event.ids_changed = true;
97 }
98
99 /* Enforce a full update after RSEQ registration and when execve() failed */
rseq_force_update(void)100 static inline void rseq_force_update(void)
101 {
102 if (current->rseq.event.has_rseq) {
103 current->rseq.event.ids_changed = true;
104 current->rseq.event.sched_switch = true;
105 rseq_raise_notify_resume(current);
106 }
107 }
108
109 /*
110 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
111 * which clears TIF_NOTIFY_RESUME on architectures that don't use the
112 * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
113 *
114 * To avoid updating user space RSEQ in that case just to do it eventually
115 * again before returning to user space, because __rseq_handle_slowpath()
116 * does nothing when invoked with NULL register state.
117 *
118 * After returning from guest mode, before exiting to userspace, hypervisors
119 * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
120 */
rseq_virt_userspace_exit(void)121 static inline void rseq_virt_userspace_exit(void)
122 {
123 /*
124 * The generic optimization for deferring RSEQ updates until the next
125 * exit relies on having a dedicated TIF_RSEQ.
126 */
127 if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
128 current->rseq.event.sched_switch)
129 rseq_raise_notify_resume(current);
130 }
131
rseq_reset(struct task_struct * t)132 static inline void rseq_reset(struct task_struct *t)
133 {
134 /* Protect against preemption and membarrier IPI */
135 guard(irqsave)();
136 memset(&t->rseq, 0, sizeof(t->rseq));
137 t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
138 }
139
/* execve() hook: drops the RSEQ state of @t by resetting it */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}
144
145 /*
146 * If parent process has a registered restartable sequences area, the
147 * child inherits. Unregister rseq for a clone with CLONE_VM set.
148 *
149 * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
150 * on the COW page on exit to user space, when the child stays on the same
151 * CPU as the parent. That's obviously not guaranteed, but in overcommit
152 * scenarios it is more likely and optimizes for the fork/exec case without
153 * taking the fault.
154 */
rseq_fork(struct task_struct * t,u64 clone_flags)155 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
156 {
157 if (clone_flags & CLONE_VM)
158 rseq_reset(t);
159 else
160 t->rseq = current->rseq;
161 }
162
163 /*
164 * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
165 * registration. This is the active rseq area size rounded up to next
166 * power of 2, which guarantees that the rseq structure will always be
167 * aligned on the nearest power of two large enough to contain it, even
168 * as it grows.
169 */
rseq_alloc_align(void)170 static inline unsigned int rseq_alloc_align(void)
171 {
172 return 1U << get_count_order(offsetof(struct rseq, end));
173 }
174
175 #else /* CONFIG_RSEQ */
rseq_v2(struct task_struct * t)176 static inline bool rseq_v2(struct task_struct *t) { return false; }
rseq_handle_slowpath(struct pt_regs * regs)177 static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
rseq_signal_deliver(struct ksignal * ksig,struct pt_regs * regs)178 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
rseq_sched_switch_event(struct task_struct * t)179 static inline void rseq_sched_switch_event(struct task_struct *t) { }
rseq_sched_set_ids_changed(struct task_struct * t)180 static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
rseq_force_update(void)181 static inline void rseq_force_update(void) { }
rseq_virt_userspace_exit(void)182 static inline void rseq_virt_userspace_exit(void) { }
rseq_fork(struct task_struct * t,u64 clone_flags)183 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
rseq_execve(struct task_struct * t)184 static inline void rseq_execve(struct task_struct *t) { }
185 #endif /* !CONFIG_RSEQ */
186
/*
 * rseq_syscall() is only implemented for CONFIG_DEBUG_RSEQ=y. Without it
 * the call compiles away to nothing.
 */
#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
static inline void rseq_syscall(struct pt_regs *regs)
{
}
#endif /* !CONFIG_DEBUG_RSEQ */
192
193 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
194 void rseq_syscall_enter_work(long syscall);
195 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
196 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
rseq_syscall_enter_work(long syscall)197 static inline void rseq_syscall_enter_work(long syscall) { }
rseq_slice_extension_prctl(unsigned long arg2,unsigned long arg3)198 static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
199 {
200 return -ENOTSUPP;
201 }
202 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
203
204 #endif /* _LINUX_RSEQ_H */
205