xref: /linux/include/linux/rseq.h (revision 7f0023215262221ca08d56be2203e8a4770be033)
1 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
2 #ifndef _LINUX_RSEQ_H
3 #define _LINUX_RSEQ_H
4 
5 #ifdef CONFIG_RSEQ
6 #include <linux/sched.h>
7 
8 #include <uapi/linux/rseq.h>
9 
10 void __rseq_handle_slowpath(struct pt_regs *regs);
11 
rseq_v2(struct task_struct * t)12 static __always_inline bool rseq_v2(struct task_struct *t)
13 {
14 	return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1);
15 }
16 
17 /* Invoked from resume_user_mode_work() */
rseq_handle_slowpath(struct pt_regs * regs)18 static inline void rseq_handle_slowpath(struct pt_regs *regs)
19 {
20 	if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
21 		if (current->rseq.event.slowpath)
22 			__rseq_handle_slowpath(regs);
23 	} else {
24 		if (current->rseq.event.sched_switch && current->rseq.event.has_rseq)
25 			__rseq_handle_slowpath(regs);
26 	}
27 }
28 
29 void __rseq_signal_deliver(int sig, struct pt_regs *regs);
30 
31 /*
32  * Invoked from signal delivery to fixup based on the register context before
33  * switching to the signal delivery context.
34  */
rseq_signal_deliver(struct ksignal * ksig,struct pt_regs * regs)35 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
36 {
37 	if (rseq_v2(current)) {
38 		/* has_rseq is implied in rseq_v2() */
39 		if (current->rseq.event.user_irq)
40 			__rseq_signal_deliver(ksig->sig, regs);
41 	} else {
42 		if (current->rseq.event.has_rseq)
43 			__rseq_signal_deliver(ksig->sig, regs);
44 	}
45 }
46 
rseq_raise_notify_resume(struct task_struct * t)47 static inline void rseq_raise_notify_resume(struct task_struct *t)
48 {
49 	set_tsk_thread_flag(t, TIF_RSEQ);
50 }
51 
52 /* Invoked from context switch to force evaluation on exit to user */
rseq_sched_switch_event(struct task_struct * t)53 static __always_inline void rseq_sched_switch_event(struct task_struct *t)
54 {
55 	struct rseq_event *ev = &t->rseq.event;
56 
57 	/*
58 	 * Only apply the user_irq optimization for RSEQ ABI V2 registrations.
59 	 * Legacy users like TCMalloc rely on the original ABI V1 behaviour
60 	 * which updates IDs on every context swtich.
61 	 */
62 	if (rseq_v2(t)) {
63 		/*
64 		 * Avoid a boat load of conditionals by using simple logic to
65 		 * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be
66 		 * raised.
67 		 *
68 		 * It's required when the CPU or MM CID has changed or the entry
69 		 * was via interrupt from user space. ev->has_rseq does not have
70 		 * to be evaluated here because rseq_v2() implies has_rseq.
71 		 */
72 		bool raise = ev->user_irq | ev->ids_changed;
73 
74 		if (raise) {
75 			ev->sched_switch = true;
76 			rseq_raise_notify_resume(t);
77 		}
78 	} else {
79 		if (ev->has_rseq) {
80 			t->rseq.event.ids_changed = true;
81 			t->rseq.event.sched_switch = true;
82 			rseq_raise_notify_resume(t);
83 		}
84 	}
85 }
86 
87 /*
88  * Invoked from __set_task_cpu() when a task migrates or from
89  * mm_cid_schedin() when the CID changes to enforce an IDs update.
90  *
91  * This does not raise TIF_NOTIFY_RESUME as that happens in
92  * rseq_sched_switch_event().
93  */
rseq_sched_set_ids_changed(struct task_struct * t)94 static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
95 {
96 	t->rseq.event.ids_changed = true;
97 }
98 
99 /* Enforce a full update after RSEQ registration and when execve() failed */
rseq_force_update(void)100 static inline void rseq_force_update(void)
101 {
102 	if (current->rseq.event.has_rseq) {
103 		current->rseq.event.ids_changed = true;
104 		current->rseq.event.sched_switch = true;
105 		rseq_raise_notify_resume(current);
106 	}
107 }
108 
109 /*
110  * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
111  * which clears TIF_NOTIFY_RESUME on architectures that don't use the
112  * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
113  *
114  * To avoid updating user space RSEQ in that case just to do it eventually
115  * again before returning to user space, because __rseq_handle_slowpath()
116  * does nothing when invoked with NULL register state.
117  *
118  * After returning from guest mode, before exiting to userspace, hypervisors
119  * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
120  */
rseq_virt_userspace_exit(void)121 static inline void rseq_virt_userspace_exit(void)
122 {
123 	/*
124 	 * The generic optimization for deferring RSEQ updates until the next
125 	 * exit relies on having a dedicated TIF_RSEQ.
126 	 */
127 	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
128 	    current->rseq.event.sched_switch)
129 		rseq_raise_notify_resume(current);
130 }
131 
rseq_reset(struct task_struct * t)132 static inline void rseq_reset(struct task_struct *t)
133 {
134 	/* Protect against preemption and membarrier IPI */
135 	guard(irqsave)();
136 	memset(&t->rseq, 0, sizeof(t->rseq));
137 	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
138 }
139 
/* execve() hook: resets the RSEQ state of @t via rseq_reset() */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}
144 
145 /*
146  * If parent process has a registered restartable sequences area, the
147  * child inherits. Unregister rseq for a clone with CLONE_VM set.
148  *
149  * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
150  * on the COW page on exit to user space, when the child stays on the same
151  * CPU as the parent. That's obviously not guaranteed, but in overcommit
152  * scenarios it is more likely and optimizes for the fork/exec case without
153  * taking the fault.
154  */
rseq_fork(struct task_struct * t,u64 clone_flags)155 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
156 {
157 	if (clone_flags & CLONE_VM)
158 		rseq_reset(t);
159 	else
160 		t->rseq = current->rseq;
161 }
162 
163 /*
164  * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
165  * registration. This is the active rseq area size rounded up to next
166  * power of 2, which guarantees that the rseq structure will always be
167  * aligned on the nearest power of two large enough to contain it, even
168  * as it grows.
169  */
rseq_alloc_align(void)170 static inline unsigned int rseq_alloc_align(void)
171 {
172 	return 1U << get_count_order(offsetof(struct rseq, end));
173 }
174 
175 #else /* CONFIG_RSEQ */
rseq_v2(struct task_struct * t)176 static inline bool rseq_v2(struct task_struct *t) { return false; }
rseq_handle_slowpath(struct pt_regs * regs)177 static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
rseq_signal_deliver(struct ksignal * ksig,struct pt_regs * regs)178 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
rseq_sched_switch_event(struct task_struct * t)179 static inline void rseq_sched_switch_event(struct task_struct *t) { }
rseq_sched_set_ids_changed(struct task_struct * t)180 static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
rseq_force_update(void)181 static inline void rseq_force_update(void) { }
rseq_virt_userspace_exit(void)182 static inline void rseq_virt_userspace_exit(void) { }
rseq_fork(struct task_struct * t,u64 clone_flags)183 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
rseq_execve(struct task_struct * t)184 static inline void rseq_execve(struct task_struct *t) { }
185 #endif  /* !CONFIG_RSEQ */
186 
/* rseq_syscall() is only implemented with CONFIG_DEBUG_RSEQ, else it is an empty stub */
#ifndef CONFIG_DEBUG_RSEQ
static inline void rseq_syscall(struct pt_regs *regs) { }
#else /* !CONFIG_DEBUG_RSEQ */
void rseq_syscall(struct pt_regs *regs);
#endif /* CONFIG_DEBUG_RSEQ */
192 
193 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
194 void rseq_syscall_enter_work(long syscall);
195 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
196 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
rseq_syscall_enter_work(long syscall)197 static inline void rseq_syscall_enter_work(long syscall) { }
rseq_slice_extension_prctl(unsigned long arg2,unsigned long arg3)198 static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
199 {
200 	return -ENOTSUPP;
201 }
202 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
203 
204 #endif /* _LINUX_RSEQ_H */
205