xref: /linux/include/linux/rseq.h (revision 61706251492eff650e91c58507bc77e1a12c7fbb)
1 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
2 #ifndef _LINUX_RSEQ_H
3 #define _LINUX_RSEQ_H
4 
5 #ifdef CONFIG_RSEQ
6 #include <linux/sched.h>
7 
8 #include <uapi/linux/rseq.h>
9 
10 void __rseq_handle_slowpath(struct pt_regs *regs);
11 
12 /* Invoked from resume_user_mode_work() */
rseq_handle_slowpath(struct pt_regs * regs)13 static inline void rseq_handle_slowpath(struct pt_regs *regs)
14 {
15 	if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
16 		if (current->rseq.event.slowpath)
17 			__rseq_handle_slowpath(regs);
18 	} else {
19 		/* '&' is intentional to spare one conditional branch */
20 		if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
21 			__rseq_handle_slowpath(regs);
22 	}
23 }
24 
25 void __rseq_signal_deliver(int sig, struct pt_regs *regs);
26 
27 /*
28  * Invoked from signal delivery to fixup based on the register context before
29  * switching to the signal delivery context.
30  */
rseq_signal_deliver(struct ksignal * ksig,struct pt_regs * regs)31 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
32 {
33 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
34 		/* '&' is intentional to spare one conditional branch */
35 		if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
36 			__rseq_signal_deliver(ksig->sig, regs);
37 	} else {
38 		if (current->rseq.event.has_rseq)
39 			__rseq_signal_deliver(ksig->sig, regs);
40 	}
41 }
42 
/* Request rseq exit-to-user work for @t by raising its TIF_RSEQ flag */
static inline void rseq_raise_notify_resume(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_RSEQ);
}
47 
48 /* Invoked from context switch to force evaluation on exit to user */
rseq_sched_switch_event(struct task_struct * t)49 static __always_inline void rseq_sched_switch_event(struct task_struct *t)
50 {
51 	struct rseq_event *ev = &t->rseq.event;
52 
53 	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
54 		/*
55 		 * Avoid a boat load of conditionals by using simple logic
56 		 * to determine whether NOTIFY_RESUME needs to be raised.
57 		 *
58 		 * It's required when the CPU or MM CID has changed or
59 		 * the entry was from user space.
60 		 */
61 		bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
62 
63 		if (raise) {
64 			ev->sched_switch = true;
65 			rseq_raise_notify_resume(t);
66 		}
67 	} else {
68 		if (ev->has_rseq) {
69 			t->rseq.event.sched_switch = true;
70 			rseq_raise_notify_resume(t);
71 		}
72 	}
73 }
74 
75 /*
76  * Invoked from __set_task_cpu() when a task migrates or from
77  * mm_cid_schedin() when the CID changes to enforce an IDs update.
78  *
79  * This does not raise TIF_NOTIFY_RESUME as that happens in
80  * rseq_sched_switch_event().
81  */
rseq_sched_set_ids_changed(struct task_struct * t)82 static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
83 {
84 	t->rseq.event.ids_changed = true;
85 }
86 
87 /* Enforce a full update after RSEQ registration and when execve() failed */
rseq_force_update(void)88 static inline void rseq_force_update(void)
89 {
90 	if (current->rseq.event.has_rseq) {
91 		current->rseq.event.ids_changed = true;
92 		current->rseq.event.sched_switch = true;
93 		rseq_raise_notify_resume(current);
94 	}
95 }
96 
97 /*
98  * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
99  * which clears TIF_NOTIFY_RESUME on architectures that don't use the
100  * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
101  *
102  * To avoid updating user space RSEQ in that case just to do it eventually
103  * again before returning to user space, because __rseq_handle_slowpath()
104  * does nothing when invoked with NULL register state.
105  *
106  * After returning from guest mode, before exiting to userspace, hypervisors
107  * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
108  */
rseq_virt_userspace_exit(void)109 static inline void rseq_virt_userspace_exit(void)
110 {
111 	/*
112 	 * The generic optimization for deferring RSEQ updates until the next
113 	 * exit relies on having a dedicated TIF_RSEQ.
114 	 */
115 	if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
116 	    current->rseq.event.sched_switch)
117 		rseq_raise_notify_resume(current);
118 }
119 
/* Clear the complete rseq state of @t and invalidate the cached CPU ID */
static inline void rseq_reset(struct task_struct *t)
{
	memset(&t->rseq, 0, sizeof(t->rseq));
	/* 0 is a valid CPU number; mark the cached ID explicitly uninitialized */
	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
125 
/* rseq registration does not survive execve(); start with clean state */
static inline void rseq_execve(struct task_struct *t)
{
	rseq_reset(t);
}
130 
131 /*
132  * If parent process has a registered restartable sequences area, the
133  * child inherits. Unregister rseq for a clone with CLONE_VM set.
134  *
135  * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
136  * on the COW page on exit to user space, when the child stays on the same
137  * CPU as the parent. That's obviously not guaranteed, but in overcommit
138  * scenarios it is more likely and optimizes for the fork/exec case without
139  * taking the fault.
140  */
rseq_fork(struct task_struct * t,u64 clone_flags)141 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
142 {
143 	if (clone_flags & CLONE_VM)
144 		rseq_reset(t);
145 	else
146 		t->rseq = current->rseq;
147 }
148 
149 /*
150  * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
151  * registration. This is the active rseq area size rounded up to next
152  * power of 2, which guarantees that the rseq structure will always be
153  * aligned on the nearest power of two large enough to contain it, even
154  * as it grows.
155  */
rseq_alloc_align(void)156 static inline unsigned int rseq_alloc_align(void)
157 {
158 	return 1U << get_count_order(offsetof(struct rseq, end));
159 }
160 
#else /* CONFIG_RSEQ */
/* No-op stubs so call sites need no #ifdeffery when rseq is disabled */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
#endif  /* !CONFIG_RSEQ */
171 
#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
/* No-op stub when rseq debug checks are disabled */
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */
177 
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
void rseq_syscall_enter_work(long syscall);
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline void rseq_syscall_enter_work(long syscall) { }
/* Signal to callers that the prctl() is unavailable in this configuration */
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	return -ENOTSUPP;
}
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
188 
189 #endif /* _LINUX_RSEQ_H */
190