// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context. Architectures
 * that do not have that cannot request a deferred user space stack trace
 * from NMI context; if they do, the request fails with -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI 1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
        u32 old = 0;

        return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI 0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
        info->id.cnt = cnt;
        return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES \
        ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
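/*
 * On a 64-bit kernel with 8-byte longs, for example, this works out to a
 * bit under 512 cached entries per task, after the struct unwind_cache
 * header is subtracted from the 4K budget.
 */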

/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;
DEFINE_STATIC_SRCU(unwind_srcu);

static inline bool unwind_pending(struct unwind_task_info *info)
{
        return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID. In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * It is possible, though, for the stacktrace to get another cookie if another
 * request is made after the cookie was cleared and before reentering user
 * space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
        u32 cnt = 1;

        lockdep_assert_irqs_disabled();

        if (info->id.cpu)
                return info->id.id;

        /* LSB is always set to ensure 0 is an invalid value */
        cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
        if (try_assign_cnt(info, cnt)) {
                /* Update the per cpu counter */
                __this_cpu_write(unwind_ctx_ctr, cnt);
        }
        /* Interrupts are disabled, the CPU will always be the same */
        info->id.cpu = smp_processor_id() + 1; /* Must be non zero */

        return info->id.id;
}

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * the @trace will be loaded with the addresses of the user space stacktrace
 * if it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
        struct unwind_task_info *info = &current->unwind_info;
        struct unwind_cache *cache;

        /* Should always be called from faultable context */
        might_fault();

        if (!current->mm)
                return -EINVAL;

        if (!info->cache) {
                info->cache = kzalloc_flex(*cache, entries, UNWIND_MAX_ENTRIES);
                if (!info->cache)
                        return -ENOMEM;
        }

        cache = info->cache;
        trace->entries = cache->entries;
        trace->nr = cache->nr_entries;
        /*
         * The user stack has already been previously unwound in this
         * entry context. Skip the unwind and use the cache.
         */
        if (trace->nr)
                return 0;

        unwind_user(trace, UNWIND_MAX_ENTRIES);

        cache->nr_entries = trace->nr;

        /* Clear nr_entries on way back to user space */
        atomic_long_or(UNWIND_USED, &info->unwind_mask);

        return 0;
}
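
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * a tracer already running in faultable task context could fetch the
 * cached user stacktrace and walk its entries like this, where
 * record_user_frame() stands in for whatever the tracer does with each
 * user space return address:
 *
 *	struct unwind_stacktrace trace;
 *	unsigned int i;
 *
 *	if (!unwind_user_faultable(&trace)) {
 *		for (i = 0; i < trace.nr; i++)
 *			record_user_frame(trace.entries[i]);
 *	}
 */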

static void process_unwind_deferred(struct task_struct *task)
{
        struct unwind_task_info *info = &task->unwind_info;
        struct unwind_stacktrace trace;
        struct unwind_work *work;
        unsigned long bits;
        u64 cookie;

        if (WARN_ON_ONCE(!unwind_pending(info)))
                return;

        /* Clear pending bit but make sure to have the current bits */
        bits = atomic_long_fetch_andnot(UNWIND_PENDING,
                                        &info->unwind_mask);
        /*
         * From here on out, the callback must always be called, even if it's
         * just an empty trace.
         */
        trace.nr = 0;
        trace.entries = NULL;

        unwind_user_faultable(&trace);

        if (info->cache)
                bits &= ~(info->cache->unwind_completed);

        cookie = info->id.id;

        guard(srcu)(&unwind_srcu);
        list_for_each_entry_srcu(work, &callbacks, list,
                                 srcu_read_lock_held(&unwind_srcu)) {
                if (test_bit(work->bit, &bits)) {
                        work->func(work, &trace, cookie);
                        if (info->cache)
                                info->cache->unwind_completed |= BIT(work->bit);
                }
        }
}

static void unwind_deferred_task_work(struct callback_head *head)
{
        process_unwind_deferred(current);
}

void unwind_deferred_task_exit(struct task_struct *task)
{
        struct unwind_task_info *info = &current->unwind_info;

        if (!unwind_pending(info))
                return;

        process_unwind_deferred(task);

        task_work_cancel(task, &info->work);
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function. It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
        struct unwind_task_info *info = &current->unwind_info;
        int twa_mode = TWA_RESUME;
        unsigned long old, bits;
        unsigned long bit;
        int ret;

        *cookie = 0;

        if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
            !user_mode(task_pt_regs(current)))
                return -EINVAL;

        /*
         * NMI requires having safe cmpxchg operations.
         * Trigger a warning to make it obvious that an architecture
         * is using this in NMI when it should not be.
         */
        if (in_nmi()) {
                if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
                        return -EINVAL;
                twa_mode = TWA_NMI_CURRENT;
        }

        /* Do not allow cancelled works to request again */
        bit = READ_ONCE(work->bit);
        if (WARN_ON_ONCE(bit < 0))
                return -EINVAL;

        /* Only need the mask now */
        bit = BIT(bit);

        guard(irqsave)();

        *cookie = get_cookie(info);

        old = atomic_long_read(&info->unwind_mask);

        /* Is this already queued or executed */
        if (old & bit)
                return 1;

        /*
         * This work's bit hasn't been set yet. Now set it with the PENDING
         * bit and fetch the current value of unwind_mask. If either the
         * work's bit or PENDING was already set, then this is already queued
         * to have a callback.
         */
        bits = UNWIND_PENDING | bit;
        old = atomic_long_fetch_or(bits, &info->unwind_mask);
        if (old & bits) {
                /*
                 * If the work's bit was set, whatever set it had better
                 * have also set pending and queued a callback.
                 */
                WARN_ON_ONCE(!(old & UNWIND_PENDING));
                return old & bit;
        }

        /* The work has been claimed, now schedule it. */
        ret = task_work_add(current, &info->work, twa_mode);

        if (WARN_ON_ONCE(ret))
                atomic_long_set(&info->unwind_mask, 0);

        return ret;
}
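
/*
 * A sketch of how a tracer might use this from its event context; the
 * emit_kernel_sample() helper and the my_unwind_work descriptor are
 * hypothetical, only the unwind_deferred_request() call is part of this
 * API:
 *
 *	u64 cookie;
 *	int ret;
 *
 *	ret = unwind_deferred_request(&my_unwind_work, &cookie);
 *	if (ret < 0)
 *		return;
 *
 *	emit_kernel_sample(cookie);
 *
 * The kernel-side sample is tagged with @cookie immediately; the user
 * stacktrace arrives later via the registered callback carrying the same
 * cookie, letting the two records be stitched together afterwards.
 */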

void unwind_deferred_cancel(struct unwind_work *work)
{
        struct task_struct *g, *t;
        int bit;

        if (!work)
                return;

        bit = work->bit;

        /* No work should be using a reserved bit */
        if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
                return;

        guard(mutex)(&callback_mutex);
        list_del_rcu(&work->list);

        /* Do not allow any more requests and prevent callbacks */
        work->bit = -1;

        __clear_bit(bit, &unwind_mask);

        synchronize_srcu(&unwind_srcu);

        guard(rcu)();
        /* Clear this bit from all threads */
        for_each_process_thread(g, t) {
                atomic_long_andnot(BIT(bit),
                                   &t->unwind_info.unwind_mask);
                if (t->unwind_info.cache)
                        clear_bit(bit, &t->unwind_info.cache->unwind_completed);
        }
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
        memset(work, 0, sizeof(*work));

        guard(mutex)(&callback_mutex);

        /* See if there's a bit in the mask available */
        if (unwind_mask == ~0UL)
                return -EBUSY;

        work->bit = ffz(unwind_mask);
        __set_bit(work->bit, &unwind_mask);

        list_add_rcu(&work->list, &callbacks);
        work->func = func;
        return 0;
}
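
/*
 * A minimal registration sketch; the callback body and the my_* names are
 * hypothetical, only the unwind_* calls and the callback signature come
 * from this API:
 *
 *	static void my_unwind_cb(struct unwind_work *work,
 *				 struct unwind_stacktrace *trace, u64 cookie)
 *	{
 *		// runs in task context before returning to user space
 *	}
 *
 *	static struct unwind_work my_unwind_work;
 *
 *	if (unwind_deferred_init(&my_unwind_work, my_unwind_cb))
 *		return -EBUSY;
 *
 * and when the tracer is torn down, so the bit can be reused:
 *
 *	unwind_deferred_cancel(&my_unwind_work);
 */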

void unwind_task_init(struct task_struct *task)
{
        struct unwind_task_info *info = &task->unwind_info;

        memset(info, 0, sizeof(*info));
        init_task_work(&info->work, unwind_deferred_task_work);
        atomic_long_set(&info->unwind_mask, 0);
}

void unwind_task_free(struct task_struct *task)
{
        struct unwind_task_info *info = &task->unwind_info;

        kfree(info->cache);
        task_work_cancel(task, &info->work);
}
