xref: /linux/kernel/unwind/deferred.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context. On
 * architectures that do not have that, a deferred user space stack
 * trace cannot be requested from NMI context; such a request will
 * fail with -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS	(UNWIND_PENDING | UNWIND_USED)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;
DEFINE_STATIC_SRCU(unwind_srcu);

static inline bool unwind_pending(struct unwind_task_info *info)
{
	return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID.  In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie identifies that stacktrace. The
 * stacktrace can, however, get a new cookie if another request is made
 * after the cookie was cleared and before the task reenters user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	lockdep_assert_irqs_disabled();

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non-zero */

	return info->id.id;
}

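/*
 * Illustrative sketch, not part of this file's logic: the cookie is meant to
 * be treated as an opaque 64-bit key. A hypothetical consumer could record it
 * alongside the kernel-side data it samples at request time, then use it to
 * match the deferred user stacktrace that is later handed to its callback.
 * The "example_*" names below are made up for this sketch.
 */
struct example_event_record {
	u64	ctx_cookie;	/* value reported via unwind_deferred_request() */
	/* ... kernel stacktrace and other data sampled at event time ... */
};

static bool __maybe_unused
example_record_matches(const struct example_event_record *rec, u64 cookie)
{
	/* Same cookie => same task and same entry-from-user context */
	return rec->ctx_cookie == cookie;
}
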
/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * @trace will be loaded with the addresses of the user space stacktrace
 * if it can be found.
 *
 * Return: 0 on success and negative on error.
 *         On success @trace will contain the user space stacktrace.
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (!current->mm)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;
	trace->nr = cache->nr_entries;
	/*
	 * If the user stack was already unwound in this entry context,
	 * skip the unwind and use the cache.
	 */
	if (trace->nr)
		return 0;

	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	/* Make sure nr_entries is cleared on the way back to user space */
	atomic_long_or(UNWIND_USED, &info->unwind_mask);

	return 0;
}

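/*
 * Illustrative sketch, not part of this file's logic: a possible caller of
 * unwind_user_faultable() from a context known to be faultable, such as a
 * task_work callback run on return to user space. The function name and the
 * pr_info() output are hypothetical.
 */
static void __maybe_unused example_print_user_stack(struct callback_head *head)
{
	struct unwind_stacktrace trace;
	unsigned int i;

	/* Task context, about to return to user space: faulting is allowed */
	if (unwind_user_faultable(&trace))
		return;

	for (i = 0; i < trace.nr; i++)
		pr_info("user frame %u: 0x%lx\n", i, (unsigned long)trace.entries[i]);
}
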
static void process_unwind_deferred(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear pending bit but make sure to have the current bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
					&info->unwind_mask);
	/*
	 * From here on out, the callbacks must always be called, even if
	 * the trace is empty.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	if (info->cache)
		bits &= ~(info->cache->unwind_completed);

	cookie = info->id.id;

	guard(srcu)(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu)) {
		if (test_bit(work->bit, &bits)) {
			work->func(work, &trace, cookie);
			if (info->cache)
				info->cache->unwind_completed |= BIT(work->bit);
		}
	}
}

static void unwind_deferred_task_work(struct callback_head *head)
{
	process_unwind_deferred(current);
}

void unwind_deferred_task_exit(struct task_struct *task)
{
	struct unwind_task_info *info = &current->unwind_info;

	if (!unwind_pending(info))
		return;

	process_unwind_deferred(task);

	task_work_cancel(task, &info->work);
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function.  It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context.  Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user.
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	int twa_mode = TWA_RESUME;
	unsigned long old, bits;
	unsigned long bit;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (in_nmi()) {
		if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
			return -EINVAL;
		twa_mode = TWA_NMI_CURRENT;
	}

	/* Do not allow cancelled works to request again */
	bit = READ_ONCE(work->bit);
	/* A cancelled work has its bit set to -1; @bit is unsigned, so cast for the check */
	if (WARN_ON_ONCE((int)bit < 0))
		return -EINVAL;

	/* Only need the mask now */
	bit = BIT(bit);

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = atomic_long_read(&info->unwind_mask);

	/* Is this already queued or executed */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, &info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return old & bit;
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, twa_mode);

	if (WARN_ON_ONCE(ret))
		atomic_long_set(&info->unwind_mask, 0);

	return ret;
}

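/*
 * Illustrative sketch, not part of this file's logic: a hypothetical tracer
 * that registers one unwind_work, requests a deferred user stacktrace from
 * its event handler, and receives the trace in its callback on the way back
 * to user space. All "example_*" names are invented for this sketch; only
 * unwind_deferred_init(), unwind_deferred_request(), unwind_deferred_cancel()
 * and the callback signature come from this infrastructure.
 */
static void example_unwind_callback(struct unwind_work *work,
				    struct unwind_stacktrace *trace,
				    u64 cookie)
{
	/*
	 * Runs in task context before the task returns to user space.
	 * @cookie matches the value reported by unwind_deferred_request()
	 * earlier in this entry context, so the user frames in @trace can be
	 * stitched to the kernel-side sample recorded at event time.
	 */
	pr_info("deferred user unwind for cookie 0x%llx\n",
		(unsigned long long)cookie);
}

static struct unwind_work example_unwind_work;

static int __maybe_unused example_tracer_init(void)
{
	/* Reserves a bit in unwind_mask and hooks into the callback list */
	return unwind_deferred_init(&example_unwind_work, example_unwind_callback);
}

static void __maybe_unused example_event_handler(void)
{
	u64 cookie;

	/*
	 * May be called from interrupt (or NMI where supported) context.
	 * 0 means the task_work was queued, 1 means a request was already
	 * pending for this entry context; either way @cookie identifies it.
	 */
	if (unwind_deferred_request(&example_unwind_work, &cookie) < 0)
		return;

	/* Record the kernel-side sample together with @cookie here. */
}

static void __maybe_unused example_tracer_exit(void)
{
	/* Stop new requests and make sure no callback is still in flight */
	unwind_deferred_cancel(&example_unwind_work);
}
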
void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;
	int bit;

	if (!work)
		return;

	bit = work->bit;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del_rcu(&work->list);

	/* Do not allow any more requests and prevent callbacks */
	work->bit = -1;

	__clear_bit(bit, &unwind_mask);

	synchronize_srcu(&unwind_srcu);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		atomic_long_andnot(BIT(bit),
				   &t->unwind_info.unwind_mask);
		if (t->unwind_info.cache)
			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
	}
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add_rcu(&work->list, &callbacks);
	work->func = func;
	return 0;
}

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	atomic_long_set(&info->unwind_mask, 0);
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}
367