// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context. On
 * architectures that do not have that, a deferred user space stack
 * trace cannot be requested from NMI context; such a request will
 * return -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
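/* e.g. with 8-byte longs this leaves room for roughly 500 cached entries per task */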

/* Guards adding to and reading the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS	(UNWIND_PENDING)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;

static inline bool unwind_pending(struct unwind_task_info *info)
{
	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID.  In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie identifies that stacktrace. The
 * stacktrace can, however, get another cookie if another request is made
 * after the cookie was cleared and before reentering user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non zero */

	return info->id.id;
}
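
/*
 * Illustration: info->id is a union of a single 64-bit id with a
 * { cpu, cnt } pair (see its use above), so the returned cookie encodes
 * both the CPU the task entered the kernel on and a per-CPU entry
 * counter.  A minimal sketch of a caller, assuming interrupts are
 * already disabled:
 *
 *	u64 cookie = get_cookie(&current->unwind_info);
 *	// cookie is nonzero and stays the same until the task
 *	// returns to user space.
 */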

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * @trace will be loaded with the addresses of the user space stacktrace
 * if it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (current->flags & PF_EXITING)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context.  Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	return 0;
}
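
/*
 * Usage sketch (hypothetical caller): code already running in a
 * faultable context can fetch the current task's user stacktrace
 * directly instead of deferring it:
 *
 *	struct unwind_stacktrace trace;
 *
 *	if (!unwind_user_faultable(&trace)) {
 *		for (unsigned int i = 0; i < trace.nr; i++)
 *			record_user_frame(trace.entries[i]); // hypothetical helper
 *	}
 */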
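/*
 * Task work callback, run by task_work before the task returns to user
 * space.  It clears UNWIND_PENDING while keeping track of which work
 * bits were set, produces the user stacktrace (possibly from the
 * per-task cache), and invokes the callback of every registered
 * unwind_work whose bit was set by unwind_deferred_request().
 */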
static void unwind_deferred_task_work(struct callback_head *head)
{
	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear the pending bit and fetch the current value of the other bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
				  (atomic_long_t *)&info->unwind_mask);
	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	cookie = info->id.id;

	guard(mutex)(&callback_mutex);
	list_for_each_entry(work, &callbacks, list) {
		if (test_bit(work->bit, &bits))
			work->func(work, &trace, cookie);
	}
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function.  It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context.  Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	unsigned long old, bits;
	unsigned long bit = BIT(work->bit);
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = READ_ONCE(info->unwind_mask);

	/* Is this already queued or executed */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return old & bit;
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);

	if (WARN_ON_ONCE(ret))
		WRITE_ONCE(info->unwind_mask, 0);

	return ret;
}
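
/*
 * Usage sketch (hypothetical caller): a profiler sampling in interrupt
 * or NMI context records the cookie alongside its kernel-side sample
 * and lets the user portion be unwound on kernel exit:
 *
 *	u64 cookie;
 *
 *	if (unwind_deferred_request(&my_unwind_work, &cookie) >= 0)
 *		my_sample->user_ctx_cookie = cookie;	// hypothetical fields
 *	// my_unwind_work's callback later receives the trace and the
 *	// same cookie, letting post-processing stitch the two together.
 */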
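/**
 * unwind_deferred_cancel - Unregister an unwind work descriptor
 * @work: The unwind descriptor to remove
 *
 * Remove @work from the callback list, release its bit in unwind_mask
 * for reuse, and clear that bit in every task so no pending request can
 * still invoke @work's callback.
 */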
void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;

	if (!work)
		return;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del(&work->list);

	__clear_bit(work->bit, &unwind_mask);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		clear_bit(work->bit, &t->unwind_info.unwind_mask);
	}
}
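/**
 * unwind_deferred_init - Register an unwind work descriptor and callback
 * @work: The unwind descriptor to register
 * @func: The callback to call when a deferred user stacktrace is ready
 *
 * Assign @work a free bit in unwind_mask and add it to the callback
 * list.
 *
 * Return: 0 on success, or -EBUSY if all available bits are in use.
 */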
int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add(&work->list, &callbacks);
	work->func = func;
	return 0;
}
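
/*
 * Registration lifecycle sketch (hypothetical user of this API):
 *
 *	static struct unwind_work my_unwind_work;	// hypothetical
 *
 *	static void my_callback(struct unwind_work *work,
 *				struct unwind_stacktrace *trace, u64 cookie)
 *	{
 *		// consume trace->entries[0 .. trace->nr) for this cookie
 *	}
 *
 *	// setup:    unwind_deferred_init(&my_unwind_work, my_callback);
 *	// request:  unwind_deferred_request(&my_unwind_work, &cookie);
 *	// teardown: unwind_deferred_cancel(&my_unwind_work);
 */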
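/* Initialize the per-task deferred unwind state of a newly created task */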
void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	info->unwind_mask = 0;
}
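/*
 * Release the per-task deferred unwind state: free the stacktrace cache
 * and cancel any task work still queued for this task.
 */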
void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}
317