xref: /linux/kernel/unwind/deferred.c (revision 055c7060e7ca71bb86da616158fc74254730ae2a)
// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context. On
 * architectures that do not, a deferred user space stack trace cannot
 * be requested from NMI context; such a request will fail with -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
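
/*
 * A worked example of the arithmetic above, assuming a 64-bit kernel where
 * sizeof(long) == 8 and struct unwind_cache carries only a small header
 * that pads out to 8 bytes:
 *
 *	UNWIND_MAX_ENTRIES = (4096 - 8) / 8 = 511 entries
 *
 * so one cached user stacktrace always fits within a single 4K page.
 */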

/* Guards adding to and reading the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID.  In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for that stacktrace.
 * The stacktrace can still be given a new cookie if another request is made
 * after the cookie was cleared and before reentering user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per-CPU counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non-zero */

	return info->id.id;
}
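
/*
 * A minimal sketch of the id layout that get_cookie() relies on (the real
 * definition lives with struct unwind_task_info; the field names below just
 * mirror the accesses above and are shown only for illustration):
 *
 *	union {
 *		struct {
 *			u32	cpu;	// CPU number + 1, non-zero once assigned
 *			u32	cnt;	// per-CPU entry counter, LSB always set
 *		};
 *		u64	id;		// the 64-bit context cookie
 *	};
 *
 * The cookie is therefore unique system-wide for a given entry-from-user
 * context: one half distinguishes the CPU and the other half distinguishes
 * successive entry contexts on that CPU.
 */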

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * @trace will be loaded with the addresses of the user space stacktrace
 * if it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (current->flags & PF_EXITING)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context.  Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	return 0;
}
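
/*
 * A minimal usage sketch with hypothetical names (everything prefixed
 * "example_" is made up): a tracer running in faultable task context fills
 * an on-stack descriptor and walks the cached entries.
 */
static void example_record_user_stack(void)
{
	struct unwind_stacktrace trace;
	unsigned int i;

	if (unwind_user_faultable(&trace))
		return;	/* task is exiting or the cache allocation failed */

	/* trace.entries[0 .. trace.nr - 1] now hold user return addresses */
	for (i = 0; i < trace.nr; i++)
		pr_debug("user frame %u: %lx\n", i, trace.entries[i]);
}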

static void unwind_deferred_task_work(struct callback_head *head)
{
	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	u64 cookie;

	if (WARN_ON_ONCE(!info->pending))
		return;

	/* Allow work to come in again */
	WRITE_ONCE(info->pending, 0);

	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	cookie = info->id.id;

	guard(mutex)(&callback_mutex);
	list_for_each_entry(work, &callbacks, list) {
		work->func(work, &trace, cookie);
	}
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function.  It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context.  Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 1 if the callback was already queued.
 *         0 if the callback was successfully queued.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	long pending;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	guard(irqsave)();

	*cookie = get_cookie(info);

	/* callback already pending? */
	pending = READ_ONCE(info->pending);
	if (pending)
		return 1;

	/* Claim the work unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg(&info->pending, &pending, 1))
		return 1;

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);
	if (WARN_ON_ONCE(ret)) {
		WRITE_ONCE(info->pending, 0);
		return ret;
	}

	return 0;
}
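
/*
 * A minimal usage sketch with hypothetical names (everything prefixed
 * "example_" is made up): from a tracing hook that may run in interrupt
 * context, request that the user stacktrace be generated on return to user
 * space and record only the cookie with the kernel-side event, so the two
 * can be stitched together later. The work descriptor is assumed to have
 * been registered with unwind_deferred_init(), as sketched further below.
 */
static struct unwind_work example_work;

static void example_trace_event_hook(void)
{
	u64 cookie;
	int ret;

	ret = unwind_deferred_request(&example_work, &cookie);
	if (ret < 0)
		return;	/* kthread, exiting task, or NMI without safe cmpxchg */

	/*
	 * ret is 0 if the task work was newly queued and 1 if it was already
	 * pending for this entry context; either way @cookie identifies the
	 * user stacktrace that the deferred callback will deliver.
	 */
	pr_debug("deferred user stack requested, cookie %llx\n", cookie);
}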

void unwind_deferred_cancel(struct unwind_work *work)
{
	if (!work)
		return;

	guard(mutex)(&callback_mutex);
	list_del(&work->list);
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);
	list_add(&work->list, &callbacks);
	work->func = func;
	return 0;
}
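
/*
 * Continuing the sketch from above (all "example_" names are hypothetical):
 * the tracer registers its callback once at init time. The callback runs
 * from task work just before the task returns to user space and receives
 * the unwound user stacktrace along with the same cookie that
 * unwind_deferred_request() reported.
 */
static void example_trace_callback(struct unwind_work *work,
				   struct unwind_stacktrace *trace, u64 cookie)
{
	unsigned int i;

	/* An empty trace (trace->nr == 0) means the unwind was not possible */
	pr_debug("cookie %llx: %u user frames\n", cookie, trace->nr);
	for (i = 0; i < trace->nr; i++)
		pr_debug("  %lx\n", trace->entries[i]);
}

static int example_tracer_init(void)
{
	return unwind_deferred_init(&example_work, example_trace_callback);
}

static void example_tracer_exit(void)
{
	/* Stop receiving callbacks and remove example_work from the list */
	unwind_deferred_cancel(&example_work);
}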

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}