// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * For requesting a deferred user space stack trace from NMI context,
 * the architecture must support a safe cmpxchg in NMI context.
 * Architectures that do not have that cannot ask for a deferred user
 * space stack trace from an NMI context. If they do, they will get
 * -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to and reading the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS	(UNWIND_PENDING)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;

static inline bool unwind_pending(struct unwind_task_info *info)
{
	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID. In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * It is still possible for the stacktrace to get another cookie if another
 * request is made after the cookie was cleared and before reentering user
 * space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non-zero */

	return info->id.id;
}
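
/*
 * Illustrative sketch (not part of this file): get_cookie() above relies on
 * info->id being a union that overlays a { cpu, cnt } pair with the 64-bit
 * cookie value. The real definition lives in the unwind_deferred types
 * header; the layout below is an assumed approximation, for reference only:
 *
 *	union unwind_task_id {
 *		struct {
 *			u32	cpu;	// CPU id + 1, zero means "no cookie yet"
 *			u32	cnt;	// per-CPU entry-context counter, LSB set
 *		};
 *		u64	id;		// the combined "context cookie"
 *	};
 *
 * With such a layout, a non-zero info->id.cpu means a cookie has already
 * been generated for the current entry context, and clearing info->id.id
 * invalidates it.
 */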

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * @trace will be loaded with the addresses of the user space stacktrace,
 * if it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (current->flags & PF_EXITING)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context. Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	return 0;
}

static void unwind_deferred_task_work(struct callback_head *head)
{
	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear the pending bit but make sure to have the current bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
					(atomic_long_t *)&info->unwind_mask);
	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	cookie = info->id.id;

	guard(mutex)(&callback_mutex);
	list_for_each_entry(work, &callbacks, list) {
		if (test_bit(work->bit, &bits))
			work->func(work, &trace, cookie);
	}
}
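
/*
 * Illustrative sketch (not compiled here): a minimal callback as it is
 * invoked by unwind_deferred_task_work() above via
 * work->func(work, &trace, cookie). The tracer name and the pr_info()
 * output are made up for the example:
 *
 *	static void my_tracer_unwind_cb(struct unwind_work *work,
 *					struct unwind_stacktrace *trace,
 *					u64 cookie)
 *	{
 *		unsigned int i;
 *
 *		// trace->nr can be zero if the user stack could not be unwound
 *		for (i = 0; i < trace->nr; i++)
 *			pr_info("cookie %llx frame[%u]: %lx\n",
 *				(unsigned long long)cookie, i, trace->entries[i]);
 *	}
 *
 * The same cookie is passed to every callback that requested an unwind in
 * this entry context, so a consumer can attach the cookie to its kernel-side
 * events and match them to this user stacktrace in post-processing.
 */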

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function. It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user.
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	unsigned long old, bits;
	unsigned long bit = BIT(work->bit);
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = READ_ONCE(info->unwind_mask);

	/* Is this already queued or executed? */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return old & bit;
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);

	if (WARN_ON_ONCE(ret))
		WRITE_ONCE(info->unwind_mask, 0);

	return ret;
}

void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;

	if (!work)
		return;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del(&work->list);

	__clear_bit(work->bit, &unwind_mask);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		clear_bit(work->bit, &t->unwind_info.unwind_mask);
	}
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add(&work->list, &callbacks);
	work->func = func;
	return 0;
}

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	info->unwind_mask = 0;
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}
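
/*
 * Illustrative usage sketch: the function and variable names below are made
 * up for the example; only unwind_deferred_init(), unwind_deferred_request()
 * and unwind_deferred_cancel() are the entry points defined in this file.
 *
 *	static struct unwind_work my_unwind_work;
 *
 *	static void my_unwind_cb(struct unwind_work *work,
 *				 struct unwind_stacktrace *trace, u64 cookie)
 *	{
 *		// Record trace->entries[0 .. trace->nr) keyed by @cookie.
 *	}
 *
 *	static int my_tracer_start(void)
 *	{
 *		// Reserves a bit in unwind_mask and registers the callback.
 *		return unwind_deferred_init(&my_unwind_work, my_unwind_cb);
 *	}
 *
 *	static void my_event_handler(void)	// e.g. from an interrupt
 *	{
 *		u64 cookie;
 *		int ret;
 *
 *		ret = unwind_deferred_request(&my_unwind_work, &cookie);
 *		if (ret >= 0) {
 *			// 0: newly queued, 1: already pending or executed for
 *			// this entry context. Either way @cookie identifies the
 *			// user stacktrace and can be emitted with the event.
 *		}
 *	}
 *
 *	static void my_tracer_stop(void)
 *	{
 *		unwind_deferred_cancel(&my_unwind_work);
 *	}
 */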