// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * Requesting a deferred user space stack trace from NMI context requires
 * that the architecture support a safe cmpxchg in NMI context.
 * Architectures that do not have that cannot ask for a deferred user space
 * stack trace from an NMI context. If they do, the request fails with
 * -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS	(UNWIND_PENDING | UNWIND_USED)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;
DEFINE_STATIC_SRCU(unwind_srcu);

static inline bool unwind_pending(struct unwind_task_info *info)
{
	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID. In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * It is possible, however, for the stacktrace to get another cookie if
 * another request is made after the cookie was cleared and before
 * reentering user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non zero */

	return info->id.id;
}

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementation,
 * @trace will be loaded with the addresses of the user space stacktrace,
 * if one can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (!current->mm)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context. Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	/* Clear nr_entries on the way back to user space */
	set_bit(UNWIND_USED_BIT, &info->unwind_mask);

	return 0;
}
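
/*
 * Example (illustrative sketch): one way a caller that is already running
 * in a faultable context (e.g. task work on the way back to user space)
 * might consume the trace produced by unwind_user_faultable(). The function
 * name and the pr_info() reporting are hypothetical.
 *
 *	static void dump_user_stack(void)
 *	{
 *		struct unwind_stacktrace trace;
 *		unsigned int i;
 *
 *		if (unwind_user_faultable(&trace))
 *			return;
 *
 *		for (i = 0; i < trace.nr; i++)
 *			pr_info("  %016lx\n", trace.entries[i]);
 *	}
 */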

static void process_unwind_deferred(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear pending bit but make sure to have the current bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
					(atomic_long_t *)&info->unwind_mask);
	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	if (info->cache)
		bits &= ~(info->cache->unwind_completed);

	cookie = info->id.id;

	guard(srcu)(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu)) {
		if (test_bit(work->bit, &bits)) {
			work->func(work, &trace, cookie);
			if (info->cache)
				info->cache->unwind_completed |= BIT(work->bit);
		}
	}
}

static void unwind_deferred_task_work(struct callback_head *head)
{
	process_unwind_deferred(current);
}

void unwind_deferred_task_exit(struct task_struct *task)
{
	struct unwind_task_info *info = &current->unwind_info;

	if (!unwind_pending(info))
		return;

	process_unwind_deferred(task);

	task_work_cancel(task, &info->work);
}
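
/*
 * Example (illustrative sketch): a callback, as invoked by
 * process_unwind_deferred() above, receives the requesting descriptor, the
 * unwound user stacktrace and the context cookie of the entry context it
 * belongs to. It is called from task work in the task's own context, not
 * from the context that requested the trace. The names below are
 * hypothetical; a real user would typically hand the trace and cookie to
 * its own output buffer.
 *
 *	static void my_unwind_callback(struct unwind_work *work,
 *				       struct unwind_stacktrace *trace,
 *				       u64 cookie)
 *	{
 *		unsigned int i;
 *
 *		pr_debug("user stack for cookie %llx (%u entries)\n",
 *			 (unsigned long long)cookie, trace->nr);
 *		for (i = 0; i < trace->nr; i++)
 *			pr_debug("  %016lx\n", trace->entries[i]);
 *	}
 */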

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function. It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	int twa_mode = TWA_RESUME;
	unsigned long old, bits;
	long bit;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (in_nmi()) {
		if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
			return -EINVAL;
		twa_mode = TWA_NMI_CURRENT;
	}

	/* Do not allow cancelled works to request again */
	bit = READ_ONCE(work->bit);
	if (WARN_ON_ONCE(bit < 0))
		return -EINVAL;

	/* Only need the mask now */
	bit = BIT(bit);

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = READ_ONCE(info->unwind_mask);

	/* Is this already queued or executed */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return old & bit;
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, twa_mode);

	if (WARN_ON_ONCE(ret))
		WRITE_ONCE(info->unwind_mask, 0);

	return ret;
}
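
/*
 * Example (illustrative sketch): a tracer or profiler would typically call
 * unwind_deferred_request() from the context where its event fires (an
 * interrupt or, where CAN_USE_IN_NMI allows it, an NMI) and record the
 * returned cookie alongside its kernel-side sample. The deferred user
 * stacktrace later arrives in the callback tagged with the same cookie,
 * letting post-processing stitch the two traces together. The names below
 * are hypothetical, and my_unwind_work is assumed to have been registered
 * with unwind_deferred_init() beforehand.
 *
 *	static struct unwind_work my_unwind_work;
 *
 *	static void my_event_handler(void)
 *	{
 *		u64 cookie;
 *		int ret;
 *
 *		ret = unwind_deferred_request(&my_unwind_work, &cookie);
 *		if (ret < 0)
 *			return;	// no deferred trace will follow
 *
 *		// ret == 0: queued now; ret == 1: already queued or executed
 *		// for this entry context, same cookie either way.
 *		my_record_kernel_sample(cookie);
 *	}
 */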

void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;
	int bit;

	if (!work)
		return;

	bit = work->bit;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del_rcu(&work->list);

	/* Do not allow any more requests and prevent callbacks */
	work->bit = -1;

	__clear_bit(bit, &unwind_mask);

	synchronize_srcu(&unwind_srcu);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		clear_bit(bit, &t->unwind_info.unwind_mask);
		if (t->unwind_info.cache)
			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
	}
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add_rcu(&work->list, &callbacks);
	work->func = func;
	return 0;
}

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	info->unwind_mask = 0;
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}
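
/*
 * Example (illustrative sketch): the registration lifecycle of a callback
 * user, reusing the hypothetical my_unwind_work and my_unwind_callback from
 * the sketches above. unwind_deferred_init() claims a bit in unwind_mask and
 * adds the descriptor to the callback list; unwind_deferred_cancel() must be
 * called before the descriptor or its callback go away.
 *
 *	static int __init my_tracer_init(void)
 *	{
 *		return unwind_deferred_init(&my_unwind_work,
 *					    my_unwind_callback);
 *	}
 *
 *	static void my_tracer_teardown(void)
 *	{
 *		unwind_deferred_cancel(&my_unwind_work);
 *	}
 */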