// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context. Architectures
 * that do not have that cannot request a deferred user space stack trace
 * from NMI context; if such a request is made anyway, it fails with -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to and reading the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID. In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * The stacktrace may, however, get another cookie if a new request is made
 * after the cookie was cleared and before the task reenters user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non-zero */

	return info->id.id;
}
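/*
 * Illustrative note (an inference from how the fields are used here, not
 * copied from the header where the type is actually declared): get_cookie()
 * above treats info->id as a u64 cookie overlaid with two u32 halves, so
 * union unwind_task_id is presumably laid out along these lines:
 *
 *	union unwind_task_id {
 *		struct {
 *			u32	cpu;
 *			u32	cnt;
 *		};
 *		u64	id;
 *	};
 *
 * With such a layout the cookie can never be zero: get_cookie() keeps the
 * LSB of cnt set and stores a non-zero value (smp_processor_id() + 1) in cpu.
 */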
/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * @trace will be loaded with the addresses of the user space stacktrace if
 * it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success, @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (current->flags & PF_EXITING)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context. Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	return 0;
}

static void unwind_deferred_task_work(struct callback_head *head)
{
	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	u64 cookie;

	if (WARN_ON_ONCE(!info->pending))
		return;

	/* Allow work to come in again */
	WRITE_ONCE(info->pending, 0);

	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	cookie = info->id.id;

	guard(mutex)(&callback_mutex);
	list_for_each_entry(work, &callbacks, list) {
		work->func(work, &trace, cookie);
	}
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The @cookie output is the generated cookie of the very first request for
 * a user space stacktrace for this task since it entered the kernel. It can
 * be from a request by any caller of this infrastructure. Its value will
 * also be passed to the callback function and can be used to stitch kernel
 * and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 1 if the callback was already queued.
 *         0 if the callback was successfully queued.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	long pending;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	guard(irqsave)();

	*cookie = get_cookie(info);

	/* callback already pending? */
	pending = READ_ONCE(info->pending);
	if (pending)
		return 1;

	/* Claim the work unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg(&info->pending, &pending, 1))
		return 1;

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);
	if (WARN_ON_ONCE(ret)) {
		WRITE_ONCE(info->pending, 0);
		return ret;
	}

	return 0;
}
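/*
 * Illustrative usage sketch (not part of this file): a hypothetical tracer
 * requests a deferred user stacktrace from its event hook and consumes it
 * in its callback. The names my_unwind_work, my_tracer_unwind_cb(),
 * my_tracer_sample() and record_user_trace() are made up for this sketch;
 * the callback signature matches how unwind_deferred_task_work() invokes
 * work->func() above.
 *
 *	static struct unwind_work my_unwind_work;
 *
 *	// Runs from task work just before the task returns to user space.
 *	static void my_tracer_unwind_cb(struct unwind_work *work,
 *					struct unwind_stacktrace *trace,
 *					u64 cookie)
 *	{
 *		// Emit the user stacktrace keyed by cookie, so it can be
 *		// stitched to kernel samples recorded earlier.
 *		record_user_trace(trace->entries, trace->nr, cookie);
 *	}
 *
 *	// Runs from the tracer's event hook (interrupt context is fine,
 *	// NMI only if the architecture has an NMI-safe cmpxchg).
 *	static void my_tracer_sample(void)
 *	{
 *		u64 cookie;
 *		int ret;
 *
 *		ret = unwind_deferred_request(&my_unwind_work, &cookie);
 *		if (ret < 0)
 *			return;	// kthread, exiting task, NMI without safe cmpxchg
 *
 *		// Tag the kernel-side sample with cookie now; the matching
 *		// user stacktrace arrives later via my_tracer_unwind_cb().
 *	}
 */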
void unwind_deferred_cancel(struct unwind_work *work)
{
	if (!work)
		return;

	guard(mutex)(&callback_mutex);
	list_del(&work->list);
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);
	list_add(&work->list, &callbacks);
	work->func = func;
	return 0;
}

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}
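/*
 * Continuing the illustrative sketch above: the hypothetical tracer pairs
 * unwind_deferred_init() at setup time with unwind_deferred_cancel() at
 * teardown time, so its callback is removed from the callbacks list before
 * its struct unwind_work goes away.
 *
 *	static int my_tracer_setup(void)
 *	{
 *		return unwind_deferred_init(&my_unwind_work,
 *					    my_tracer_unwind_cb);
 *	}
 *
 *	static void my_tracer_teardown(void)
 *	{
 *		unwind_deferred_cancel(&my_unwind_work);
 *	}
 */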