// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * For requesting a deferred user space stack trace from NMI context,
 * the architecture must support a safe cmpxchg in NMI context.
 * Architectures that do not have that cannot request a deferred user
 * space stack trace from an NMI context; if they do, the request
 * returns -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS	(UNWIND_PENDING | UNWIND_USED)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;
DEFINE_STATIC_SRCU(unwind_srcu);

static inline bool unwind_pending(struct unwind_task_info *info)
{
	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID.  In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * It is possible, though, for the stacktrace to get another cookie if another
 * request is made after the cookie was cleared and before reentering user
 * space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non-zero */

	return info->id.id;
}

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * the @trace will be loaded with the addresses of the user space stacktrace
 * if it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (!current->mm)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context. Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	/* Clear nr_entries on way back to user space */
	set_bit(UNWIND_USED_BIT, &info->unwind_mask);

	return 0;
}
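
/*
 * Illustrative sketch (not used anywhere in this file): a hypothetical
 * caller in a faultable context could consume the trace like this, with
 * pr_info() standing in for a real consumer:
 *
 *	struct unwind_stacktrace trace;
 *	int i;
 *
 *	if (!unwind_user_faultable(&trace)) {
 *		for (i = 0; i < trace.nr; i++)
 *			pr_info("user ip: %lx\n", trace.entries[i]);
 *	}
 */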

static void process_unwind_deferred(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear pending bit but make sure to have the current bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
					(atomic_long_t *)&info->unwind_mask);
	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	if (info->cache)
		bits &= ~(info->cache->unwind_completed);

	cookie = info->id.id;

	guard(srcu)(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu)) {
		if (test_bit(work->bit, &bits)) {
			work->func(work, &trace, cookie);
			if (info->cache)
				info->cache->unwind_completed |= BIT(work->bit);
		}
	}
}

static void unwind_deferred_task_work(struct callback_head *head)
{
	process_unwind_deferred(current);
}

void unwind_deferred_task_exit(struct task_struct *task)
{
	struct unwind_task_info *info = &current->unwind_info;

	if (!unwind_pending(info))
		return;

	process_unwind_deferred(task);

	task_work_cancel(task, &info->work);
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function. It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	unsigned long old, bits;
	unsigned long bit;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	/*
	 * Do not allow cancelled works to request again. A cancelled work
	 * has its bit set to -1, so compare the value as signed.
	 */
	bit = READ_ONCE(work->bit);
	if (WARN_ON_ONCE((long)bit < 0))
		return -EINVAL;

	/* Only need the mask now */
	bit = BIT(bit);

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = READ_ONCE(info->unwind_mask);

	/* Is this already queued or executed */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return old & bit;
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);

	if (WARN_ON_ONCE(ret))
		WRITE_ONCE(info->unwind_mask, 0);

	return ret;
}
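
/*
 * Illustrative sketch of a hypothetical caller of this API. The names
 * my_unwind_work, my_unwind_callback() and my_record_user_trace() are made
 * up for the example and do not exist in the kernel:
 *
 *	static struct unwind_work my_unwind_work;
 *
 *	static void my_unwind_callback(struct unwind_work *work,
 *				       struct unwind_stacktrace *trace,
 *				       u64 cookie)
 *	{
 *		// Runs from task work just before returning to user space.
 *		my_record_user_trace(trace, cookie);
 *	}
 *
 * The callback is registered once:
 *
 *	unwind_deferred_init(&my_unwind_work, my_unwind_callback);
 *
 * Then, from an event context (e.g. an interrupt or NMI while the task is
 * in the kernel), a deferred trace is requested:
 *
 *	u64 cookie;
 *
 *	unwind_deferred_request(&my_unwind_work, &cookie);
 *
 * The cookie can be recorded with the kernel-side event so that the user
 * stack trace handed to the callback can be stitched to it afterwards.
 * When the caller is done, unwind_deferred_cancel(&my_unwind_work) removes
 * the callback.
 */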

void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;
	int bit;

	if (!work)
		return;

	bit = work->bit;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del_rcu(&work->list);

	/* Do not allow any more requests and prevent callbacks */
	work->bit = -1;

	__clear_bit(bit, &unwind_mask);

	synchronize_srcu(&unwind_srcu);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		clear_bit(bit, &t->unwind_info.unwind_mask);
		if (t->unwind_info.cache)
			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
	}
}

int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add_rcu(&work->list, &callbacks);
	work->func = func;
	return 0;
}

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	info->unwind_mask = 0;
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}