xref: /linux/lib/stackdepot.c (revision bdce82e960d1205d118662f575cec39379984e34)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stack depot - a stack trace storage that avoids duplication.
4  *
5  * Internally, stack depot maintains a hash table of unique stacktraces. The
6  * stack traces themselves are stored contiguously one after another in a set
7  * of separate page allocations.
8  *
9  * Author: Alexander Potapenko <glider@google.com>
10  * Copyright (C) 2016 Google, Inc.
11  *
12  * Based on the code by Dmitry Chernenkov.
13  */
14 
15 #define pr_fmt(fmt) "stackdepot: " fmt
16 
17 #include <linux/debugfs.h>
18 #include <linux/gfp.h>
19 #include <linux/jhash.h>
20 #include <linux/kernel.h>
21 #include <linux/kmsan.h>
22 #include <linux/list.h>
23 #include <linux/mm.h>
24 #include <linux/mutex.h>
25 #include <linux/printk.h>
26 #include <linux/rculist.h>
27 #include <linux/rcupdate.h>
28 #include <linux/refcount.h>
29 #include <linux/slab.h>
30 #include <linux/spinlock.h>
31 #include <linux/stacktrace.h>
32 #include <linux/stackdepot.h>
33 #include <linux/string.h>
34 #include <linux/types.h>
35 #include <linux/memblock.h>
36 #include <linux/kasan-enabled.h>
37 
/* Total number of bits in a stack depot handle. */
#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
/* Size of a single pool in bytes. */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
/*
 * Log2 of the stack record alignment: record sizes are rounded up to
 * 1 << DEPOT_STACK_ALIGN bytes and the offset stored in a handle is the byte
 * offset within the pool shifted right by this amount.
 */
#define DEPOT_STACK_ALIGN 4
/* Number of handle bits used for the (shifted) offset within a pool. */
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
/* Handle bits left for the pool index after the offset and extra bits. */
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
			       STACK_DEPOT_EXTRA_BITS)
#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32
/*
 * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack
 * traces. As KMSAN does not support evicting stack traces from the stack
 * depot, the stack depot capacity might be reached quickly with large stack
 * records. Adjust the maximum number of stack depot pools for this case.
 */
#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16))
#else
#define DEPOT_POOLS_CAP 8192
#endif
/*
 * Maximum number of pools: the smaller of the number of pool indexes that
 * fit into DEPOT_POOL_INDEX_BITS and DEPOT_POOLS_CAP.
 */
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
60 
/*
 * Compact structure that stores a reference to a stack.
 *
 * The same storage can be viewed either as the opaque handle value or as its
 * three bitfields.
 */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;	/* Index into stack_pools[] */
		u32 offset	: DEPOT_OFFSET_BITS;	/* Byte offset in pool >> DEPOT_STACK_ALIGN */
		u32 extra	: STACK_DEPOT_EXTRA_BITS;	/* See stack_depot_{set,get}_extra_bits() */
	};
};
70 
/*
 * One stored stack trace. Records live inside the pools and are either linked
 * into a hash table bucket (in use) or into the freelist (unused).
 */
struct stack_record {
	struct list_head hash_list;	/* Links in the hash table */
	u32 hash;			/* Hash in hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;	/* Constant after initialization */
	refcount_t count;		/* Zero while the record is not valid */
	union {
		unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
		struct {
			/*
			 * An important invariant of the implementation is to
			 * only place a stack record onto the freelist iff its
			 * refcount is zero. Because stack records with a zero
			 * refcount are never considered as valid, it is safe to
			 * union @entries and freelist management state below.
			 * Conversely, as soon as an entry is off the freelist
			 * and its refcount becomes non-zero, the below must not
			 * be accessed until being placed back on the freelist.
			 */
			struct list_head free_list;	/* Links in the freelist */
			unsigned long rcu_state;	/* RCU cookie */
		};
	};
};
95 
/* Size of a stack record rounded up to the record alignment. */
#define DEPOT_STACK_RECORD_SIZE \
	ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)

/*
 * Set by the "stack_depot_disable" early parameter and when the hash table
 * allocation fails; the public entry points bail out early when it is set.
 */
static bool stack_depot_disabled;
/* Whether stack_depot_early_init() should allocate the hash table. */
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
/* Set once stack_depot_early_init() has run. */
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c
110 
/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool. This flag marks whether this extra pool
 * needs to be allocated. It is false when either an extra pool is already
 * kept in new_pool or the limit on the number of pools is reached.
 */
static bool new_pool_required = true;
/* The lock must be held when performing pool or freelist modifications. */
static DEFINE_RAW_SPINLOCK(pool_lock);
135 
/*
 * Statistics counters for debugfs.
 *
 * The counters are updated with pool_lock held and are read without the lock
 * by stats_show().
 */
enum depot_counter_id {
	DEPOT_COUNTER_ALLOCS,		/* Records handed out by depot_alloc_stack() */
	DEPOT_COUNTER_FREES,		/* Records returned by depot_free_stack() */
	DEPOT_COUNTER_INUSE,		/* Allocated and not yet freed records */
	DEPOT_COUNTER_FREELIST_SIZE,	/* Records currently on free_stacks */
	DEPOT_COUNTER_COUNT,		/* Number of counters; must stay last */
};
static long counters[DEPOT_COUNTER_COUNT];
/* Names printed by stats_show(), indexed by enum depot_counter_id. */
static const char *const counter_names[] = {
	[DEPOT_COUNTER_ALLOCS]		= "allocations",
	[DEPOT_COUNTER_FREES]		= "frees",
	[DEPOT_COUNTER_INUSE]		= "in_use",
	[DEPOT_COUNTER_FREELIST_SIZE]	= "freelist_size",
};
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
152 
153 static int __init disable_stack_depot(char *str)
154 {
155 	return kstrtobool(str, &stack_depot_disabled);
156 }
157 early_param("stack_depot_disable", disable_stack_depot);
158 
159 void __init stack_depot_request_early_init(void)
160 {
161 	/* Too late to request early init now. */
162 	WARN_ON(__stack_depot_early_init_passed);
163 
164 	__stack_depot_early_init_requested = true;
165 }
166 
167 /* Initialize list_head's within the hash table. */
168 static void init_stack_table(unsigned long entries)
169 {
170 	unsigned long i;
171 
172 	for (i = 0; i < entries; i++)
173 		INIT_LIST_HEAD(&stack_table[i]);
174 }
175 
176 /* Allocates a hash table via memblock. Can only be used during early boot. */
177 int __init stack_depot_early_init(void)
178 {
179 	unsigned long entries = 0;
180 
181 	/* This function must be called only once, from mm_init(). */
182 	if (WARN_ON(__stack_depot_early_init_passed))
183 		return 0;
184 	__stack_depot_early_init_passed = true;
185 
186 	/*
187 	 * Print disabled message even if early init has not been requested:
188 	 * stack_depot_init() will not print one.
189 	 */
190 	if (stack_depot_disabled) {
191 		pr_info("disabled\n");
192 		return 0;
193 	}
194 
195 	/*
196 	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
197 	 * in fuzzing scenarios, which leads to a large number of different
198 	 * stack traces being stored in stack depot.
199 	 */
200 	if (kasan_enabled() && !stack_bucket_number_order)
201 		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;
202 
203 	/*
204 	 * Check if early init has been requested after setting
205 	 * stack_bucket_number_order: stack_depot_init() uses its value.
206 	 */
207 	if (!__stack_depot_early_init_requested)
208 		return 0;
209 
210 	/*
211 	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
212 	 * on the automatic calculations performed by alloc_large_system_hash().
213 	 */
214 	if (stack_bucket_number_order)
215 		entries = 1UL << stack_bucket_number_order;
216 	pr_info("allocating hash table via alloc_large_system_hash\n");
217 	stack_table = alloc_large_system_hash("stackdepot",
218 						sizeof(struct list_head),
219 						entries,
220 						STACK_HASH_TABLE_SCALE,
221 						HASH_EARLY,
222 						NULL,
223 						&stack_hash_mask,
224 						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
225 						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
226 	if (!stack_table) {
227 		pr_err("hash table allocation failed, disabling\n");
228 		stack_depot_disabled = true;
229 		return -ENOMEM;
230 	}
231 	if (!entries) {
232 		/*
233 		 * Obtain the number of entries that was calculated by
234 		 * alloc_large_system_hash().
235 		 */
236 		entries = stack_hash_mask + 1;
237 	}
238 	init_stack_table(entries);
239 
240 	return 0;
241 }
242 
243 /* Allocates a hash table via kvcalloc. Can be used after boot. */
244 int stack_depot_init(void)
245 {
246 	static DEFINE_MUTEX(stack_depot_init_mutex);
247 	unsigned long entries;
248 	int ret = 0;
249 
250 	mutex_lock(&stack_depot_init_mutex);
251 
252 	if (stack_depot_disabled || stack_table)
253 		goto out_unlock;
254 
255 	/*
256 	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
257 	 * if assigned, and rely on automatic scaling otherwise.
258 	 */
259 	if (stack_bucket_number_order) {
260 		entries = 1UL << stack_bucket_number_order;
261 	} else {
262 		int scale = STACK_HASH_TABLE_SCALE;
263 
264 		entries = nr_free_buffer_pages();
265 		entries = roundup_pow_of_two(entries);
266 
267 		if (scale > PAGE_SHIFT)
268 			entries >>= (scale - PAGE_SHIFT);
269 		else
270 			entries <<= (PAGE_SHIFT - scale);
271 	}
272 
273 	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
274 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
275 	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
276 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;
277 
278 	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
279 	stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
280 	if (!stack_table) {
281 		pr_err("hash table allocation failed, disabling\n");
282 		stack_depot_disabled = true;
283 		ret = -ENOMEM;
284 		goto out_unlock;
285 	}
286 	stack_hash_mask = entries - 1;
287 	init_stack_table(entries);
288 
289 out_unlock:
290 	mutex_unlock(&stack_depot_init_mutex);
291 
292 	return ret;
293 }
294 EXPORT_SYMBOL_GPL(stack_depot_init);
295 
296 /*
297  * Initializes new stack depot @pool, release all its entries to the freelist,
298  * and update the list of pools.
299  */
300 static void depot_init_pool(void *pool)
301 {
302 	int offset;
303 
304 	lockdep_assert_held(&pool_lock);
305 
306 	/* Initialize handles and link stack records into the freelist. */
307 	for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
308 	     offset += DEPOT_STACK_RECORD_SIZE) {
309 		struct stack_record *stack = pool + offset;
310 
311 		stack->handle.pool_index = pools_num;
312 		stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
313 		stack->handle.extra = 0;
314 
315 		/*
316 		 * Stack traces of size 0 are never saved, and we can simply use
317 		 * the size field as an indicator if this is a new unused stack
318 		 * record in the freelist.
319 		 */
320 		stack->size = 0;
321 
322 		INIT_LIST_HEAD(&stack->hash_list);
323 		/*
324 		 * Add to the freelist front to prioritize never-used entries:
325 		 * required in case there are entries in the freelist, but their
326 		 * RCU cookie still belongs to the current RCU grace period
327 		 * (there can still be concurrent readers).
328 		 */
329 		list_add(&stack->free_list, &free_stacks);
330 		counters[DEPOT_COUNTER_FREELIST_SIZE]++;
331 	}
332 
333 	/* Save reference to the pool to be used by depot_fetch_stack(). */
334 	stack_pools[pools_num] = pool;
335 
336 	/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
337 	WRITE_ONCE(pools_num, pools_num + 1);
338 	ASSERT_EXCLUSIVE_WRITER(pools_num);
339 }
340 
341 /* Keeps the preallocated memory to be used for a new stack depot pool. */
342 static void depot_keep_new_pool(void **prealloc)
343 {
344 	lockdep_assert_held(&pool_lock);
345 
346 	/*
347 	 * If a new pool is already saved or the maximum number of
348 	 * pools is reached, do not use the preallocated memory.
349 	 */
350 	if (!new_pool_required)
351 		return;
352 
353 	/*
354 	 * Use the preallocated memory for the new pool
355 	 * as long as we do not exceed the maximum number of pools.
356 	 */
357 	if (pools_num < DEPOT_MAX_POOLS) {
358 		new_pool = *prealloc;
359 		*prealloc = NULL;
360 	}
361 
362 	/*
363 	 * At this point, either a new pool is kept or the maximum
364 	 * number of pools is reached. In either case, take note that
365 	 * keeping another pool is not required.
366 	 */
367 	WRITE_ONCE(new_pool_required, false);
368 }
369 
370 /*
371  * Try to initialize a new stack depot pool from either a previous or the
372  * current pre-allocation, and release all its entries to the freelist.
373  */
374 static bool depot_try_init_pool(void **prealloc)
375 {
376 	lockdep_assert_held(&pool_lock);
377 
378 	/* Check if we have a new pool saved and use it. */
379 	if (new_pool) {
380 		depot_init_pool(new_pool);
381 		new_pool = NULL;
382 
383 		/* Take note that we might need a new new_pool. */
384 		if (pools_num < DEPOT_MAX_POOLS)
385 			WRITE_ONCE(new_pool_required, true);
386 
387 		return true;
388 	}
389 
390 	/* Bail out if we reached the pool limit. */
391 	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
392 		WARN_ONCE(1, "Stack depot reached limit capacity");
393 		return false;
394 	}
395 
396 	/* Check if we have preallocated memory and use it. */
397 	if (*prealloc) {
398 		depot_init_pool(*prealloc);
399 		*prealloc = NULL;
400 		return true;
401 	}
402 
403 	return false;
404 }
405 
406 /* Try to find next free usable entry. */
407 static struct stack_record *depot_pop_free(void)
408 {
409 	struct stack_record *stack;
410 
411 	lockdep_assert_held(&pool_lock);
412 
413 	if (list_empty(&free_stacks))
414 		return NULL;
415 
416 	/*
417 	 * We maintain the invariant that the elements in front are least
418 	 * recently used, and are therefore more likely to be associated with an
419 	 * RCU grace period in the past. Consequently it is sufficient to only
420 	 * check the first entry.
421 	 */
422 	stack = list_first_entry(&free_stacks, struct stack_record, free_list);
423 	if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
424 		return NULL;
425 
426 	list_del(&stack->free_list);
427 	counters[DEPOT_COUNTER_FREELIST_SIZE]--;
428 
429 	return stack;
430 }
431 
432 /* Allocates a new stack in a stack depot pool. */
433 static struct stack_record *
434 depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
435 {
436 	struct stack_record *stack;
437 
438 	lockdep_assert_held(&pool_lock);
439 
440 	/* This should already be checked by public API entry points. */
441 	if (WARN_ON_ONCE(!size))
442 		return NULL;
443 
444 	/* Check if we have a stack record to save the stack trace. */
445 	stack = depot_pop_free();
446 	if (!stack) {
447 		/* No usable entries on the freelist - try to refill the freelist. */
448 		if (!depot_try_init_pool(prealloc))
449 			return NULL;
450 		stack = depot_pop_free();
451 		if (WARN_ON(!stack))
452 			return NULL;
453 	}
454 
455 	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
456 	if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
457 		size = CONFIG_STACKDEPOT_MAX_FRAMES;
458 
459 	/* Save the stack trace. */
460 	stack->hash = hash;
461 	stack->size = size;
462 	/* stack->handle is already filled in by depot_init_pool(). */
463 	refcount_set(&stack->count, 1);
464 	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
465 
466 	/*
467 	 * Let KMSAN know the stored stack record is initialized. This shall
468 	 * prevent false positive reports if instrumented code accesses it.
469 	 */
470 	kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
471 
472 	counters[DEPOT_COUNTER_ALLOCS]++;
473 	counters[DEPOT_COUNTER_INUSE]++;
474 	return stack;
475 }
476 
477 static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
478 {
479 	const int pools_num_cached = READ_ONCE(pools_num);
480 	union handle_parts parts = { .handle = handle };
481 	void *pool;
482 	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
483 	struct stack_record *stack;
484 
485 	lockdep_assert_not_held(&pool_lock);
486 
487 	if (parts.pool_index > pools_num_cached) {
488 		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
489 		     parts.pool_index, pools_num_cached, handle);
490 		return NULL;
491 	}
492 
493 	pool = stack_pools[parts.pool_index];
494 	if (WARN_ON(!pool))
495 		return NULL;
496 
497 	stack = pool + offset;
498 	if (WARN_ON(!refcount_read(&stack->count)))
499 		return NULL;
500 
501 	return stack;
502 }
503 
504 /* Links stack into the freelist. */
505 static void depot_free_stack(struct stack_record *stack)
506 {
507 	unsigned long flags;
508 
509 	lockdep_assert_not_held(&pool_lock);
510 
511 	raw_spin_lock_irqsave(&pool_lock, flags);
512 	printk_deferred_enter();
513 
514 	/*
515 	 * Remove the entry from the hash list. Concurrent list traversal may
516 	 * still observe the entry, but since the refcount is zero, this entry
517 	 * will no longer be considered as valid.
518 	 */
519 	list_del_rcu(&stack->hash_list);
520 
521 	/*
522 	 * Due to being used from constrained contexts such as the allocators,
523 	 * NMI, or even RCU itself, stack depot cannot rely on primitives that
524 	 * would sleep (such as synchronize_rcu()) or recursively call into
525 	 * stack depot again (such as call_rcu()).
526 	 *
527 	 * Instead, get an RCU cookie, so that we can ensure this entry isn't
528 	 * moved onto another list until the next grace period, and concurrent
529 	 * RCU list traversal remains safe.
530 	 */
531 	stack->rcu_state = get_state_synchronize_rcu();
532 
533 	/*
534 	 * Add the entry to the freelist tail, so that older entries are
535 	 * considered first - their RCU cookie is more likely to no longer be
536 	 * associated with the current grace period.
537 	 */
538 	list_add_tail(&stack->free_list, &free_stacks);
539 
540 	counters[DEPOT_COUNTER_FREELIST_SIZE]++;
541 	counters[DEPOT_COUNTER_FREES]++;
542 	counters[DEPOT_COUNTER_INUSE]--;
543 
544 	printk_deferred_exit();
545 	raw_spin_unlock_irqrestore(&pool_lock, flags);
546 }
547 
548 /* Calculates the hash for a stack. */
549 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
550 {
551 	return jhash2((u32 *)entries,
552 		      array_size(size,  sizeof(*entries)) / sizeof(u32),
553 		      STACK_HASH_SEED);
554 }
555 
/*
 * Non-instrumented stand-in for memcmp() that only tests for equality.
 *
 * Compares the first @n elements of @u1 and @u2 and returns 0 if all of them
 * are equal and 1 otherwise; no lexicographical ordering is reported.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (u1[i] != u2[i])
			return 1;
	}
	return 0;
}
570 
571 /* Finds a stack in a bucket of the hash table. */
572 static inline struct stack_record *find_stack(struct list_head *bucket,
573 					      unsigned long *entries, int size,
574 					      u32 hash, depot_flags_t flags)
575 {
576 	struct stack_record *stack, *ret = NULL;
577 
578 	/*
579 	 * Stack depot may be used from instrumentation that instruments RCU or
580 	 * tracing itself; use variant that does not call into RCU and cannot be
581 	 * traced.
582 	 *
583 	 * Note: Such use cases must take care when using refcounting to evict
584 	 * unused entries, because the stack record free-then-reuse code paths
585 	 * do call into RCU.
586 	 */
587 	rcu_read_lock_sched_notrace();
588 
589 	list_for_each_entry_rcu(stack, bucket, hash_list) {
590 		if (stack->hash != hash || stack->size != size)
591 			continue;
592 
593 		/*
594 		 * This may race with depot_free_stack() accessing the freelist
595 		 * management state unioned with @entries. The refcount is zero
596 		 * in that case and the below refcount_inc_not_zero() will fail.
597 		 */
598 		if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
599 			continue;
600 
601 		/*
602 		 * Try to increment refcount. If this succeeds, the stack record
603 		 * is valid and has not yet been freed.
604 		 *
605 		 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
606 		 * to then call stack_depot_put() later, and we can assume that
607 		 * a stack record is never placed back on the freelist.
608 		 */
609 		if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
610 			continue;
611 
612 		ret = stack;
613 		break;
614 	}
615 
616 	rcu_read_unlock_sched_notrace();
617 
618 	return ret;
619 }
620 
621 depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
622 					    unsigned int nr_entries,
623 					    gfp_t alloc_flags,
624 					    depot_flags_t depot_flags)
625 {
626 	struct list_head *bucket;
627 	struct stack_record *found = NULL;
628 	depot_stack_handle_t handle = 0;
629 	struct page *page = NULL;
630 	void *prealloc = NULL;
631 	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
632 	unsigned long flags;
633 	u32 hash;
634 
635 	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
636 		return 0;
637 
638 	/*
639 	 * If this stack trace is from an interrupt, including anything before
640 	 * interrupt entry usually leads to unbounded stack depot growth.
641 	 *
642 	 * Since use of filter_irq_stacks() is a requirement to ensure stack
643 	 * depot can efficiently deduplicate interrupt stacks, always
644 	 * filter_irq_stacks() to simplify all callers' use of stack depot.
645 	 */
646 	nr_entries = filter_irq_stacks(entries, nr_entries);
647 
648 	if (unlikely(nr_entries == 0) || stack_depot_disabled)
649 		return 0;
650 
651 	hash = hash_stack(entries, nr_entries);
652 	bucket = &stack_table[hash & stack_hash_mask];
653 
654 	/* Fast path: look the stack trace up without locking. */
655 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
656 	if (found)
657 		goto exit;
658 
659 	/*
660 	 * Allocate memory for a new pool if required now:
661 	 * we won't be able to do that under the lock.
662 	 */
663 	if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
664 		/*
665 		 * Zero out zone modifiers, as we don't have specific zone
666 		 * requirements. Keep the flags related to allocation in atomic
667 		 * contexts and I/O.
668 		 */
669 		alloc_flags &= ~GFP_ZONEMASK;
670 		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
671 		alloc_flags |= __GFP_NOWARN;
672 		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
673 		if (page)
674 			prealloc = page_address(page);
675 	}
676 
677 	raw_spin_lock_irqsave(&pool_lock, flags);
678 	printk_deferred_enter();
679 
680 	/* Try to find again, to avoid concurrently inserting duplicates. */
681 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
682 	if (!found) {
683 		struct stack_record *new =
684 			depot_alloc_stack(entries, nr_entries, hash, &prealloc);
685 
686 		if (new) {
687 			/*
688 			 * This releases the stack record into the bucket and
689 			 * makes it visible to readers in find_stack().
690 			 */
691 			list_add_rcu(&new->hash_list, bucket);
692 			found = new;
693 		}
694 	}
695 
696 	if (prealloc) {
697 		/*
698 		 * Either stack depot already contains this stack trace, or
699 		 * depot_alloc_stack() did not consume the preallocated memory.
700 		 * Try to keep the preallocated memory for future.
701 		 */
702 		depot_keep_new_pool(&prealloc);
703 	}
704 
705 	printk_deferred_exit();
706 	raw_spin_unlock_irqrestore(&pool_lock, flags);
707 exit:
708 	if (prealloc) {
709 		/* Stack depot didn't use this memory, free it. */
710 		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
711 	}
712 	if (found)
713 		handle = found->handle.handle;
714 	return handle;
715 }
716 EXPORT_SYMBOL_GPL(stack_depot_save_flags);
717 
718 depot_stack_handle_t stack_depot_save(unsigned long *entries,
719 				      unsigned int nr_entries,
720 				      gfp_t alloc_flags)
721 {
722 	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
723 				      STACK_DEPOT_FLAG_CAN_ALLOC);
724 }
725 EXPORT_SYMBOL_GPL(stack_depot_save);
726 
727 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
728 			       unsigned long **entries)
729 {
730 	struct stack_record *stack;
731 
732 	*entries = NULL;
733 	/*
734 	 * Let KMSAN know *entries is initialized. This shall prevent false
735 	 * positive reports if instrumented code accesses it.
736 	 */
737 	kmsan_unpoison_memory(entries, sizeof(*entries));
738 
739 	if (!handle || stack_depot_disabled)
740 		return 0;
741 
742 	stack = depot_fetch_stack(handle);
743 	/*
744 	 * Should never be NULL, otherwise this is a use-after-put (or just a
745 	 * corrupt handle).
746 	 */
747 	if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
748 		return 0;
749 
750 	*entries = stack->entries;
751 	return stack->size;
752 }
753 EXPORT_SYMBOL_GPL(stack_depot_fetch);
754 
755 void stack_depot_put(depot_stack_handle_t handle)
756 {
757 	struct stack_record *stack;
758 
759 	if (!handle || stack_depot_disabled)
760 		return;
761 
762 	stack = depot_fetch_stack(handle);
763 	/*
764 	 * Should always be able to find the stack record, otherwise this is an
765 	 * unbalanced put attempt (or corrupt handle).
766 	 */
767 	if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
768 		return;
769 
770 	if (refcount_dec_and_test(&stack->count))
771 		depot_free_stack(stack);
772 }
773 EXPORT_SYMBOL_GPL(stack_depot_put);
774 
775 void stack_depot_print(depot_stack_handle_t stack)
776 {
777 	unsigned long *entries;
778 	unsigned int nr_entries;
779 
780 	nr_entries = stack_depot_fetch(stack, &entries);
781 	if (nr_entries > 0)
782 		stack_trace_print(entries, nr_entries, 0);
783 }
784 EXPORT_SYMBOL_GPL(stack_depot_print);
785 
786 int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
787 		       int spaces)
788 {
789 	unsigned long *entries;
790 	unsigned int nr_entries;
791 
792 	nr_entries = stack_depot_fetch(handle, &entries);
793 	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
794 						spaces) : 0;
795 }
796 EXPORT_SYMBOL_GPL(stack_depot_snprint);
797 
798 depot_stack_handle_t __must_check stack_depot_set_extra_bits(
799 			depot_stack_handle_t handle, unsigned int extra_bits)
800 {
801 	union handle_parts parts = { .handle = handle };
802 
803 	/* Don't set extra bits on empty handles. */
804 	if (!handle)
805 		return 0;
806 
807 	parts.extra = extra_bits;
808 	return parts.handle;
809 }
810 EXPORT_SYMBOL(stack_depot_set_extra_bits);
811 
812 unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
813 {
814 	union handle_parts parts = { .handle = handle };
815 
816 	return parts.extra;
817 }
818 EXPORT_SYMBOL(stack_depot_get_extra_bits);
819 
820 static int stats_show(struct seq_file *seq, void *v)
821 {
822 	/*
823 	 * data race ok: These are just statistics counters, and approximate
824 	 * statistics are ok for debugging.
825 	 */
826 	seq_printf(seq, "pools: %d\n", data_race(pools_num));
827 	for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
828 		seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
829 
830 	return 0;
831 }
832 DEFINE_SHOW_ATTRIBUTE(stats);
833 
834 static int depot_debugfs_init(void)
835 {
836 	struct dentry *dir;
837 
838 	if (stack_depot_disabled)
839 		return 0;
840 
841 	dir = debugfs_create_dir("stackdepot", NULL);
842 	debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
843 	return 0;
844 }
845 late_initcall(depot_debugfs_init);
846