xref: /linux/lib/stackdepot.c (revision f6e0a4984c2e7244689ea87b62b433bed9d07e94)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stack depot - a stack trace storage that avoids duplication.
4  *
5  * Internally, stack depot maintains a hash table of unique stacktraces. The
6  * stack traces themselves are stored contiguously one after another in a set
7  * of separate page allocations.
8  *
9  * Author: Alexander Potapenko <glider@google.com>
10  * Copyright (C) 2016 Google, Inc.
11  *
12  * Based on the code by Dmitry Chernenkov.
13  */
14 
15 #define pr_fmt(fmt) "stackdepot: " fmt
16 
17 #include <linux/debugfs.h>
18 #include <linux/gfp.h>
19 #include <linux/jhash.h>
20 #include <linux/kernel.h>
21 #include <linux/kmsan.h>
22 #include <linux/list.h>
23 #include <linux/mm.h>
24 #include <linux/mutex.h>
25 #include <linux/poison.h>
26 #include <linux/printk.h>
27 #include <linux/rculist.h>
28 #include <linux/rcupdate.h>
29 #include <linux/refcount.h>
30 #include <linux/slab.h>
31 #include <linux/spinlock.h>
32 #include <linux/stacktrace.h>
33 #include <linux/stackdepot.h>
34 #include <linux/string.h>
35 #include <linux/types.h>
36 #include <linux/memblock.h>
37 #include <linux/kasan-enabled.h>
38 
/* Width of a stack handle, in bits. */
#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
/* Size of one pool, in bytes. */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
/* log2 of the record alignment: records start on (1 << 4)-byte boundaries. */
#define DEPOT_STACK_ALIGN 4
/* Handle bits needed to address any aligned record start inside one pool. */
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
/* Handle bits left for the pool index once offset and extra bits are taken. */
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
			       STACK_DEPOT_EXTRA_BITS)
/* Hard upper bound on the number of pools, whatever the index width allows. */
#define DEPOT_POOLS_CAP 8192
/* Number of pools: the smaller of what the index can encode and the cap. */
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
51 
/*
 * Compact structure that stores a reference to a stack.
 *
 * @handle is the value handed out to users; the anonymous bitfield struct
 * overlays it so that the individual components can be read and written.
 */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		/* Index of the owning pool in stack_pools[]. */
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		/* Byte offset of the record in its pool >> DEPOT_STACK_ALIGN. */
		u32 offset	: DEPOT_OFFSET_BITS;
		/* Bits managed via stack_depot_{set,get}_extra_bits(). */
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};
61 
/*
 * One stored stack trace. Records live inside the pools referenced by
 * stack_pools[] and are linked into a hash bucket while in use. Records
 * allocated without STACK_DEPOT_FLAG_GET are sized for the number of frames
 * actually saved rather than for the full @entries array (see
 * depot_stack_record_size()).
 */
struct stack_record {
	struct list_head hash_list;	/* Links in the hash table */
	u32 hash;			/* Hash in hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;	/* Constant after initialization */
	/*
	 * Reference count: set to 1 for records saved with
	 * STACK_DEPOT_FLAG_GET, and to REFCOUNT_SATURATED otherwise.
	 */
	refcount_t count;
	union {
		unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
		struct {
			/*
			 * An important invariant of the implementation is to
			 * only place a stack record onto the freelist iff its
			 * refcount is zero. Because stack records with a zero
			 * refcount are never considered as valid, it is safe to
			 * union @entries and freelist management state below.
			 * Conversely, as soon as an entry is off the freelist
			 * and its refcount becomes non-zero, the below must not
			 * be accessed until being placed back on the freelist.
			 */
			struct list_head free_list;	/* Links in the freelist */
			unsigned long rcu_state;	/* RCU cookie */
		};
	};
};
86 
/*
 * Set by the "stack_depot_disable" boot parameter or after a failed hash
 * table allocation; the public entry points bail out early when it is set.
 */
static bool stack_depot_disabled;
/* Whether stack_depot_early_init() should allocate the hash table. */
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
/* Set once stack_depot_early_init() has run. */
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/*
 * Offset to the unused space in the currently used pool. Starts out at
 * DEPOT_POOL_SIZE so that the very first allocation has to set up a pool.
 */
static size_t pool_offset = DEPOT_POOL_SIZE;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/* The lock must be held when performing pool or freelist modifications. */
static DEFINE_RAW_SPINLOCK(pool_lock);
118 
/*
 * Statistics counters for debugfs.
 *
 * The counters are updated under pool_lock and read locklessly by
 * stats_show(), which tolerates the resulting data race.
 */
enum depot_counter_id {
	DEPOT_COUNTER_REFD_ALLOCS,	/* Records saved with STACK_DEPOT_FLAG_GET */
	DEPOT_COUNTER_REFD_FREES,	/* Records returned to the freelist */
	DEPOT_COUNTER_REFD_INUSE,	/* Refcounted records currently live */
	DEPOT_COUNTER_FREELIST_SIZE,	/* Records currently on free_stacks */
	DEPOT_COUNTER_PERSIST_COUNT,	/* Records saved without STACK_DEPOT_FLAG_GET */
	DEPOT_COUNTER_PERSIST_BYTES,	/* Pool bytes used by those records */
	DEPOT_COUNTER_COUNT,		/* Number of counters; keep last */
};
static long counters[DEPOT_COUNTER_COUNT];
/* Names printed by stats_show(), indexed by enum depot_counter_id. */
static const char *const counter_names[] = {
	[DEPOT_COUNTER_REFD_ALLOCS]	= "refcounted_allocations",
	[DEPOT_COUNTER_REFD_FREES]	= "refcounted_frees",
	[DEPOT_COUNTER_REFD_INUSE]	= "refcounted_in_use",
	[DEPOT_COUNTER_FREELIST_SIZE]	= "freelist_size",
	[DEPOT_COUNTER_PERSIST_COUNT]	= "persistent_count",
	[DEPOT_COUNTER_PERSIST_BYTES]	= "persistent_bytes",
};
/* Catch a counter that was added to the enum but not given a name. */
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
139 
140 static int __init disable_stack_depot(char *str)
141 {
142 	return kstrtobool(str, &stack_depot_disabled);
143 }
144 early_param("stack_depot_disable", disable_stack_depot);
145 
146 void __init stack_depot_request_early_init(void)
147 {
148 	/* Too late to request early init now. */
149 	WARN_ON(__stack_depot_early_init_passed);
150 
151 	__stack_depot_early_init_requested = true;
152 }
153 
154 /* Initialize list_head's within the hash table. */
155 static void init_stack_table(unsigned long entries)
156 {
157 	unsigned long i;
158 
159 	for (i = 0; i < entries; i++)
160 		INIT_LIST_HEAD(&stack_table[i]);
161 }
162 
163 /* Allocates a hash table via memblock. Can only be used during early boot. */
164 int __init stack_depot_early_init(void)
165 {
166 	unsigned long entries = 0;
167 
168 	/* This function must be called only once, from mm_init(). */
169 	if (WARN_ON(__stack_depot_early_init_passed))
170 		return 0;
171 	__stack_depot_early_init_passed = true;
172 
173 	/*
174 	 * Print disabled message even if early init has not been requested:
175 	 * stack_depot_init() will not print one.
176 	 */
177 	if (stack_depot_disabled) {
178 		pr_info("disabled\n");
179 		return 0;
180 	}
181 
182 	/*
183 	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
184 	 * in fuzzing scenarios, which leads to a large number of different
185 	 * stack traces being stored in stack depot.
186 	 */
187 	if (kasan_enabled() && !stack_bucket_number_order)
188 		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;
189 
190 	/*
191 	 * Check if early init has been requested after setting
192 	 * stack_bucket_number_order: stack_depot_init() uses its value.
193 	 */
194 	if (!__stack_depot_early_init_requested)
195 		return 0;
196 
197 	/*
198 	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
199 	 * on the automatic calculations performed by alloc_large_system_hash().
200 	 */
201 	if (stack_bucket_number_order)
202 		entries = 1UL << stack_bucket_number_order;
203 	pr_info("allocating hash table via alloc_large_system_hash\n");
204 	stack_table = alloc_large_system_hash("stackdepot",
205 						sizeof(struct list_head),
206 						entries,
207 						STACK_HASH_TABLE_SCALE,
208 						HASH_EARLY,
209 						NULL,
210 						&stack_hash_mask,
211 						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
212 						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
213 	if (!stack_table) {
214 		pr_err("hash table allocation failed, disabling\n");
215 		stack_depot_disabled = true;
216 		return -ENOMEM;
217 	}
218 	if (!entries) {
219 		/*
220 		 * Obtain the number of entries that was calculated by
221 		 * alloc_large_system_hash().
222 		 */
223 		entries = stack_hash_mask + 1;
224 	}
225 	init_stack_table(entries);
226 
227 	return 0;
228 }
229 
230 /* Allocates a hash table via kvcalloc. Can be used after boot. */
231 int stack_depot_init(void)
232 {
233 	static DEFINE_MUTEX(stack_depot_init_mutex);
234 	unsigned long entries;
235 	int ret = 0;
236 
237 	mutex_lock(&stack_depot_init_mutex);
238 
239 	if (stack_depot_disabled || stack_table)
240 		goto out_unlock;
241 
242 	/*
243 	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
244 	 * if assigned, and rely on automatic scaling otherwise.
245 	 */
246 	if (stack_bucket_number_order) {
247 		entries = 1UL << stack_bucket_number_order;
248 	} else {
249 		int scale = STACK_HASH_TABLE_SCALE;
250 
251 		entries = nr_free_buffer_pages();
252 		entries = roundup_pow_of_two(entries);
253 
254 		if (scale > PAGE_SHIFT)
255 			entries >>= (scale - PAGE_SHIFT);
256 		else
257 			entries <<= (PAGE_SHIFT - scale);
258 	}
259 
260 	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
261 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
262 	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
263 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;
264 
265 	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
266 	stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
267 	if (!stack_table) {
268 		pr_err("hash table allocation failed, disabling\n");
269 		stack_depot_disabled = true;
270 		ret = -ENOMEM;
271 		goto out_unlock;
272 	}
273 	stack_hash_mask = entries - 1;
274 	init_stack_table(entries);
275 
276 out_unlock:
277 	mutex_unlock(&stack_depot_init_mutex);
278 
279 	return ret;
280 }
281 EXPORT_SYMBOL_GPL(stack_depot_init);
282 
/*
 * Initializes new stack pool, and updates the list of pools.
 *
 * The pool memory is taken from new_pool, which is first filled from *prealloc
 * (consuming it, i.e. setting *prealloc to NULL) if it is empty. On success
 * the pool becomes the current pool: it is stored in stack_pools[], pools_num
 * is incremented and pool_offset is reset to 0.
 *
 * Must be called with pool_lock held.
 *
 * Returns true if a new pool was installed, false if the pool limit has been
 * reached or no memory is available for a new pool.
 */
static bool depot_init_pool(void **prealloc)
{
	lockdep_assert_held(&pool_lock);

	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
		/* Bail out if we reached the pool limit. */
		WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
		WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
		WARN_ONCE(1, "Stack depot reached limit capacity");
		return false;
	}

	if (!new_pool && *prealloc) {
		/* We have preallocated memory, use it. */
		WRITE_ONCE(new_pool, *prealloc);
		*prealloc = NULL;
	}

	if (!new_pool)
		return false; /* new_pool and *prealloc are NULL */

	/* Save reference to the pool to be used by depot_fetch_stack(). */
	stack_pools[pools_num] = new_pool;

	/*
	 * Stack depot tries to keep an extra pool allocated even before it runs
	 * out of space in the currently used pool.
	 *
	 * To indicate that a new preallocation is needed new_pool is reset to
	 * NULL; do not reset to NULL if we have reached the maximum number of
	 * pools.
	 *
	 * NOTE(review): pools_num has not been incremented yet and the check
	 * at the top of the function already returned for
	 * pools_num >= DEPOT_MAX_POOLS, so this condition looks always true
	 * and the STACK_DEPOT_POISON branch unreachable - confirm whether
	 * "pools_num + 1 < DEPOT_MAX_POOLS" was intended.
	 */
	if (pools_num < DEPOT_MAX_POOLS)
		WRITE_ONCE(new_pool, NULL);
	else
		WRITE_ONCE(new_pool, STACK_DEPOT_POISON);

	/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
	WRITE_ONCE(pools_num, pools_num + 1);
	ASSERT_EXCLUSIVE_WRITER(pools_num);

	/* The new pool is empty: start handing out records from its start. */
	pool_offset = 0;

	return true;
}
331 
332 /* Keeps the preallocated memory to be used for a new stack depot pool. */
333 static void depot_keep_new_pool(void **prealloc)
334 {
335 	lockdep_assert_held(&pool_lock);
336 
337 	/*
338 	 * If a new pool is already saved or the maximum number of
339 	 * pools is reached, do not use the preallocated memory.
340 	 */
341 	if (new_pool)
342 		return;
343 
344 	WRITE_ONCE(new_pool, *prealloc);
345 	*prealloc = NULL;
346 }
347 
348 /*
349  * Try to initialize a new stack record from the current pool, a cached pool, or
350  * the current pre-allocation.
351  */
352 static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
353 {
354 	struct stack_record *stack;
355 	void *current_pool;
356 	u32 pool_index;
357 
358 	lockdep_assert_held(&pool_lock);
359 
360 	if (pool_offset + size > DEPOT_POOL_SIZE) {
361 		if (!depot_init_pool(prealloc))
362 			return NULL;
363 	}
364 
365 	if (WARN_ON_ONCE(pools_num < 1))
366 		return NULL;
367 	pool_index = pools_num - 1;
368 	current_pool = stack_pools[pool_index];
369 	if (WARN_ON_ONCE(!current_pool))
370 		return NULL;
371 
372 	stack = current_pool + pool_offset;
373 
374 	/* Pre-initialize handle once. */
375 	stack->handle.pool_index = pool_index;
376 	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
377 	stack->handle.extra = 0;
378 	INIT_LIST_HEAD(&stack->hash_list);
379 
380 	pool_offset += size;
381 
382 	return stack;
383 }
384 
385 /* Try to find next free usable entry from the freelist. */
386 static struct stack_record *depot_pop_free(void)
387 {
388 	struct stack_record *stack;
389 
390 	lockdep_assert_held(&pool_lock);
391 
392 	if (list_empty(&free_stacks))
393 		return NULL;
394 
395 	/*
396 	 * We maintain the invariant that the elements in front are least
397 	 * recently used, and are therefore more likely to be associated with an
398 	 * RCU grace period in the past. Consequently it is sufficient to only
399 	 * check the first entry.
400 	 */
401 	stack = list_first_entry(&free_stacks, struct stack_record, free_list);
402 	if (!poll_state_synchronize_rcu(stack->rcu_state))
403 		return NULL;
404 
405 	list_del(&stack->free_list);
406 	counters[DEPOT_COUNTER_FREELIST_SIZE]--;
407 
408 	return stack;
409 }
410 
411 static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
412 {
413 	const size_t used = flex_array_size(s, entries, nr_entries);
414 	const size_t unused = sizeof(s->entries) - used;
415 
416 	WARN_ON_ONCE(sizeof(s->entries) < used);
417 
418 	return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
419 }
420 
421 /* Allocates a new stack in a stack depot pool. */
422 static struct stack_record *
423 depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
424 {
425 	struct stack_record *stack = NULL;
426 	size_t record_size;
427 
428 	lockdep_assert_held(&pool_lock);
429 
430 	/* This should already be checked by public API entry points. */
431 	if (WARN_ON_ONCE(!nr_entries))
432 		return NULL;
433 
434 	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
435 	if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
436 		nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;
437 
438 	if (flags & STACK_DEPOT_FLAG_GET) {
439 		/*
440 		 * Evictable entries have to allocate the max. size so they may
441 		 * safely be re-used by differently sized allocations.
442 		 */
443 		record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
444 		stack = depot_pop_free();
445 	} else {
446 		record_size = depot_stack_record_size(stack, nr_entries);
447 	}
448 
449 	if (!stack) {
450 		stack = depot_pop_free_pool(prealloc, record_size);
451 		if (!stack)
452 			return NULL;
453 	}
454 
455 	/* Save the stack trace. */
456 	stack->hash = hash;
457 	stack->size = nr_entries;
458 	/* stack->handle is already filled in by depot_pop_free_pool(). */
459 	memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));
460 
461 	if (flags & STACK_DEPOT_FLAG_GET) {
462 		refcount_set(&stack->count, 1);
463 		counters[DEPOT_COUNTER_REFD_ALLOCS]++;
464 		counters[DEPOT_COUNTER_REFD_INUSE]++;
465 	} else {
466 		/* Warn on attempts to switch to refcounting this entry. */
467 		refcount_set(&stack->count, REFCOUNT_SATURATED);
468 		counters[DEPOT_COUNTER_PERSIST_COUNT]++;
469 		counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
470 	}
471 
472 	/*
473 	 * Let KMSAN know the stored stack record is initialized. This shall
474 	 * prevent false positive reports if instrumented code accesses it.
475 	 */
476 	kmsan_unpoison_memory(stack, record_size);
477 
478 	return stack;
479 }
480 
481 static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
482 {
483 	const int pools_num_cached = READ_ONCE(pools_num);
484 	union handle_parts parts = { .handle = handle };
485 	void *pool;
486 	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
487 	struct stack_record *stack;
488 
489 	lockdep_assert_not_held(&pool_lock);
490 
491 	if (parts.pool_index > pools_num_cached) {
492 		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
493 		     parts.pool_index, pools_num_cached, handle);
494 		return NULL;
495 	}
496 
497 	pool = stack_pools[parts.pool_index];
498 	if (WARN_ON(!pool))
499 		return NULL;
500 
501 	stack = pool + offset;
502 	if (WARN_ON(!refcount_read(&stack->count)))
503 		return NULL;
504 
505 	return stack;
506 }
507 
/*
 * Links stack into the freelist.
 *
 * Called by stack_depot_put() once the record's refcount has dropped to zero.
 * The record is unlinked from its hash bucket, stamped with an RCU cookie and
 * appended to the freelist, from where depot_pop_free() may recycle it once
 * the cookie's grace period has elapsed.
 *
 * Takes pool_lock itself, so it must be called without the lock held.
 */
static void depot_free_stack(struct stack_record *stack)
{
	unsigned long flags;

	lockdep_assert_not_held(&pool_lock);

	raw_spin_lock_irqsave(&pool_lock, flags);
	printk_deferred_enter();

	/*
	 * Remove the entry from the hash list. Concurrent list traversal may
	 * still observe the entry, but since the refcount is zero, this entry
	 * will no longer be considered as valid.
	 */
	list_del_rcu(&stack->hash_list);

	/*
	 * Due to being used from constrained contexts such as the allocators,
	 * NMI, or even RCU itself, stack depot cannot rely on primitives that
	 * would sleep (such as synchronize_rcu()) or recursively call into
	 * stack depot again (such as call_rcu()).
	 *
	 * Instead, get an RCU cookie, so that we can ensure this entry isn't
	 * moved onto another list until the next grace period, and concurrent
	 * RCU list traversal remains safe.
	 */
	stack->rcu_state = get_state_synchronize_rcu();

	/*
	 * Add the entry to the freelist tail, so that older entries are
	 * considered first - their RCU cookie is more likely to no longer be
	 * associated with the current grace period.
	 */
	list_add_tail(&stack->free_list, &free_stacks);

	/* Statistics only; reported through debugfs by stats_show(). */
	counters[DEPOT_COUNTER_FREELIST_SIZE]++;
	counters[DEPOT_COUNTER_REFD_FREES]++;
	counters[DEPOT_COUNTER_REFD_INUSE]--;

	printk_deferred_exit();
	raw_spin_unlock_irqrestore(&pool_lock, flags);
}
551 
552 /* Calculates the hash for a stack. */
553 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
554 {
555 	return jhash2((u32 *)entries,
556 		      array_size(size,  sizeof(*entries)) / sizeof(u32),
557 		      STACK_HASH_SEED);
558 }
559 
/*
 * Non-instrumented version of memcmp().
 *
 * Compares the first @n words of @u1 and @u2 for equality only; no
 * lexicographical ordering is reported. Returns 0 if all words are equal
 * (including when @n is 0) and 1 on the first mismatch.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (u1[i] != u2[i])
			return 1;
	}

	return 0;
}
574 
/*
 * Finds a stack in a bucket of the hash table.
 *
 * Walks @bucket under an RCU-sched read-side section and looks for a record
 * whose hash, size and frames match @hash, @size and @entries. If @flags has
 * STACK_DEPOT_FLAG_GET set, a reference is taken on the matching record, and
 * records whose refcount already dropped to zero are skipped.
 *
 * Returns the matching record, or NULL if there is none.
 */
static inline struct stack_record *find_stack(struct list_head *bucket,
					      unsigned long *entries, int size,
					      u32 hash, depot_flags_t flags)
{
	struct stack_record *stack, *ret = NULL;

	/*
	 * Stack depot may be used from instrumentation that instruments RCU or
	 * tracing itself; use variant that does not call into RCU and cannot be
	 * traced.
	 *
	 * Note: Such use cases must take care when using refcounting to evict
	 * unused entries, because the stack record free-then-reuse code paths
	 * do call into RCU.
	 */
	rcu_read_lock_sched_notrace();

	list_for_each_entry_rcu(stack, bucket, hash_list) {
		/* Cheap checks first: hash and number of frames. */
		if (stack->hash != hash || stack->size != size)
			continue;

		/*
		 * This may race with depot_free_stack() accessing the freelist
		 * management state unioned with @entries. The refcount is zero
		 * in that case and the below refcount_inc_not_zero() will fail.
		 */
		if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
			continue;

		/*
		 * Try to increment refcount. If this succeeds, the stack record
		 * is valid and has not yet been freed.
		 *
		 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
		 * to then call stack_depot_put() later, and we can assume that
		 * a stack record is never placed back on the freelist.
		 */
		if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
			continue;

		ret = stack;
		break;
	}

	rcu_read_unlock_sched_notrace();

	return ret;
}
624 
625 depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
626 					    unsigned int nr_entries,
627 					    gfp_t alloc_flags,
628 					    depot_flags_t depot_flags)
629 {
630 	struct list_head *bucket;
631 	struct stack_record *found = NULL;
632 	depot_stack_handle_t handle = 0;
633 	struct page *page = NULL;
634 	void *prealloc = NULL;
635 	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
636 	unsigned long flags;
637 	u32 hash;
638 
639 	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
640 		return 0;
641 
642 	/*
643 	 * If this stack trace is from an interrupt, including anything before
644 	 * interrupt entry usually leads to unbounded stack depot growth.
645 	 *
646 	 * Since use of filter_irq_stacks() is a requirement to ensure stack
647 	 * depot can efficiently deduplicate interrupt stacks, always
648 	 * filter_irq_stacks() to simplify all callers' use of stack depot.
649 	 */
650 	nr_entries = filter_irq_stacks(entries, nr_entries);
651 
652 	if (unlikely(nr_entries == 0) || stack_depot_disabled)
653 		return 0;
654 
655 	hash = hash_stack(entries, nr_entries);
656 	bucket = &stack_table[hash & stack_hash_mask];
657 
658 	/* Fast path: look the stack trace up without locking. */
659 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
660 	if (found)
661 		goto exit;
662 
663 	/*
664 	 * Allocate memory for a new pool if required now:
665 	 * we won't be able to do that under the lock.
666 	 */
667 	if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
668 		/*
669 		 * Zero out zone modifiers, as we don't have specific zone
670 		 * requirements. Keep the flags related to allocation in atomic
671 		 * contexts and I/O.
672 		 */
673 		alloc_flags &= ~GFP_ZONEMASK;
674 		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
675 		alloc_flags |= __GFP_NOWARN;
676 		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
677 		if (page)
678 			prealloc = page_address(page);
679 	}
680 
681 	raw_spin_lock_irqsave(&pool_lock, flags);
682 	printk_deferred_enter();
683 
684 	/* Try to find again, to avoid concurrently inserting duplicates. */
685 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
686 	if (!found) {
687 		struct stack_record *new =
688 			depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);
689 
690 		if (new) {
691 			/*
692 			 * This releases the stack record into the bucket and
693 			 * makes it visible to readers in find_stack().
694 			 */
695 			list_add_rcu(&new->hash_list, bucket);
696 			found = new;
697 		}
698 	}
699 
700 	if (prealloc) {
701 		/*
702 		 * Either stack depot already contains this stack trace, or
703 		 * depot_alloc_stack() did not consume the preallocated memory.
704 		 * Try to keep the preallocated memory for future.
705 		 */
706 		depot_keep_new_pool(&prealloc);
707 	}
708 
709 	printk_deferred_exit();
710 	raw_spin_unlock_irqrestore(&pool_lock, flags);
711 exit:
712 	if (prealloc) {
713 		/* Stack depot didn't use this memory, free it. */
714 		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
715 	}
716 	if (found)
717 		handle = found->handle.handle;
718 	return handle;
719 }
720 EXPORT_SYMBOL_GPL(stack_depot_save_flags);
721 
722 depot_stack_handle_t stack_depot_save(unsigned long *entries,
723 				      unsigned int nr_entries,
724 				      gfp_t alloc_flags)
725 {
726 	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
727 				      STACK_DEPOT_FLAG_CAN_ALLOC);
728 }
729 EXPORT_SYMBOL_GPL(stack_depot_save);
730 
731 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
732 			       unsigned long **entries)
733 {
734 	struct stack_record *stack;
735 
736 	*entries = NULL;
737 	/*
738 	 * Let KMSAN know *entries is initialized. This shall prevent false
739 	 * positive reports if instrumented code accesses it.
740 	 */
741 	kmsan_unpoison_memory(entries, sizeof(*entries));
742 
743 	if (!handle || stack_depot_disabled)
744 		return 0;
745 
746 	stack = depot_fetch_stack(handle);
747 	/*
748 	 * Should never be NULL, otherwise this is a use-after-put (or just a
749 	 * corrupt handle).
750 	 */
751 	if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
752 		return 0;
753 
754 	*entries = stack->entries;
755 	return stack->size;
756 }
757 EXPORT_SYMBOL_GPL(stack_depot_fetch);
758 
759 void stack_depot_put(depot_stack_handle_t handle)
760 {
761 	struct stack_record *stack;
762 
763 	if (!handle || stack_depot_disabled)
764 		return;
765 
766 	stack = depot_fetch_stack(handle);
767 	/*
768 	 * Should always be able to find the stack record, otherwise this is an
769 	 * unbalanced put attempt (or corrupt handle).
770 	 */
771 	if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
772 		return;
773 
774 	if (refcount_dec_and_test(&stack->count))
775 		depot_free_stack(stack);
776 }
777 EXPORT_SYMBOL_GPL(stack_depot_put);
778 
779 void stack_depot_print(depot_stack_handle_t stack)
780 {
781 	unsigned long *entries;
782 	unsigned int nr_entries;
783 
784 	nr_entries = stack_depot_fetch(stack, &entries);
785 	if (nr_entries > 0)
786 		stack_trace_print(entries, nr_entries, 0);
787 }
788 EXPORT_SYMBOL_GPL(stack_depot_print);
789 
790 int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
791 		       int spaces)
792 {
793 	unsigned long *entries;
794 	unsigned int nr_entries;
795 
796 	nr_entries = stack_depot_fetch(handle, &entries);
797 	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
798 						spaces) : 0;
799 }
800 EXPORT_SYMBOL_GPL(stack_depot_snprint);
801 
802 depot_stack_handle_t __must_check stack_depot_set_extra_bits(
803 			depot_stack_handle_t handle, unsigned int extra_bits)
804 {
805 	union handle_parts parts = { .handle = handle };
806 
807 	/* Don't set extra bits on empty handles. */
808 	if (!handle)
809 		return 0;
810 
811 	parts.extra = extra_bits;
812 	return parts.handle;
813 }
814 EXPORT_SYMBOL(stack_depot_set_extra_bits);
815 
816 unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
817 {
818 	union handle_parts parts = { .handle = handle };
819 
820 	return parts.extra;
821 }
822 EXPORT_SYMBOL(stack_depot_get_extra_bits);
823 
824 static int stats_show(struct seq_file *seq, void *v)
825 {
826 	/*
827 	 * data race ok: These are just statistics counters, and approximate
828 	 * statistics are ok for debugging.
829 	 */
830 	seq_printf(seq, "pools: %d\n", data_race(pools_num));
831 	for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
832 		seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
833 
834 	return 0;
835 }
836 DEFINE_SHOW_ATTRIBUTE(stats);
837 
838 static int depot_debugfs_init(void)
839 {
840 	struct dentry *dir;
841 
842 	if (stack_depot_disabled)
843 		return 0;
844 
845 	dir = debugfs_create_dir("stackdepot", NULL);
846 	debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
847 	return 0;
848 }
849 late_initcall(depot_debugfs_init);
850