xref: /linux/kernel/bpf/stackmap.c (revision 157317ba662a7c476320fdb334216154eaa8b856)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2016 Facebook
3  */
4 #include <linux/bpf.h>
5 #include <linux/jhash.h>
6 #include <linux/filter.h>
7 #include <linux/kernel.h>
8 #include <linux/stacktrace.h>
9 #include <linux/perf_event.h>
10 #include <linux/btf_ids.h>
11 #include <linux/buildid.h>
12 #include <linux/mmap_lock.h>
13 #include "percpu_freelist.h"
14 #include "mmap_unlock_work.h"
15 
/* Map-creation flags accepted for stack maps. stack_map_alloc() returns
 * -EINVAL when attr->map_flags contains any bit outside this mask.
 */
#define STACK_CREATE_FLAG_MASK					\
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
	 BPF_F_STACK_BUILD_ID)
19 
/* One stored stack trace. Buckets are handed out from, and returned to,
 * the owning map's per-CPU freelist through @fnode.
 */
struct stack_map_bucket {
	struct pcpu_freelist_node fnode;	/* freelist linkage */
	u32 hash;	/* jhash2() over the raw IPs, set in __bpf_get_stackid() */
	u32 nr;		/* number of valid entries in data[] */
	u64 data[];	/* raw u64 IPs, or struct bpf_stack_build_id records */
};
26 
/* Stack-trace map: a table of bucket pointers indexed by
 * (hash & (n_buckets - 1)), plus the preallocated bucket storage.
 */
struct bpf_stack_map {
	struct bpf_map map;	/* generic map header, container_of() anchor */
	void *elems;		/* backing area for all preallocated buckets */
	struct pcpu_freelist freelist;	/* buckets not currently installed */
	u32 n_buckets;		/* roundup_pow_of_two(max_entries) */
	struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};
34 
35 static inline bool stack_map_use_build_id(struct bpf_map *map)
36 {
37 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
38 }
39 
40 static inline int stack_map_data_size(struct bpf_map *map)
41 {
42 	return stack_map_use_build_id(map) ?
43 		sizeof(struct bpf_stack_build_id) : sizeof(u64);
44 }
45 
46 /**
47  * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth
48  * @size:  Size of the buffer/map value in bytes
49  * @elem_size:  Size of each stack trace element
50  * @flags:  BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...)
51  *
52  * Return: Maximum number of stack trace entries that can be safely stored
53  */
54 static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
55 {
56 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
57 	u32 max_depth;
58 	u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack);
59 
60 	max_depth = size / elem_size;
61 	max_depth += skip;
62 	if (max_depth > curr_sysctl_max_stack)
63 		return curr_sysctl_max_stack;
64 
65 	return max_depth;
66 }
67 
68 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
69 {
70 	u64 elem_size = sizeof(struct stack_map_bucket) +
71 			(u64)smap->map.value_size;
72 	int err;
73 
74 	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
75 					 smap->map.numa_node);
76 	if (!smap->elems)
77 		return -ENOMEM;
78 
79 	err = pcpu_freelist_init(&smap->freelist);
80 	if (err)
81 		goto free_elems;
82 
83 	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
84 			       smap->map.max_entries);
85 	return 0;
86 
87 free_elems:
88 	bpf_map_area_free(smap->elems);
89 	return err;
90 }
91 
92 /* Called from syscall */
93 static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
94 {
95 	u32 value_size = attr->value_size;
96 	struct bpf_stack_map *smap;
97 	u64 cost, n_buckets;
98 	int err;
99 
100 	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
101 		return ERR_PTR(-EINVAL);
102 
103 	/* check sanity of attributes */
104 	if (attr->max_entries == 0 || attr->key_size != 4 ||
105 	    value_size < 8 || value_size % 8)
106 		return ERR_PTR(-EINVAL);
107 
108 	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
109 	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
110 		if (value_size % sizeof(struct bpf_stack_build_id) ||
111 		    value_size / sizeof(struct bpf_stack_build_id)
112 		    > sysctl_perf_event_max_stack)
113 			return ERR_PTR(-EINVAL);
114 	} else if (value_size / 8 > sysctl_perf_event_max_stack)
115 		return ERR_PTR(-EINVAL);
116 
117 	/* hash table size must be power of 2; roundup_pow_of_two() can overflow
118 	 * into UB on 32-bit arches, so check that first
119 	 */
120 	if (attr->max_entries > 1UL << 31)
121 		return ERR_PTR(-E2BIG);
122 
123 	n_buckets = roundup_pow_of_two(attr->max_entries);
124 
125 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
126 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
127 	if (!smap)
128 		return ERR_PTR(-ENOMEM);
129 
130 	bpf_map_init_from_attr(&smap->map, attr);
131 	smap->n_buckets = n_buckets;
132 
133 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
134 	if (err)
135 		goto free_smap;
136 
137 	err = prealloc_elems_and_freelist(smap);
138 	if (err)
139 		goto put_buffers;
140 
141 	return &smap->map;
142 
143 put_buffers:
144 	put_callchain_buffers();
145 free_smap:
146 	bpf_map_area_free(smap);
147 	return ERR_PTR(err);
148 }
149 
150 static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
151 {
152 	return may_fault ? build_id_parse(vma, build_id, NULL)
153 			 : build_id_parse_nofault(vma, build_id, NULL);
154 }
155 
156 static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
157 {
158 	id->status = BPF_STACK_BUILD_ID_IP;
159 	memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
160 }
161 
162 static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff,
163 					    unsigned long vm_start, u64 ip)
164 {
165 	return (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
166 }
167 
168 static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id,
169 						u64 offset,
170 						const unsigned char *build_id)
171 {
172 	id->status = BPF_STACK_BUILD_ID_VALID;
173 	id->offset = offset;
174 	if (id->build_id != build_id)
175 		memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX);
176 }
177 
/* State shared between stack_map_lock_vma() and stack_map_unlock_vma(). */
struct stack_map_vma_lock {
	struct vm_area_struct *vma;	/* VMA currently locked, NULL after unlock */
	struct mm_struct *mm;		/* address space to look VMAs up in */
};
182 
/*
 * Acquire a stable read-side reference on the VMA covering @ip.
 *
 * @lock: carries the mm to search (lock->mm); on success lock->vma is set
 *        to the returned VMA for the matching stack_map_unlock_vma() call.
 * @ip:   address to look up.
 *
 * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read
 * lock held and mmap_lock dropped, so the caller may sleep.
 *
 * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still
 * held; the caller must snapshot any fields it needs and pin vm_file
 * with get_file() before stack_map_unlock_vma() drops mmap_lock, as
 * the VMA may be split, merged, or freed after that.
 *
 * Returns NULL on failure, in which case no lock is held.
 */
static struct vm_area_struct *
stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip)
{
	struct mm_struct *mm = lock->mm;
	struct vm_area_struct *vma;

	/* noop under !CONFIG_PER_VMA_LOCK */
	vma = lock_vma_under_rcu(mm, ip);
	if (vma) {
		lock->vma = vma;
		return vma;
	}

	/*
	 * Taking mmap_read_lock() is unsafe here, because the caller BPF
	 * program might already hold it, causing a deadlock.
	 */
	if (!mmap_read_trylock(mm))
		return NULL;

	/* exact lookup: NULL unless a VMA actually contains ip */
	vma = vma_lookup(mm, ip);
	if (!vma) {
		mmap_read_unlock(mm);
		return NULL;
	}

#ifdef CONFIG_PER_VMA_LOCK
	/* trade mmap_lock for the per-VMA read lock before returning */
	if (!vma_start_read_locked(vma)) {
		mmap_read_unlock(mm);
		return NULL;
	}
	mmap_read_unlock(mm);
#endif

	lock->vma = vma;
	return vma;
}
233 
/*
 * Release whatever stack_map_lock_vma() left held: the per-VMA read lock
 * with CONFIG_PER_VMA_LOCK, mmap_lock otherwise. Clears lock->vma.
 */
static void stack_map_unlock_vma(struct stack_map_vma_lock *lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	vma_end_read(lock->vma);
#else
	mmap_read_unlock(lock->mm);
#endif
	lock->vma = NULL;
}
243 
/*
 * Build ID resolution for current->mm on the path that may block.
 *
 * For each of the @trace_nr entries of @id_offs the IP is read, the
 * covering VMA is locked via stack_map_lock_vma(), and the entry is either
 * turned into build ID + file offset (BPF_STACK_BUILD_ID_VALID) or left as
 * a raw IP (BPF_STACK_BUILD_ID_IP) when there is no VMA, the VMA is
 * anonymous or has no file, or parsing fails.
 *
 * The file is pinned with get_file() and the VMA lock is dropped before
 * build_id_parse_file() runs. A one-entry cache keeps a reference on the
 * last successfully parsed file, a pointer to its build ID inside @id_offs,
 * and the bounds/pgoff of the last VMA seen for that file; the reference is
 * dropped when the cache entry is replaced and at function exit.
 */
static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs,
						    u32 trace_nr)
{
	struct mm_struct *mm = current->mm;
	struct stack_map_vma_lock lock = { .mm = mm };
	struct {
		struct file *file;
		const unsigned char *build_id;
		unsigned long vm_start;
		unsigned long vm_end;
		unsigned long vm_pgoff;
	} cache = {};
	unsigned long vm_pgoff, vm_start, vm_end;
	struct vm_area_struct *vma;
	struct file *file;
	u64 offset;
	u64 ip;

	for (u32 i = 0; i < trace_nr; i++) {
		ip = READ_ONCE(id_offs[i].ip);

		/*
		 * Range cache fast path: if ip falls within the previously
		 * resolved VMA range, reuse the cache build_id without
		 * re-acquiring the VMA lock.
		 */
		if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) {
			offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip);
			stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id);
			continue;
		}

		vma = stack_map_lock_vma(&lock, ip);
		if (!vma) {
			stack_map_build_id_set_ip(&id_offs[i]);
			continue;
		}
		if (vma_is_anonymous(vma) || !vma->vm_file) {
			stack_map_build_id_set_ip(&id_offs[i]);
			stack_map_unlock_vma(&lock);
			continue;
		}

		/* snapshot everything needed from the VMA while it is locked */
		file = vma->vm_file;
		vm_pgoff = vma->vm_pgoff;
		vm_start = vma->vm_start;
		vm_end = vma->vm_end;
		offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip);

		/*
		 * Same backing file as previous (e.g. different VMAs
		 * of the same ELF binary). Reuse the cache build_id.
		 */
		if (file == cache.file) {
			stack_map_unlock_vma(&lock);
			stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id);
			cache.vm_start = vm_start;
			cache.vm_end = vm_end;
			cache.vm_pgoff = vm_pgoff;
			continue;
		}

		/* pin the file so it stays valid once the VMA lock is gone */
		file = get_file(file);
		stack_map_unlock_vma(&lock);

		/* build_id_parse_file() may block on filesystem reads */
		if (build_id_parse_file(file, id_offs[i].build_id, NULL)) {
			stack_map_build_id_set_ip(&id_offs[i]);
			fput(file);
			continue;
		}

		stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id);
		/* the new file replaces the cached one; drop the old reference */
		if (cache.file)
			fput(cache.file);
		cache.file = file;
		cache.build_id = id_offs[i].build_id;
		cache.vm_start = vm_start;
		cache.vm_end = vm_end;
		cache.vm_pgoff = vm_pgoff;
	}

	if (cache.file)
		fput(cache.file);
}
329 
330 /*
331  * Expects all id_offs[i].ip values to be set to correct initial IPs.
332  * They will be subsequently:
333  *   - either adjusted in place to a file offset, if build ID fetching
334  *     succeeds; in this case id_offs[i].build_id is set to correct build ID,
335  *     and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
336  *   - or IP will be kept intact, if build ID fetching failed; in this case
337  *     id_offs[i].build_id is zeroed out and id_offs[i].status is set to
338  *     BPF_STACK_BUILD_ID_IP.
339  */
340 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
341 					  u32 trace_nr, bool user, bool may_fault)
342 {
343 	struct mmap_unlock_irq_work *work = NULL;
344 	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
345 	bool has_user_ctx = user && current && current->mm;
346 	struct vm_area_struct *vma, *prev_vma = NULL;
347 	const unsigned char *prev_build_id = NULL;
348 	int i;
349 
350 	if (may_fault && has_user_ctx) {
351 		stack_map_get_build_id_offset_sleepable(id_offs, trace_nr);
352 		return;
353 	}
354 
355 	/* If the irq_work is in use, fall back to report ips. Same
356 	 * fallback is used for kernel stack (!user) on a stackmap with
357 	 * build_id.
358 	 */
359 	if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) {
360 		/* cannot access current->mm, fall back to ips */
361 		for (i = 0; i < trace_nr; i++)
362 			stack_map_build_id_set_ip(&id_offs[i]);
363 		return;
364 	}
365 
366 	for (i = 0; i < trace_nr; i++) {
367 		u64 ip = READ_ONCE(id_offs[i].ip);
368 		u64 offset;
369 
370 		if (prev_build_id && range_in_vma(prev_vma, ip, ip)) {
371 			vma = prev_vma;
372 			offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip);
373 			stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id);
374 			continue;
375 		}
376 		vma = find_vma(current->mm, ip);
377 		if (!vma || vma_is_anonymous(vma) ||
378 		    fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
379 			/* per entry fall back to ips */
380 			stack_map_build_id_set_ip(&id_offs[i]);
381 			prev_vma = vma;
382 			prev_build_id = NULL;
383 			continue;
384 		}
385 		offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip);
386 		stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id);
387 		prev_vma = vma;
388 		prev_build_id = id_offs[i].build_id;
389 	}
390 	bpf_mmap_unlock_mm(work, current->mm);
391 }
392 
/*
 * Capture up to @max_depth stack frames of @task into a perf callchain
 * entry obtained from get_callchain_entry().
 *
 * Returns the entry, or NULL when no entry is available or the kernel is
 * built without CONFIG_STACKTRACE.
 */
static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
	struct perf_callchain_entry *entry;
	int rctx;

	entry = get_callchain_entry(&rctx);

	if (!entry)
		return NULL;

	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
					 max_depth, 0);

	/* stack_trace_save_tsk() works on unsigned long array, while
	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
	 * necessary to fix this mismatch.
	 */
	if (__BITS_PER_LONG != 64) {
		/* both pointers alias the same buffer, viewed at two widths */
		unsigned long *from = (unsigned long *) entry->ip;
		u64 *to = entry->ip;
		int i;

		/* copy data from the end to avoid using extra buffer */
		for (i = entry->nr - 1; i >= 0; i--)
			to[i] = (u64)(from[i]);
	}

	put_callchain_entry(rctx);

	return entry;
#else /* CONFIG_STACKTRACE */
	return NULL;
#endif
}
429 
/*
 * Store @trace in stack map @map and return the id of its bucket.
 *
 * The first (flags & BPF_F_SKIP_FIELD_MASK) frames are skipped; the rest,
 * limited to what fits in the map value, are hashed with jhash2() and the
 * bucket index is hash & (n_buckets - 1).
 *
 * Return: the bucket id on success, or
 *   -EFAULT  if the skip count is >= trace->nr,
 *   -ENOMEM  if no free bucket is available,
 *   -EEXIST  if the bucket holds a different trace and
 *            BPF_F_REUSE_STACKID is not set.
 */
static long __bpf_get_stackid(struct bpf_map *map,
			      struct perf_callchain_entry *trace, u64 flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 hash, id, trace_nr, trace_len, i, max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	u64 *ips;
	bool hash_matches;

	if (trace->nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags);
	trace_nr = min_t(u32, trace->nr - skip, max_depth - skip);
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip;
	/* the hash always covers the raw IPs, also for build-ID maps */
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	hash_matches = bucket && bucket->hash == hash;
	/* fast cmp */
	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
		return id;

	if (stack_map_use_build_id(map)) {
		struct bpf_stack_build_id *id_offs;

		/* for build_id+offset, pop a bucket before slow cmp */
		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		new_bucket->nr = trace_nr;
		id_offs = (struct bpf_stack_build_id *)new_bucket->data;
		for (i = 0; i < trace_nr; i++)
			id_offs[i].ip = ips[i];
		stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
			/* identical trace already stored; give the bucket back */
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return id;
		}
		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return -EEXIST;
		}
	} else {
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, ips, trace_len) == 0)
			return id;
		if (bucket && !(flags & BPF_F_REUSE_STACKID))
			return -EEXIST;

		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		memcpy(new_bucket->data, ips, trace_len);
	}

	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	/* install the new bucket; recycle whatever was there before */
	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}
503 
504 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
505 	   u64, flags)
506 {
507 	u32 elem_size = stack_map_data_size(map);
508 	bool user = flags & BPF_F_USER_STACK;
509 	struct perf_callchain_entry *trace;
510 	bool kernel = !user;
511 	u32 max_depth;
512 
513 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
514 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
515 		return -EINVAL;
516 
517 	max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags);
518 	trace = get_perf_callchain(regs, kernel, user, max_depth,
519 				   false, false, 0);
520 
521 	if (unlikely(!trace))
522 		/* couldn't fetch the stack trace */
523 		return -EFAULT;
524 
525 	return __bpf_get_stackid(map, trace, flags);
526 }
527 
/* Helper prototype for bpf_get_stackid(): (ctx, const map pointer, flags)
 * returning an integer; GPL-only.
 */
const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
536 
537 static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
538 {
539 	__u64 nr_kernel = 0;
540 
541 	while (nr_kernel < trace->nr) {
542 		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
543 			break;
544 		nr_kernel++;
545 	}
546 	return nr_kernel;
547 }
548 
549 BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
550 	   struct bpf_map *, map, u64, flags)
551 {
552 	struct perf_event *event = ctx->event;
553 	struct perf_callchain_entry *trace;
554 	bool kernel, user;
555 	__u64 nr_kernel;
556 	int ret;
557 
558 	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
559 	if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
560 		return bpf_get_stackid((unsigned long)(ctx->regs),
561 				       (unsigned long) map, flags, 0, 0);
562 
563 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
564 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
565 		return -EINVAL;
566 
567 	user = flags & BPF_F_USER_STACK;
568 	kernel = !user;
569 
570 	trace = ctx->data->callchain;
571 	if (unlikely(!trace))
572 		return -EFAULT;
573 
574 	nr_kernel = count_kernel_ip(trace);
575 	__u64 nr = trace->nr; /* save original */
576 
577 	if (kernel) {
578 		trace->nr = nr_kernel;
579 		ret = __bpf_get_stackid(map, trace, flags);
580 	} else { /* user */
581 		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
582 
583 		skip += nr_kernel;
584 		if (skip > BPF_F_SKIP_FIELD_MASK)
585 			return -EFAULT;
586 
587 		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
588 		ret = __bpf_get_stackid(map, trace, flags);
589 	}
590 
591 	/* restore nr */
592 	trace->nr = nr;
593 
594 	return ret;
595 }
596 
/* Helper prototype for bpf_get_stackid_pe(): (ctx, const map pointer,
 * flags) returning an integer; not GPL-only.
 */
const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
605 
/*
 * Common implementation of the bpf_get_stack() family: copy a stack trace
 * into @buf.
 *
 * @regs:      registers to unwind from
 * @task:      task to unwind, or NULL for the current context
 * @trace_in:  pre-captured callchain to use instead of unwinding, or NULL;
 *             note that its nr field is clamped in place to the max depth
 * @buf:       destination buffer
 * @size:      size of @buf in bytes; must be a multiple of the element size
 * @flags:     skip count plus BPF_F_USER_STACK / BPF_F_USER_BUILD_ID
 * @may_fault: true when called from a sleepable context
 *
 * Return: number of bytes written on success (the rest of @buf is zeroed);
 * on failure @buf is zeroed entirely and -EINVAL, -EFAULT or -EOPNOTSUPP is
 * returned.
 */
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
			    struct perf_callchain_entry *trace_in,
			    void *buf, u32 size, u64 flags, bool may_fault)
{
	u32 trace_nr, copy_len, elem_size, max_depth;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	bool crosstask = task && task != current;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	/* build IDs are only produced for user stacks */
	if (kernel && user_build_id)
		goto clear;

	elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	/* cannot get valid user stack for task without user_mode regs */
	if (task && user && !user_mode(regs))
		goto err_fault;

	/* get_perf_callchain does not support crosstask user stack walking
	 * but returns an empty stack instead of NULL.
	 */
	if (crosstask && user) {
		err = -EOPNOTSUPP;
		goto clear;
	}

	max_depth = stack_map_calculate_max_depth(size, elem_size, flags);

	if (may_fault)
		rcu_read_lock(); /* need RCU for perf's callchain below */

	if (trace_in) {
		trace = trace_in;
		trace->nr = min_t(u32, trace->nr, max_depth);
	} else if (kernel && task) {
		trace = get_callchain_entry_for_task(task, max_depth);
	} else {
		trace = get_perf_callchain(regs, kernel, user, max_depth,
					   crosstask, false, 0);
	}

	if (unlikely(!trace) || trace->nr < skip) {
		if (may_fault)
			rcu_read_unlock();
		goto err_fault;
	}

	trace_nr = trace->nr - skip;
	copy_len = trace_nr * elem_size;

	ips = trace->ip + skip;
	if (user_build_id) {
		struct bpf_stack_build_id *id_offs = buf;
		u32 i;

		/* seed each record with its IP; resolved to build IDs below */
		for (i = 0; i < trace_nr; i++)
			id_offs[i].ip = ips[i];
	} else {
		memcpy(buf, ips, copy_len);
	}

	/* trace/ips should not be dereferenced after this point */
	if (may_fault)
		rcu_read_unlock();

	if (user_build_id)
		stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);

	/* zero the unused tail of the buffer */
	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);
	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}
694 
695 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
696 	   u64, flags)
697 {
698 	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
699 }
700 
/* Helper prototype for bpf_get_stack(): (ctx, uninitialised buffer, buffer
 * size which may be zero, flags) returning an integer; GPL-only.
 */
const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
710 
711 BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
712 	   u64, flags)
713 {
714 	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
715 }
716 
/* Helper prototype for bpf_get_stack_sleepable(); same argument layout as
 * bpf_get_stack_proto, GPL-only.
 */
const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
	.func		= bpf_get_stack_sleepable,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
726 
727 static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
728 				 u64 flags, bool may_fault)
729 {
730 	struct pt_regs *regs;
731 	long res = -EINVAL;
732 
733 	if (!try_get_task_stack(task))
734 		return -EFAULT;
735 
736 	regs = task_pt_regs(task);
737 	if (regs)
738 		res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
739 	put_task_stack(task);
740 
741 	return res;
742 }
743 
744 BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
745 	   u32, size, u64, flags)
746 {
747 	return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
748 }
749 
/* Helper prototype for bpf_get_task_stack(): (BTF task pointer,
 * uninitialised buffer, buffer size which may be zero, flags) returning an
 * integer; not GPL-only.
 */
const struct bpf_func_proto bpf_get_task_stack_proto = {
	.func		= bpf_get_task_stack,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
760 
/* bpf_get_task_stack() helper, sleepable flavour: passes may_fault = true
 * down to __bpf_get_task_stack().
 */
BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	return __bpf_get_task_stack(task, buf, size, flags, true /* may_fault */);
}
766 
/* Helper prototype for bpf_get_task_stack_sleepable(); same argument layout
 * as bpf_get_task_stack_proto, not GPL-only.
 */
const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
	.func		= bpf_get_task_stack_sleepable,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
777 
778 BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
779 	   void *, buf, u32, size, u64, flags)
780 {
781 	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
782 	struct perf_event *event = ctx->event;
783 	struct perf_callchain_entry *trace;
784 	bool kernel, user;
785 	int err = -EINVAL;
786 	__u64 nr_kernel;
787 
788 	if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
789 		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
790 
791 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
792 			       BPF_F_USER_BUILD_ID)))
793 		goto clear;
794 
795 	user = flags & BPF_F_USER_STACK;
796 	kernel = !user;
797 
798 	err = -EFAULT;
799 	trace = ctx->data->callchain;
800 	if (unlikely(!trace))
801 		goto clear;
802 
803 	nr_kernel = count_kernel_ip(trace);
804 
805 	if (kernel) {
806 		__u64 nr = trace->nr;
807 
808 		trace->nr = nr_kernel;
809 		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
810 
811 		/* restore nr */
812 		trace->nr = nr;
813 	} else { /* user */
814 		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
815 
816 		skip += nr_kernel;
817 		if (skip > BPF_F_SKIP_FIELD_MASK)
818 			goto clear;
819 
820 		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
821 		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
822 	}
823 	return err;
824 
825 clear:
826 	memset(buf, 0, size);
827 	return err;
828 
829 }
830 
/* Helper prototype for bpf_get_stack_pe(): (ctx, uninitialised buffer,
 * buffer size which may be zero, flags) returning an integer; GPL-only.
 */
const struct bpf_func_proto bpf_get_stack_proto_pe = {
	.func		= bpf_get_stack_pe,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
840 
/* Called from eBPF program.
 *
 * Direct element lookup is not provided for stack maps; always returns
 * ERR_PTR(-EOPNOTSUPP).
 */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}
846 
/* Called from syscall.
 *
 * Copy the trace stored under @key into @value and remove it from the map,
 * by calling bpf_stackmap_extract() with delete = true. @flags is unused.
 */
static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					    void *value, u64 flags)
{
	return bpf_stackmap_extract(map, key, value, true);
}
853 
854 /* Called from syscall */
855 int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
856 			 bool delete)
857 {
858 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
859 	struct stack_map_bucket *bucket, *old_bucket;
860 	u32 id = *(u32 *)key, trace_len;
861 
862 	if (unlikely(id >= smap->n_buckets))
863 		return -ENOENT;
864 
865 	bucket = xchg(&smap->buckets[id], NULL);
866 	if (!bucket)
867 		return -ENOENT;
868 
869 	trace_len = bucket->nr * stack_map_data_size(map);
870 	memcpy(value, bucket->data, trace_len);
871 	memset(value + trace_len, 0, map->value_size - trace_len);
872 
873 	if (delete)
874 		old_bucket = bucket;
875 	else
876 		old_bucket = xchg(&smap->buckets[id], bucket);
877 	if (old_bucket)
878 		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
879 	return 0;
880 }
881 
882 static int stack_map_get_next_key(struct bpf_map *map, void *key,
883 				  void *next_key)
884 {
885 	struct bpf_stack_map *smap = container_of(map,
886 						  struct bpf_stack_map, map);
887 	u32 id;
888 
889 	WARN_ON_ONCE(!rcu_read_lock_held());
890 
891 	if (!key) {
892 		id = 0;
893 	} else {
894 		id = *(u32 *)key;
895 		if (id >= smap->n_buckets || !smap->buckets[id])
896 			id = 0;
897 		else
898 			id++;
899 	}
900 
901 	while (id < smap->n_buckets && !smap->buckets[id])
902 		id++;
903 
904 	if (id >= smap->n_buckets)
905 		return -ENOENT;
906 
907 	*(u32 *)next_key = id;
908 	return 0;
909 }
910 
/* Element update is not provided for stack maps; always returns -EINVAL. */
static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				  u64 map_flags)
{
	return -EINVAL;
}
916 
917 /* Called from syscall or from eBPF program */
918 static long stack_map_delete_elem(struct bpf_map *map, void *key)
919 {
920 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
921 	struct stack_map_bucket *old_bucket;
922 	u32 id = *(u32 *)key;
923 
924 	if (unlikely(id >= smap->n_buckets))
925 		return -E2BIG;
926 
927 	old_bucket = xchg(&smap->buckets[id], NULL);
928 	if (old_bucket) {
929 		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
930 		return 0;
931 	} else {
932 		return -ENOENT;
933 	}
934 }
935 
/* Called when map->refcnt goes to zero, either from workqueue or from syscall.
 *
 * Releases everything stack_map_alloc() set up: the preallocated bucket
 * area, the per-CPU freelist, the map itself, and the reference on the
 * perf callchain buffers.
 */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}
946 
947 static u64 stack_map_mem_usage(const struct bpf_map *map)
948 {
949 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
950 	u64 value_size = map->value_size;
951 	u64 n_buckets = smap->n_buckets;
952 	u64 enties = map->max_entries;
953 	u64 usage = sizeof(*smap);
954 
955 	usage += n_buckets * sizeof(struct stack_map_bucket *);
956 	usage += enties * (sizeof(struct stack_map_bucket) + value_size);
957 	return usage;
958 }
959 
/* BTF id of struct bpf_stack_map, referenced by .map_btf_id below. */
BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
/* Map operations table wiring the stack-map callbacks defined in this file. */
const struct bpf_map_ops stack_trace_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = stack_map_mem_usage,
	.map_btf_id = &stack_trace_map_btf_ids[0],
};
974