1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2016 Facebook 3 */ 4 #include <linux/bpf.h> 5 #include <linux/jhash.h> 6 #include <linux/filter.h> 7 #include <linux/kernel.h> 8 #include <linux/stacktrace.h> 9 #include <linux/perf_event.h> 10 #include <linux/btf_ids.h> 11 #include <linux/buildid.h> 12 #include <linux/mmap_lock.h> 13 #include "percpu_freelist.h" 14 #include "mmap_unlock_work.h" 15 16 #define STACK_CREATE_FLAG_MASK \ 17 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ 18 BPF_F_STACK_BUILD_ID) 19 20 struct stack_map_bucket { 21 struct pcpu_freelist_node fnode; 22 u32 hash; 23 u32 nr; 24 u64 data[]; 25 }; 26 27 struct bpf_stack_map { 28 struct bpf_map map; 29 void *elems; 30 struct pcpu_freelist freelist; 31 u32 n_buckets; 32 struct stack_map_bucket *buckets[] __counted_by(n_buckets); 33 }; 34 35 static inline bool stack_map_use_build_id(struct bpf_map *map) 36 { 37 return (map->map_flags & BPF_F_STACK_BUILD_ID); 38 } 39 40 static inline int stack_map_data_size(struct bpf_map *map) 41 { 42 return stack_map_use_build_id(map) ? 43 sizeof(struct bpf_stack_build_id) : sizeof(u64); 44 } 45 46 /** 47 * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth 48 * @size: Size of the buffer/map value in bytes 49 * @elem_size: Size of each stack trace element 50 * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...) 51 * 52 * Return: Maximum number of stack trace entries that can be safely stored 53 */ 54 static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags) 55 { 56 u32 skip = flags & BPF_F_SKIP_FIELD_MASK; 57 u32 max_depth; 58 u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack); 59 60 max_depth = size / elem_size; 61 max_depth += skip; 62 if (max_depth > curr_sysctl_max_stack) 63 return curr_sysctl_max_stack; 64 65 return max_depth; 66 } 67 68 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) 69 { 70 u64 elem_size = sizeof(struct stack_map_bucket) + 71 (u64)smap->map.value_size; 72 int err; 73 74 smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, 75 smap->map.numa_node); 76 if (!smap->elems) 77 return -ENOMEM; 78 79 err = pcpu_freelist_init(&smap->freelist); 80 if (err) 81 goto free_elems; 82 83 pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, 84 smap->map.max_entries); 85 return 0; 86 87 free_elems: 88 bpf_map_area_free(smap->elems); 89 return err; 90 } 91 92 /* Called from syscall */ 93 static struct bpf_map *stack_map_alloc(union bpf_attr *attr) 94 { 95 u32 value_size = attr->value_size; 96 struct bpf_stack_map *smap; 97 u64 cost, n_buckets; 98 int err; 99 100 if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) 101 return ERR_PTR(-EINVAL); 102 103 /* check sanity of attributes */ 104 if (attr->max_entries == 0 || attr->key_size != 4 || 105 value_size < 8 || value_size % 8) 106 return ERR_PTR(-EINVAL); 107 108 BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); 109 if (attr->map_flags & BPF_F_STACK_BUILD_ID) { 110 if (value_size % sizeof(struct bpf_stack_build_id) || 111 value_size / sizeof(struct bpf_stack_build_id) 112 > sysctl_perf_event_max_stack) 113 return ERR_PTR(-EINVAL); 114 } else if (value_size / 8 > sysctl_perf_event_max_stack) 115 return ERR_PTR(-EINVAL); 116 117 /* hash table size must be power of 2; roundup_pow_of_two() can overflow 118 * into UB on 32-bit arches, so check that first 119 */ 120 if (attr->max_entries > 1UL << 31) 121 return ERR_PTR(-E2BIG); 122 123 n_buckets = roundup_pow_of_two(attr->max_entries); 124 125 cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); 126 smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); 127 if (!smap) 128 return ERR_PTR(-ENOMEM); 129 130 bpf_map_init_from_attr(&smap->map, attr); 131 smap->n_buckets = n_buckets; 132 133 err = get_callchain_buffers(sysctl_perf_event_max_stack); 134 if (err) 135 goto free_smap; 136 137 err = prealloc_elems_and_freelist(smap); 138 if (err) 139 goto put_buffers; 140 141 return &smap->map; 142 143 put_buffers: 144 put_callchain_buffers(); 145 free_smap: 146 bpf_map_area_free(smap); 147 return ERR_PTR(err); 148 } 149 150 static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) 151 { 152 return may_fault ? build_id_parse(vma, build_id, NULL) 153 : build_id_parse_nofault(vma, build_id, NULL); 154 } 155 156 static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id) 157 { 158 id->status = BPF_STACK_BUILD_ID_IP; 159 memset(id->build_id, 0, BUILD_ID_SIZE_MAX); 160 } 161 162 static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff, 163 unsigned long vm_start, u64 ip) 164 { 165 return (vm_pgoff << PAGE_SHIFT) + ip - vm_start; 166 } 167 168 static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, 169 u64 offset, 170 const unsigned char *build_id) 171 { 172 id->status = BPF_STACK_BUILD_ID_VALID; 173 id->offset = offset; 174 if (id->build_id != build_id) 175 memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); 176 } 177 178 struct stack_map_vma_lock { 179 struct vm_area_struct *vma; 180 struct mm_struct *mm; 181 }; 182 183 /* 184 * Acquire a stable read-side reference on the VMA covering @ip. 185 * 186 * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read 187 * lock held and mmap_lock dropped, so the caller may sleep. 188 * 189 * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still 190 * held; the caller must snapshot any fields it needs and pin vm_file 191 * with get_file() before stack_map_unlock_vma() drops mmap_lock, as 192 * the VMA may be split, merged, or freed after that. 193 * 194 * Returns NULL on failure, in which case no lock is held. 195 */ 196 static struct vm_area_struct * 197 stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip) 198 { 199 struct mm_struct *mm = lock->mm; 200 struct vm_area_struct *vma; 201 202 /* noop under !CONFIG_PER_VMA_LOCK */ 203 vma = lock_vma_under_rcu(mm, ip); 204 if (vma) { 205 lock->vma = vma; 206 return vma; 207 } 208 209 /* 210 * Taking mmap_read_lock() is unsafe here, because the caller BPF 211 * program might already hold it, causing a deadlock. 212 */ 213 if (!mmap_read_trylock(mm)) 214 return NULL; 215 216 vma = vma_lookup(mm, ip); 217 if (!vma) { 218 mmap_read_unlock(mm); 219 return NULL; 220 } 221 222 #ifdef CONFIG_PER_VMA_LOCK 223 if (!vma_start_read_locked(vma)) { 224 mmap_read_unlock(mm); 225 return NULL; 226 } 227 mmap_read_unlock(mm); 228 #endif 229 230 lock->vma = vma; 231 return vma; 232 } 233 234 static void stack_map_unlock_vma(struct stack_map_vma_lock *lock) 235 { 236 #ifdef CONFIG_PER_VMA_LOCK 237 vma_end_read(lock->vma); 238 #else 239 mmap_read_unlock(lock->mm); 240 #endif 241 lock->vma = NULL; 242 } 243 244 static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs, 245 u32 trace_nr) 246 { 247 struct mm_struct *mm = current->mm; 248 struct stack_map_vma_lock lock = { .mm = mm }; 249 struct { 250 struct file *file; 251 const unsigned char *build_id; 252 unsigned long vm_start; 253 unsigned long vm_end; 254 unsigned long vm_pgoff; 255 } cache = {}; 256 unsigned long vm_pgoff, vm_start, vm_end; 257 struct vm_area_struct *vma; 258 struct file *file; 259 u64 offset; 260 u64 ip; 261 262 for (u32 i = 0; i < trace_nr; i++) { 263 ip = READ_ONCE(id_offs[i].ip); 264 265 /* 266 * Range cache fast path: if ip falls within the previously 267 * resolved VMA range, reuse the cache build_id without 268 * re-acquiring the VMA lock. 269 */ 270 if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) { 271 offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip); 272 stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); 273 continue; 274 } 275 276 vma = stack_map_lock_vma(&lock, ip); 277 if (!vma) { 278 stack_map_build_id_set_ip(&id_offs[i]); 279 continue; 280 } 281 if (vma_is_anonymous(vma) || !vma->vm_file) { 282 stack_map_build_id_set_ip(&id_offs[i]); 283 stack_map_unlock_vma(&lock); 284 continue; 285 } 286 287 file = vma->vm_file; 288 vm_pgoff = vma->vm_pgoff; 289 vm_start = vma->vm_start; 290 vm_end = vma->vm_end; 291 offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip); 292 293 /* 294 * Same backing file as previous (e.g. different VMAs 295 * of the same ELF binary). Reuse the cache build_id. 296 */ 297 if (file == cache.file) { 298 stack_map_unlock_vma(&lock); 299 stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); 300 cache.vm_start = vm_start; 301 cache.vm_end = vm_end; 302 cache.vm_pgoff = vm_pgoff; 303 continue; 304 } 305 306 file = get_file(file); 307 stack_map_unlock_vma(&lock); 308 309 /* build_id_parse_file() may block on filesystem reads */ 310 if (build_id_parse_file(file, id_offs[i].build_id, NULL)) { 311 stack_map_build_id_set_ip(&id_offs[i]); 312 fput(file); 313 continue; 314 } 315 316 stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); 317 if (cache.file) 318 fput(cache.file); 319 cache.file = file; 320 cache.build_id = id_offs[i].build_id; 321 cache.vm_start = vm_start; 322 cache.vm_end = vm_end; 323 cache.vm_pgoff = vm_pgoff; 324 } 325 326 if (cache.file) 327 fput(cache.file); 328 } 329 330 /* 331 * Expects all id_offs[i].ip values to be set to correct initial IPs. 332 * They will be subsequently: 333 * - either adjusted in place to a file offset, if build ID fetching 334 * succeeds; in this case id_offs[i].build_id is set to correct build ID, 335 * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID; 336 * - or IP will be kept intact, if build ID fetching failed; in this case 337 * id_offs[i].build_id is zeroed out and id_offs[i].status is set to 338 * BPF_STACK_BUILD_ID_IP. 339 */ 340 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, 341 u32 trace_nr, bool user, bool may_fault) 342 { 343 struct mmap_unlock_irq_work *work = NULL; 344 bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); 345 bool has_user_ctx = user && current && current->mm; 346 struct vm_area_struct *vma, *prev_vma = NULL; 347 const unsigned char *prev_build_id = NULL; 348 int i; 349 350 if (may_fault && has_user_ctx) { 351 stack_map_get_build_id_offset_sleepable(id_offs, trace_nr); 352 return; 353 } 354 355 /* If the irq_work is in use, fall back to report ips. Same 356 * fallback is used for kernel stack (!user) on a stackmap with 357 * build_id. 358 */ 359 if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) { 360 /* cannot access current->mm, fall back to ips */ 361 for (i = 0; i < trace_nr; i++) 362 stack_map_build_id_set_ip(&id_offs[i]); 363 return; 364 } 365 366 for (i = 0; i < trace_nr; i++) { 367 u64 ip = READ_ONCE(id_offs[i].ip); 368 u64 offset; 369 370 if (prev_build_id && range_in_vma(prev_vma, ip, ip)) { 371 vma = prev_vma; 372 offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); 373 stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id); 374 continue; 375 } 376 vma = find_vma(current->mm, ip); 377 if (!vma || vma_is_anonymous(vma) || 378 fetch_build_id(vma, id_offs[i].build_id, may_fault)) { 379 /* per entry fall back to ips */ 380 stack_map_build_id_set_ip(&id_offs[i]); 381 prev_vma = vma; 382 prev_build_id = NULL; 383 continue; 384 } 385 offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); 386 stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); 387 prev_vma = vma; 388 prev_build_id = id_offs[i].build_id; 389 } 390 bpf_mmap_unlock_mm(work, current->mm); 391 } 392 393 static struct perf_callchain_entry * 394 get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) 395 { 396 #ifdef CONFIG_STACKTRACE 397 struct perf_callchain_entry *entry; 398 int rctx; 399 400 entry = get_callchain_entry(&rctx); 401 402 if (!entry) 403 return NULL; 404 405 entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, 406 max_depth, 0); 407 408 /* stack_trace_save_tsk() works on unsigned long array, while 409 * perf_callchain_entry uses u64 array. For 32-bit systems, it is 410 * necessary to fix this mismatch. 411 */ 412 if (__BITS_PER_LONG != 64) { 413 unsigned long *from = (unsigned long *) entry->ip; 414 u64 *to = entry->ip; 415 int i; 416 417 /* copy data from the end to avoid using extra buffer */ 418 for (i = entry->nr - 1; i >= 0; i--) 419 to[i] = (u64)(from[i]); 420 } 421 422 put_callchain_entry(rctx); 423 424 return entry; 425 #else /* CONFIG_STACKTRACE */ 426 return NULL; 427 #endif 428 } 429 430 static long __bpf_get_stackid(struct bpf_map *map, 431 struct perf_callchain_entry *trace, u64 flags) 432 { 433 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); 434 struct stack_map_bucket *bucket, *new_bucket, *old_bucket; 435 u32 hash, id, trace_nr, trace_len, i, max_depth; 436 u32 skip = flags & BPF_F_SKIP_FIELD_MASK; 437 bool user = flags & BPF_F_USER_STACK; 438 u64 *ips; 439 bool hash_matches; 440 441 if (trace->nr <= skip) 442 /* skipping more than usable stack trace */ 443 return -EFAULT; 444 445 max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); 446 trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); 447 trace_len = trace_nr * sizeof(u64); 448 ips = trace->ip + skip; 449 hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); 450 id = hash & (smap->n_buckets - 1); 451 bucket = READ_ONCE(smap->buckets[id]); 452 453 hash_matches = bucket && bucket->hash == hash; 454 /* fast cmp */ 455 if (hash_matches && flags & BPF_F_FAST_STACK_CMP) 456 return id; 457 458 if (stack_map_use_build_id(map)) { 459 struct bpf_stack_build_id *id_offs; 460 461 /* for build_id+offset, pop a bucket before slow cmp */ 462 new_bucket = (struct stack_map_bucket *) 463 pcpu_freelist_pop(&smap->freelist); 464 if (unlikely(!new_bucket)) 465 return -ENOMEM; 466 new_bucket->nr = trace_nr; 467 id_offs = (struct bpf_stack_build_id *)new_bucket->data; 468 for (i = 0; i < trace_nr; i++) 469 id_offs[i].ip = ips[i]; 470 stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); 471 trace_len = trace_nr * sizeof(struct bpf_stack_build_id); 472 if (hash_matches && bucket->nr == trace_nr && 473 memcmp(bucket->data, new_bucket->data, trace_len) == 0) { 474 pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); 475 return id; 476 } 477 if (bucket && !(flags & BPF_F_REUSE_STACKID)) { 478 pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); 479 return -EEXIST; 480 } 481 } else { 482 if (hash_matches && bucket->nr == trace_nr && 483 memcmp(bucket->data, ips, trace_len) == 0) 484 return id; 485 if (bucket && !(flags & BPF_F_REUSE_STACKID)) 486 return -EEXIST; 487 488 new_bucket = (struct stack_map_bucket *) 489 pcpu_freelist_pop(&smap->freelist); 490 if (unlikely(!new_bucket)) 491 return -ENOMEM; 492 memcpy(new_bucket->data, ips, trace_len); 493 } 494 495 new_bucket->hash = hash; 496 new_bucket->nr = trace_nr; 497 498 old_bucket = xchg(&smap->buckets[id], new_bucket); 499 if (old_bucket) 500 pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); 501 return id; 502 } 503 504 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, 505 u64, flags) 506 { 507 u32 elem_size = stack_map_data_size(map); 508 bool user = flags & BPF_F_USER_STACK; 509 struct perf_callchain_entry *trace; 510 bool kernel = !user; 511 u32 max_depth; 512 513 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | 514 BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) 515 return -EINVAL; 516 517 max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); 518 trace = get_perf_callchain(regs, kernel, user, max_depth, 519 false, false, 0); 520 521 if (unlikely(!trace)) 522 /* couldn't fetch the stack trace */ 523 return -EFAULT; 524 525 return __bpf_get_stackid(map, trace, flags); 526 } 527 528 const struct bpf_func_proto bpf_get_stackid_proto = { 529 .func = bpf_get_stackid, 530 .gpl_only = true, 531 .ret_type = RET_INTEGER, 532 .arg1_type = ARG_PTR_TO_CTX, 533 .arg2_type = ARG_CONST_MAP_PTR, 534 .arg3_type = ARG_ANYTHING, 535 }; 536 537 static __u64 count_kernel_ip(struct perf_callchain_entry *trace) 538 { 539 __u64 nr_kernel = 0; 540 541 while (nr_kernel < trace->nr) { 542 if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) 543 break; 544 nr_kernel++; 545 } 546 return nr_kernel; 547 } 548 549 BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, 550 struct bpf_map *, map, u64, flags) 551 { 552 struct perf_event *event = ctx->event; 553 struct perf_callchain_entry *trace; 554 bool kernel, user; 555 __u64 nr_kernel; 556 int ret; 557 558 /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ 559 if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 560 return bpf_get_stackid((unsigned long)(ctx->regs), 561 (unsigned long) map, flags, 0, 0); 562 563 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | 564 BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) 565 return -EINVAL; 566 567 user = flags & BPF_F_USER_STACK; 568 kernel = !user; 569 570 trace = ctx->data->callchain; 571 if (unlikely(!trace)) 572 return -EFAULT; 573 574 nr_kernel = count_kernel_ip(trace); 575 __u64 nr = trace->nr; /* save original */ 576 577 if (kernel) { 578 trace->nr = nr_kernel; 579 ret = __bpf_get_stackid(map, trace, flags); 580 } else { /* user */ 581 u64 skip = flags & BPF_F_SKIP_FIELD_MASK; 582 583 skip += nr_kernel; 584 if (skip > BPF_F_SKIP_FIELD_MASK) 585 return -EFAULT; 586 587 flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; 588 ret = __bpf_get_stackid(map, trace, flags); 589 } 590 591 /* restore nr */ 592 trace->nr = nr; 593 594 return ret; 595 } 596 597 const struct bpf_func_proto bpf_get_stackid_proto_pe = { 598 .func = bpf_get_stackid_pe, 599 .gpl_only = false, 600 .ret_type = RET_INTEGER, 601 .arg1_type = ARG_PTR_TO_CTX, 602 .arg2_type = ARG_CONST_MAP_PTR, 603 .arg3_type = ARG_ANYTHING, 604 }; 605 606 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, 607 struct perf_callchain_entry *trace_in, 608 void *buf, u32 size, u64 flags, bool may_fault) 609 { 610 u32 trace_nr, copy_len, elem_size, max_depth; 611 bool user_build_id = flags & BPF_F_USER_BUILD_ID; 612 bool crosstask = task && task != current; 613 u32 skip = flags & BPF_F_SKIP_FIELD_MASK; 614 bool user = flags & BPF_F_USER_STACK; 615 struct perf_callchain_entry *trace; 616 bool kernel = !user; 617 int err = -EINVAL; 618 u64 *ips; 619 620 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | 621 BPF_F_USER_BUILD_ID))) 622 goto clear; 623 if (kernel && user_build_id) 624 goto clear; 625 626 elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); 627 if (unlikely(size % elem_size)) 628 goto clear; 629 630 /* cannot get valid user stack for task without user_mode regs */ 631 if (task && user && !user_mode(regs)) 632 goto err_fault; 633 634 /* get_perf_callchain does not support crosstask user stack walking 635 * but returns an empty stack instead of NULL. 636 */ 637 if (crosstask && user) { 638 err = -EOPNOTSUPP; 639 goto clear; 640 } 641 642 max_depth = stack_map_calculate_max_depth(size, elem_size, flags); 643 644 if (may_fault) 645 rcu_read_lock(); /* need RCU for perf's callchain below */ 646 647 if (trace_in) { 648 trace = trace_in; 649 trace->nr = min_t(u32, trace->nr, max_depth); 650 } else if (kernel && task) { 651 trace = get_callchain_entry_for_task(task, max_depth); 652 } else { 653 trace = get_perf_callchain(regs, kernel, user, max_depth, 654 crosstask, false, 0); 655 } 656 657 if (unlikely(!trace) || trace->nr < skip) { 658 if (may_fault) 659 rcu_read_unlock(); 660 goto err_fault; 661 } 662 663 trace_nr = trace->nr - skip; 664 copy_len = trace_nr * elem_size; 665 666 ips = trace->ip + skip; 667 if (user_build_id) { 668 struct bpf_stack_build_id *id_offs = buf; 669 u32 i; 670 671 for (i = 0; i < trace_nr; i++) 672 id_offs[i].ip = ips[i]; 673 } else { 674 memcpy(buf, ips, copy_len); 675 } 676 677 /* trace/ips should not be dereferenced after this point */ 678 if (may_fault) 679 rcu_read_unlock(); 680 681 if (user_build_id) 682 stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); 683 684 if (size > copy_len) 685 memset(buf + copy_len, 0, size - copy_len); 686 return copy_len; 687 688 err_fault: 689 err = -EFAULT; 690 clear: 691 memset(buf, 0, size); 692 return err; 693 } 694 695 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, 696 u64, flags) 697 { 698 return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); 699 } 700 701 const struct bpf_func_proto bpf_get_stack_proto = { 702 .func = bpf_get_stack, 703 .gpl_only = true, 704 .ret_type = RET_INTEGER, 705 .arg1_type = ARG_PTR_TO_CTX, 706 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 707 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 708 .arg4_type = ARG_ANYTHING, 709 }; 710 711 BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, 712 u64, flags) 713 { 714 return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); 715 } 716 717 const struct bpf_func_proto bpf_get_stack_sleepable_proto = { 718 .func = bpf_get_stack_sleepable, 719 .gpl_only = true, 720 .ret_type = RET_INTEGER, 721 .arg1_type = ARG_PTR_TO_CTX, 722 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 723 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 724 .arg4_type = ARG_ANYTHING, 725 }; 726 727 static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, 728 u64 flags, bool may_fault) 729 { 730 struct pt_regs *regs; 731 long res = -EINVAL; 732 733 if (!try_get_task_stack(task)) 734 return -EFAULT; 735 736 regs = task_pt_regs(task); 737 if (regs) 738 res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); 739 put_task_stack(task); 740 741 return res; 742 } 743 744 BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, 745 u32, size, u64, flags) 746 { 747 return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); 748 } 749 750 const struct bpf_func_proto bpf_get_task_stack_proto = { 751 .func = bpf_get_task_stack, 752 .gpl_only = false, 753 .ret_type = RET_INTEGER, 754 .arg1_type = ARG_PTR_TO_BTF_ID, 755 .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], 756 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 757 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 758 .arg4_type = ARG_ANYTHING, 759 }; 760 761 BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, 762 u32, size, u64, flags) 763 { 764 return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */); 765 } 766 767 const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { 768 .func = bpf_get_task_stack_sleepable, 769 .gpl_only = false, 770 .ret_type = RET_INTEGER, 771 .arg1_type = ARG_PTR_TO_BTF_ID, 772 .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], 773 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 774 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 775 .arg4_type = ARG_ANYTHING, 776 }; 777 778 BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, 779 void *, buf, u32, size, u64, flags) 780 { 781 struct pt_regs *regs = (struct pt_regs *)(ctx->regs); 782 struct perf_event *event = ctx->event; 783 struct perf_callchain_entry *trace; 784 bool kernel, user; 785 int err = -EINVAL; 786 __u64 nr_kernel; 787 788 if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 789 return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); 790 791 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | 792 BPF_F_USER_BUILD_ID))) 793 goto clear; 794 795 user = flags & BPF_F_USER_STACK; 796 kernel = !user; 797 798 err = -EFAULT; 799 trace = ctx->data->callchain; 800 if (unlikely(!trace)) 801 goto clear; 802 803 nr_kernel = count_kernel_ip(trace); 804 805 if (kernel) { 806 __u64 nr = trace->nr; 807 808 trace->nr = nr_kernel; 809 err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); 810 811 /* restore nr */ 812 trace->nr = nr; 813 } else { /* user */ 814 u64 skip = flags & BPF_F_SKIP_FIELD_MASK; 815 816 skip += nr_kernel; 817 if (skip > BPF_F_SKIP_FIELD_MASK) 818 goto clear; 819 820 flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; 821 err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); 822 } 823 return err; 824 825 clear: 826 memset(buf, 0, size); 827 return err; 828 829 } 830 831 const struct bpf_func_proto bpf_get_stack_proto_pe = { 832 .func = bpf_get_stack_pe, 833 .gpl_only = true, 834 .ret_type = RET_INTEGER, 835 .arg1_type = ARG_PTR_TO_CTX, 836 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 837 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 838 .arg4_type = ARG_ANYTHING, 839 }; 840 841 /* Called from eBPF program */ 842 static void *stack_map_lookup_elem(struct bpf_map *map, void *key) 843 { 844 return ERR_PTR(-EOPNOTSUPP); 845 } 846 847 /* Called from syscall */ 848 static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key, 849 void *value, u64 flags) 850 { 851 return bpf_stackmap_extract(map, key, value, true); 852 } 853 854 /* Called from syscall */ 855 int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, 856 bool delete) 857 { 858 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); 859 struct stack_map_bucket *bucket, *old_bucket; 860 u32 id = *(u32 *)key, trace_len; 861 862 if (unlikely(id >= smap->n_buckets)) 863 return -ENOENT; 864 865 bucket = xchg(&smap->buckets[id], NULL); 866 if (!bucket) 867 return -ENOENT; 868 869 trace_len = bucket->nr * stack_map_data_size(map); 870 memcpy(value, bucket->data, trace_len); 871 memset(value + trace_len, 0, map->value_size - trace_len); 872 873 if (delete) 874 old_bucket = bucket; 875 else 876 old_bucket = xchg(&smap->buckets[id], bucket); 877 if (old_bucket) 878 pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); 879 return 0; 880 } 881 882 static int stack_map_get_next_key(struct bpf_map *map, void *key, 883 void *next_key) 884 { 885 struct bpf_stack_map *smap = container_of(map, 886 struct bpf_stack_map, map); 887 u32 id; 888 889 WARN_ON_ONCE(!rcu_read_lock_held()); 890 891 if (!key) { 892 id = 0; 893 } else { 894 id = *(u32 *)key; 895 if (id >= smap->n_buckets || !smap->buckets[id]) 896 id = 0; 897 else 898 id++; 899 } 900 901 while (id < smap->n_buckets && !smap->buckets[id]) 902 id++; 903 904 if (id >= smap->n_buckets) 905 return -ENOENT; 906 907 *(u32 *)next_key = id; 908 return 0; 909 } 910 911 static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, 912 u64 map_flags) 913 { 914 return -EINVAL; 915 } 916 917 /* Called from syscall or from eBPF program */ 918 static long stack_map_delete_elem(struct bpf_map *map, void *key) 919 { 920 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); 921 struct stack_map_bucket *old_bucket; 922 u32 id = *(u32 *)key; 923 924 if (unlikely(id >= smap->n_buckets)) 925 return -E2BIG; 926 927 old_bucket = xchg(&smap->buckets[id], NULL); 928 if (old_bucket) { 929 pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); 930 return 0; 931 } else { 932 return -ENOENT; 933 } 934 } 935 936 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ 937 static void stack_map_free(struct bpf_map *map) 938 { 939 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); 940 941 bpf_map_area_free(smap->elems); 942 pcpu_freelist_destroy(&smap->freelist); 943 bpf_map_area_free(smap); 944 put_callchain_buffers(); 945 } 946 947 static u64 stack_map_mem_usage(const struct bpf_map *map) 948 { 949 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); 950 u64 value_size = map->value_size; 951 u64 n_buckets = smap->n_buckets; 952 u64 enties = map->max_entries; 953 u64 usage = sizeof(*smap); 954 955 usage += n_buckets * sizeof(struct stack_map_bucket *); 956 usage += enties * (sizeof(struct stack_map_bucket) + value_size); 957 return usage; 958 } 959 960 BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) 961 const struct bpf_map_ops stack_trace_map_ops = { 962 .map_meta_equal = bpf_map_meta_equal, 963 .map_alloc = stack_map_alloc, 964 .map_free = stack_map_free, 965 .map_get_next_key = stack_map_get_next_key, 966 .map_lookup_elem = stack_map_lookup_elem, 967 .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem, 968 .map_update_elem = stack_map_update_elem, 969 .map_delete_elem = stack_map_delete_elem, 970 .map_check_btf = map_check_no_btf, 971 .map_mem_usage = stack_map_mem_usage, 972 .map_btf_id = &stack_trace_map_btf_ids[0], 973 }; 974