/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK					\
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
	 BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 data[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
	return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
	return stack_map_use_build_id(map) ?
		sizeof(struct bpf_stack_build_id) : sizeof(u64);
}

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
					 smap->map.numa_node);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8)
		return ERR_PTR(-EINVAL);

	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
		if (value_size % sizeof(struct bpf_stack_build_id) ||
		    value_size / sizeof(struct bpf_stack_build_id)
		    > sysctl_perf_event_max_stack)
			return ERR_PTR(-EINVAL);
	} else if (value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!smap)
		return ERR_PTR(-ENOMEM);

	err = -E2BIG;
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_smap;

	bpf_map_init_from_attr(&smap->map, attr);
	smap->map.value_size = value_size;
	smap->n_buckets = n_buckets;
	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = bpf_map_precharge_memlock(smap->map.pages);
	if (err)
		goto free_smap;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
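
/*
 * Illustrative sketch (not part of this file): user-space map attributes
 * that satisfy the checks in stack_map_alloc() above. The key is a 4-byte
 * stack id; value_size / 8 bounds the number of stored frames and must not
 * exceed sysctl_perf_event_max_stack. With BPF_F_STACK_BUILD_ID set,
 * value_size must instead be a multiple of sizeof(struct bpf_stack_build_id).
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_STACK_TRACE,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = 127 * sizeof(__u64),
 *		.max_entries = 10000,
 *	};
 *
 * max_entries is rounded up to the next power of two for the bucket array.
 */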

#define BPF_BUILD_ID 3
/*
 * Parse build id from the note segment. This logic can be shared between
 * 32-bit and 64-bit systems, because Elf32_Nhdr and Elf64_Nhdr are
 * identical.
 */
static inline int stack_map_parse_build_id(void *page_addr,
					   unsigned char *build_id,
					   void *note_start,
					   Elf32_Word note_size)
{
	Elf32_Word note_offs = 0, new_offs;

	/* check for overflow */
	if (note_start < page_addr || note_start + note_size < note_start)
		return -EINVAL;

	/* only supports note that fits in the first page */
	if (note_start + note_size > page_addr + PAGE_SIZE)
		return -EINVAL;

	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

		if (nhdr->n_type == BPF_BUILD_ID &&
		    nhdr->n_namesz == sizeof("GNU") &&
		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
			memcpy(build_id,
			       note_start + note_offs +
			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
			       BPF_BUILD_ID_SIZE);
			return 0;
		}
		new_offs = note_offs + sizeof(Elf32_Nhdr) +
			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
		if (new_offs <= note_offs) /* overflow */
			break;
		note_offs = new_offs;
	}
	return -EINVAL;
}
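
/*
 * For reference, each note entry walked above has the standard ELF layout:
 * an Elf32_Nhdr (n_namesz, n_descsz, n_type), then the name ("GNU\0")
 * padded to a 4-byte boundary, then the descriptor, which for a GNU
 * build-id note is the raw build id. That is why the copy starts
 * sizeof(Elf32_Nhdr) + ALIGN(sizeof("GNU"), 4) bytes into the entry.
 */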

/* Parse build ID from 32-bit ELF */
static int stack_map_get_build_id_32(void *page_addr,
				     unsigned char *build_id)
{
	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
	Elf32_Phdr *phdr;
	int i;

	/* only supports phdr that fits in one page */
	if (ehdr->e_phnum >
	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
		return -EINVAL;

	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));

	for (i = 0; i < ehdr->e_phnum; ++i)
		if (phdr[i].p_type == PT_NOTE)
			return stack_map_parse_build_id(page_addr, build_id,
					page_addr + phdr[i].p_offset,
					phdr[i].p_filesz);
	return -EINVAL;
}

/* Parse build ID from 64-bit ELF */
static int stack_map_get_build_id_64(void *page_addr,
				     unsigned char *build_id)
{
	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
	Elf64_Phdr *phdr;
	int i;

	/* only supports phdr that fits in one page */
	if (ehdr->e_phnum >
	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
		return -EINVAL;

	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));

	for (i = 0; i < ehdr->e_phnum; ++i)
		if (phdr[i].p_type == PT_NOTE)
			return stack_map_parse_build_id(page_addr, build_id,
					page_addr + phdr[i].p_offset,
					phdr[i].p_filesz);
	return -EINVAL;
}

/* Parse build ID of ELF file mapped to vma */
static int stack_map_get_build_id(struct vm_area_struct *vma,
				  unsigned char *build_id)
{
	Elf32_Ehdr *ehdr;
	struct page *page;
	void *page_addr;
	int ret;

	/* only works for page backed storage */
	if (!vma->vm_file)
		return -EINVAL;

	page = find_get_page(vma->vm_file->f_mapping, 0);
	if (!page)
		return -EFAULT;	/* page not mapped */

	ret = -EINVAL;
	page_addr = page_address(page);
	ehdr = (Elf32_Ehdr *)page_addr;

	/* compare magic x7f "ELF" */
	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
		goto out;

	/* only support executable file and shared object file */
	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
		goto out;

	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
		ret = stack_map_get_build_id_32(page_addr, build_id);
	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
		ret = stack_map_get_build_id_64(page_addr, build_id);
out:
	put_page(page);
	return ret;
}

static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
					  u64 *ips, u32 trace_nr, bool user)
{
	int i;
	struct vm_area_struct *vma;

	/*
	 * We cannot do up_read() in nmi context, so build_id lookup is
	 * only supported for non-nmi events. If at some point, it is
	 * possible to run find_vma() without taking the semaphore, we
	 * would like to allow build_id lookup in nmi context.
	 *
	 * Same fallback is used for kernel stack (!user) on a stackmap
	 * with build_id.
	 */
	if (!user || !current || !current->mm || in_nmi() ||
	    down_read_trylock(&current->mm->mmap_sem) == 0) {
		/* cannot access current->mm, fall back to ips */
		for (i = 0; i < trace_nr; i++) {
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
		}
		return;
	}

	for (i = 0; i < trace_nr; i++) {
		vma = find_vma(current->mm, ips[i]);
		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
			/* per entry fall back to ips */
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			continue;
		}
		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
			- vma->vm_start;
		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
	}
	up_read(&current->mm->mmap_sem);
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct perf_callchain_entry *trace;
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	bool kernel = !user;
	u64 *ips;
	bool hash_matches;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	hash_matches = bucket && bucket->hash == hash;
	/* fast cmp */
	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
		return id;

	if (stack_map_use_build_id(map)) {
		/* for build_id+offset, pop a bucket before slow cmp */
		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		new_bucket->nr = trace_nr;
		stack_map_get_build_id_offset(
			(struct bpf_stack_build_id *)new_bucket->data,
			ips, trace_nr, user);
		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return id;
		}
		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return -EEXIST;
		}
	} else {
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, ips, trace_len) == 0)
			return id;
		if (bucket && !(flags & BPF_F_REUSE_STACKID))
			return -EEXIST;

		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		memcpy(new_bucket->data, ips, trace_len);
	}

	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
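
/*
 * Illustrative sketch (not part of this file): a BPF tracing program would
 * typically call the helper above as
 *
 *	int id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
 *
 * where stack_traces is a hypothetical BPF_MAP_TYPE_STACK_TRACE map. A
 * non-negative id names a bucket that user space can read back through the
 * map lookup syscall path (bpf_stackmap_copy() below); negative values are
 * the errors returned above (-EFAULT, -EEXIST, -ENOMEM, ...).
 */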

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	if (kernel && user_build_id)
		goto clear;

	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
					    : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	num_elem = size / elem_size;
	if (sysctl_perf_event_max_stack < num_elem)
		init_nr = 0;
	else
		init_nr = sysctl_perf_event_max_stack - num_elem;
	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);
	if (unlikely(!trace))
		goto err_fault;

	trace_nr = trace->nr - init_nr;
	if (trace_nr < skip)
		goto err_fault;

	trace_nr -= skip;
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
	copy_len = trace_nr * elem_size;
	ips = trace->ip + skip + init_nr;
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);

	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);
	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}

const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
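
/*
 * Illustrative sketch (not part of this file): unlike bpf_get_stackid(),
 * bpf_get_stack() copies the trace directly into program-supplied memory,
 * for example
 *
 *	__u64 ips[64];
 *	long len = bpf_get_stack(ctx, ips, sizeof(ips), BPF_F_USER_STACK);
 *
 * On success, len is the number of bytes written; any remainder of the
 * buffer is zeroed.
 */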

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * stack_map_data_size(map);
	memcpy(value, bucket->data, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}

static int stack_map_get_next_key(struct bpf_map *map, void *key,
				  void *next_key)
{
	struct bpf_stack_map *smap = container_of(map,
						  struct bpf_stack_map, map);
	u32 id;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!key) {
		id = 0;
	} else {
		id = *(u32 *)key;
		if (id >= smap->n_buckets || !smap->buckets[id])
			id = 0;
		else
			id++;
	}

	while (id < smap->n_buckets && !smap->buckets[id])
		id++;

	if (id >= smap->n_buckets)
		return -ENOENT;

	*(u32 *)next_key = id;
	return 0;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	/* wait for bpf programs to complete before freeing stack map */
	synchronize_rcu();

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}

const struct bpf_map_ops stack_map_ops = {
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
};