1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Facebook 3 4 #include <stdint.h> 5 #include <stddef.h> 6 #include <stdbool.h> 7 #include <linux/bpf.h> 8 #include <linux/ptrace.h> 9 #include <linux/sched.h> 10 #include <linux/types.h> 11 #include <bpf/bpf_helpers.h> 12 13 #include "bpf_compiler.h" 14 15 typedef uint32_t pid_t; 16 struct task_struct {}; 17 18 #define TASK_COMM_LEN 16 19 #define PERF_MAX_STACK_DEPTH 127 20 21 #define STROBE_TYPE_INVALID 0 22 #define STROBE_TYPE_INT 1 23 #define STROBE_TYPE_STR 2 24 #define STROBE_TYPE_MAP 3 25 26 #define STACK_TABLE_EPOCH_SHIFT 20 27 #define STROBE_MAX_STR_LEN 1 28 #define STROBE_MAX_CFGS 32 29 #define READ_MAP_VAR_PAYLOAD_CAP \ 30 ((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 31 #define STROBE_MAX_PAYLOAD \ 32 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 33 STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP) 34 35 struct strobe_value_header { 36 /* 37 * meaning depends on type: 38 * 1. int: 0, if value not set, 1 otherwise 39 * 2. str: 1 always, whether value is set or not is determined by ptr 40 * 3. map: 1 always, pointer points to additional struct with number 41 * of entries (up to STROBE_MAX_MAP_ENTRIES) 42 */ 43 uint16_t len; 44 /* 45 * _reserved might be used for some future fields/flags, but we always 46 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16 47 * bytes in one go and get both header and value 48 */ 49 uint8_t _reserved[6]; 50 }; 51 52 /* 53 * strobe_value_generic is used from BPF probe only, but needs to be a union 54 * of strobe_value_int/strobe_value_str/strobe_value_map 55 */ 56 struct strobe_value_generic { 57 struct strobe_value_header header; 58 union { 59 int64_t val; 60 void *ptr; 61 }; 62 }; 63 64 struct strobe_value_int { 65 struct strobe_value_header header; 66 int64_t value; 67 }; 68 69 struct strobe_value_str { 70 struct strobe_value_header header; 71 const char* value; 72 }; 73 74 struct strobe_value_map { 75 struct strobe_value_header header; 76 const struct strobe_map_raw* value; 77 }; 78 79 struct strobe_map_entry { 80 const char* key; 81 const char* val; 82 }; 83 84 /* 85 * Map of C-string key/value pairs with fixed maximum capacity. Each map has 86 * corresponding int64 ID, which application can use (or ignore) in whatever 87 * way appropriate. Map is "write-only", there is no way to get data out of 88 * map. Map is intended to be used to provide metadata for profilers and is 89 * not to be used for internal in-app communication. All methods are 90 * thread-safe. 91 */ 92 struct strobe_map_raw { 93 /* 94 * general purpose unique ID that's up to application to decide 95 * whether and how to use; for request metadata use case id is unique 96 * request ID that's used to match metadata with stack traces on 97 * Strobelight backend side 98 */ 99 int64_t id; 100 /* number of used entries in map */ 101 int64_t cnt; 102 /* 103 * having volatile doesn't change anything on BPF side, but clang 104 * emits warnings for passing `volatile const char *` into 105 * bpf_probe_read_user_str that expects just `const char *` 106 */ 107 const char* tag; 108 /* 109 * key/value entries, each consisting of 2 pointers to key and value 110 * C strings 111 */ 112 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; 113 }; 114 115 /* Following values define supported values of TLS mode */ 116 #define TLS_NOT_SET -1 117 #define TLS_LOCAL_EXEC 0 118 #define TLS_IMM_EXEC 1 119 #define TLS_GENERAL_DYN 2 120 121 /* 122 * structure that universally represents TLS location (both for static 123 * executables and shared libraries) 124 */ 125 struct strobe_value_loc { 126 /* 127 * tls_mode defines what TLS mode was used for particular metavariable: 128 * - -1 (TLS_NOT_SET) - no metavariable; 129 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode; 130 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode; 131 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode; 132 * Local Dynamic mode is not yet supported, because never seen in 133 * practice. Mode defines how offset field is interpreted. See 134 * calc_location() in below for details. 135 */ 136 int64_t tls_mode; 137 /* 138 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64, 139 * tpidr_el0 for aarch64). 140 * TLS_IMM_EXEC: absolute address of GOT entry containing offset 141 * from thread pointer; 142 * TLS_GENERAL_DYN: absolute address of double GOT entry 143 * containing tls_index_t struct; 144 */ 145 int64_t offset; 146 }; 147 148 struct strobemeta_cfg { 149 int64_t req_meta_idx; 150 struct strobe_value_loc int_locs[STROBE_MAX_INTS]; 151 struct strobe_value_loc str_locs[STROBE_MAX_STRS]; 152 struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; 153 }; 154 155 struct strobe_map_descr { 156 uint64_t id; 157 int16_t tag_len; 158 /* 159 * cnt <0 - map value isn't set; 160 * 0 - map has id set, but no key/value entries 161 */ 162 int16_t cnt; 163 /* 164 * both key_lens[i] and val_lens[i] should be >0 for present key/value 165 * entry 166 */ 167 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; 168 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; 169 }; 170 171 struct strobemeta_payload { 172 /* req_id has valid request ID, if req_meta_valid == 1 */ 173 int64_t req_id; 174 uint8_t req_meta_valid; 175 /* 176 * mask has Nth bit set to 1, if Nth metavar was present and 177 * successfully read 178 */ 179 uint64_t int_vals_set_mask; 180 int64_t int_vals[STROBE_MAX_INTS]; 181 /* len is >0 for present values */ 182 uint16_t str_lens[STROBE_MAX_STRS]; 183 /* if map_descrs[i].cnt == -1, metavar is not present/set */ 184 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; 185 /* 186 * payload has compactly packed values of str and map variables in the 187 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 188 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines 189 * value length 190 */ 191 char payload[STROBE_MAX_PAYLOAD]; 192 }; 193 194 struct strobelight_bpf_sample { 195 uint64_t ktime; 196 char comm[TASK_COMM_LEN]; 197 pid_t pid; 198 int user_stack_id; 199 int kernel_stack_id; 200 int has_meta; 201 struct strobemeta_payload metadata; 202 /* 203 * makes it possible to pass (<real payload size> + 1) as data size to 204 * perf_submit() to avoid perf_submit's paranoia about passing zero as 205 * size, as it deduces that <real payload size> might be 206 * **theoretically** zero 207 */ 208 char dummy_safeguard; 209 }; 210 211 struct { 212 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 213 __uint(max_entries, 32); 214 __uint(key_size, sizeof(int)); 215 __uint(value_size, sizeof(int)); 216 } samples SEC(".maps"); 217 218 struct { 219 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 220 __uint(max_entries, 16); 221 __uint(key_size, sizeof(uint32_t)); 222 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 223 } stacks_0 SEC(".maps"); 224 225 struct { 226 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 227 __uint(max_entries, 16); 228 __uint(key_size, sizeof(uint32_t)); 229 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH); 230 } stacks_1 SEC(".maps"); 231 232 struct { 233 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 234 __uint(max_entries, 1); 235 __type(key, uint32_t); 236 __type(value, struct strobelight_bpf_sample); 237 } sample_heap SEC(".maps"); 238 239 struct { 240 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 241 __uint(max_entries, STROBE_MAX_CFGS); 242 __type(key, pid_t); 243 __type(value, struct strobemeta_cfg); 244 } strobemeta_cfgs SEC(".maps"); 245 246 /* Type for the dtv. */ 247 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */ 248 typedef union dtv { 249 size_t counter; 250 struct { 251 void* val; 252 bool is_static; 253 } pointer; 254 } dtv_t; 255 256 /* Partial definition for tcbhead_t */ 257 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ 258 struct tcbhead { 259 void* tcb; 260 dtv_t* dtv; 261 }; 262 263 /* 264 * TLS module/offset information for shared library case. 265 * For x86-64, this is mapped onto two entries in GOT. 266 * For aarch64, this is pointed to by second GOT entry. 267 */ 268 struct tls_index { 269 uint64_t module; 270 uint64_t offset; 271 }; 272 273 #ifdef SUBPROGS 274 __noinline 275 #else 276 __always_inline 277 #endif 278 static void *calc_location(struct strobe_value_loc *loc, void *tls_base) 279 { 280 /* 281 * tls_mode value is: 282 * - -1 (TLS_NOT_SET), if no metavar is present; 283 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS 284 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); 285 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; 286 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; 287 * This schema allows to use something like: 288 * (tls_mode + 1) * (tls_base + offset) 289 * to get NULL for "no metavar" location, or correct pointer for local 290 * executable mode without doing extra ifs. 291 */ 292 if (loc->tls_mode <= TLS_LOCAL_EXEC) { 293 /* static executable is simple, we just have offset from 294 * tls_base */ 295 void *addr = tls_base + loc->offset; 296 /* multiply by (tls_mode + 1) to get NULL, if we have no 297 * metavar in this slot */ 298 return (void *)((loc->tls_mode + 1) * (int64_t)addr); 299 } 300 /* 301 * Other modes are more complicated, we need to jump through few hoops. 302 * 303 * For immediate executable mode (currently supported only for aarch64): 304 * - loc->offset is pointing to a GOT entry containing fixed offset 305 * relative to tls_base; 306 * 307 * For general dynamic mode: 308 * - loc->offset is pointing to a beginning of double GOT entries; 309 * - (for aarch64 only) second entry points to tls_index_t struct; 310 * - (for x86-64 only) two GOT entries are already tls_index_t; 311 * - tls_index_t->module is used to find start of TLS section in 312 * which variable resides; 313 * - tls_index_t->offset provides offset within that TLS section, 314 * pointing to value of variable. 315 */ 316 struct tls_index tls_index; 317 dtv_t *dtv; 318 void *tls_ptr; 319 320 bpf_probe_read_user(&tls_index, sizeof(struct tls_index), 321 (void *)loc->offset); 322 /* valid module index is always positive */ 323 if (tls_index.module > 0) { 324 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ 325 bpf_probe_read_user(&dtv, sizeof(dtv), 326 &((struct tcbhead *)tls_base)->dtv); 327 dtv += tls_index.module; 328 } else { 329 dtv = NULL; 330 } 331 bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); 332 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ 333 return tls_ptr && tls_ptr != (void *)-1 334 ? tls_ptr + tls_index.offset 335 : NULL; 336 } 337 338 #ifdef SUBPROGS 339 __noinline 340 #else 341 __always_inline 342 #endif 343 static void read_int_var(struct strobemeta_cfg *cfg, 344 size_t idx, void *tls_base, 345 struct strobe_value_generic *value, 346 struct strobemeta_payload *data) 347 { 348 void *location = calc_location(&cfg->int_locs[idx], tls_base); 349 if (!location) 350 return; 351 352 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 353 data->int_vals[idx] = value->val; 354 if (value->header.len) 355 data->int_vals_set_mask |= (1 << idx); 356 } 357 358 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, 359 size_t idx, void *tls_base, 360 struct strobe_value_generic *value, 361 struct strobemeta_payload *data, 362 size_t off) 363 { 364 void *location; 365 uint64_t len; 366 367 data->str_lens[idx] = 0; 368 location = calc_location(&cfg->str_locs[idx], tls_base); 369 if (!location) 370 return 0; 371 372 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 373 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); 374 /* 375 * if bpf_probe_read_user_str returns error (<0), due to casting to 376 * unsigned int, it will become big number, so next check is 377 * sufficient to check for errors AND prove to BPF verifier, that 378 * bpf_probe_read_user_str won't return anything bigger than 379 * STROBE_MAX_STR_LEN 380 */ 381 if (len > STROBE_MAX_STR_LEN) 382 return 0; 383 384 data->str_lens[idx] = len; 385 return off + len; 386 } 387 388 static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, 389 size_t idx, void *tls_base, 390 struct strobe_value_generic *value, 391 struct strobemeta_payload *data, 392 size_t off) 393 { 394 struct strobe_map_descr* descr = &data->map_descrs[idx]; 395 struct strobe_map_raw map; 396 void *location; 397 uint64_t len; 398 399 descr->tag_len = 0; /* presume no tag is set */ 400 descr->cnt = -1; /* presume no value is set */ 401 402 location = calc_location(&cfg->map_locs[idx], tls_base); 403 if (!location) 404 return off; 405 406 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 407 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) 408 return off; 409 410 descr->id = map.id; 411 descr->cnt = map.cnt; 412 if (cfg->req_meta_idx == idx) { 413 data->req_id = map.id; 414 data->req_meta_valid = 1; 415 } 416 417 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag); 418 if (len <= STROBE_MAX_STR_LEN) { 419 descr->tag_len = len; 420 off += len; 421 } 422 423 #ifdef NO_UNROLL 424 __pragma_loop_no_unroll 425 #else 426 __pragma_loop_unroll 427 #endif 428 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { 429 if (i >= map.cnt) 430 break; 431 432 descr->key_lens[i] = 0; 433 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, 434 map.entries[i].key); 435 if (len <= STROBE_MAX_STR_LEN) { 436 descr->key_lens[i] = len; 437 off += len; 438 } 439 descr->val_lens[i] = 0; 440 len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, 441 map.entries[i].val); 442 if (len <= STROBE_MAX_STR_LEN) { 443 descr->val_lens[i] = len; 444 off += len; 445 } 446 } 447 448 return off; 449 } 450 451 #ifdef USE_BPF_LOOP 452 enum read_type { 453 READ_INT_VAR, 454 READ_MAP_VAR, 455 READ_STR_VAR, 456 }; 457 458 struct read_var_ctx { 459 struct strobemeta_payload *data; 460 void *tls_base; 461 struct strobemeta_cfg *cfg; 462 size_t payload_off; 463 /* value gets mutated */ 464 struct strobe_value_generic *value; 465 enum read_type type; 466 }; 467 468 static int read_var_callback(__u64 index, struct read_var_ctx *ctx) 469 { 470 /* lose precision info for ctx->payload_off, verifier won't track 471 * double xor, barrier_var() is needed to force clang keep both xors. 472 */ 473 ctx->payload_off ^= index; 474 barrier_var(ctx->payload_off); 475 ctx->payload_off ^= index; 476 switch (ctx->type) { 477 case READ_INT_VAR: 478 if (index >= STROBE_MAX_INTS) 479 return 1; 480 read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data); 481 break; 482 case READ_MAP_VAR: 483 if (index >= STROBE_MAX_MAPS) 484 return 1; 485 if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP) 486 return 1; 487 ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base, 488 ctx->value, ctx->data, ctx->payload_off); 489 break; 490 case READ_STR_VAR: 491 if (index >= STROBE_MAX_STRS) 492 return 1; 493 if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN) 494 return 1; 495 ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base, 496 ctx->value, ctx->data, ctx->payload_off); 497 break; 498 } 499 return 0; 500 } 501 #endif /* USE_BPF_LOOP */ 502 503 /* 504 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns 505 * pointer to *right after* payload ends 506 */ 507 #ifdef SUBPROGS 508 __noinline 509 #else 510 __always_inline 511 #endif 512 static void *read_strobe_meta(struct task_struct *task, 513 struct strobemeta_payload *data) 514 { 515 pid_t pid = bpf_get_current_pid_tgid() >> 32; 516 struct strobe_value_generic value = {0}; 517 struct strobemeta_cfg *cfg; 518 size_t payload_off; 519 void *tls_base; 520 521 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 522 if (!cfg) 523 return NULL; 524 525 data->int_vals_set_mask = 0; 526 data->req_meta_valid = 0; 527 payload_off = 0; 528 /* 529 * we don't have struct task_struct definition, it should be: 530 * tls_base = (void *)task->thread.fsbase; 531 */ 532 tls_base = (void *)task; 533 534 #ifdef USE_BPF_LOOP 535 struct read_var_ctx ctx = { 536 .cfg = cfg, 537 .tls_base = tls_base, 538 .value = &value, 539 .data = data, 540 .payload_off = 0, 541 }; 542 int err; 543 544 ctx.type = READ_INT_VAR; 545 err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0); 546 if (err != STROBE_MAX_INTS) 547 return NULL; 548 549 ctx.type = READ_STR_VAR; 550 err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0); 551 if (err != STROBE_MAX_STRS) 552 return NULL; 553 554 ctx.type = READ_MAP_VAR; 555 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); 556 if (err != STROBE_MAX_MAPS) 557 return NULL; 558 559 payload_off = ctx.payload_off; 560 /* this should not really happen, here only to satisfy verifier */ 561 if (payload_off > sizeof(data->payload)) 562 payload_off = sizeof(data->payload); 563 #else 564 #ifdef NO_UNROLL 565 __pragma_loop_no_unroll 566 #else 567 __pragma_loop_unroll 568 #endif /* NO_UNROLL */ 569 for (int i = 0; i < STROBE_MAX_INTS; ++i) { 570 read_int_var(cfg, i, tls_base, &value, data); 571 } 572 #ifdef NO_UNROLL 573 __pragma_loop_no_unroll 574 #else 575 __pragma_loop_unroll 576 #endif /* NO_UNROLL */ 577 for (int i = 0; i < STROBE_MAX_STRS; ++i) { 578 payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); 579 } 580 #ifdef NO_UNROLL 581 __pragma_loop_no_unroll 582 #else 583 __pragma_loop_unroll 584 #endif /* NO_UNROLL */ 585 for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 586 payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); 587 } 588 #endif /* USE_BPF_LOOP */ 589 590 /* 591 * return pointer right after end of payload, so it's possible to 592 * calculate exact amount of useful data that needs to be sent 593 */ 594 return &data->payload[payload_off]; 595 } 596 597 SEC("raw_tracepoint/kfree_skb") 598 int on_event(struct pt_regs *ctx) { 599 pid_t pid = bpf_get_current_pid_tgid() >> 32; 600 struct strobelight_bpf_sample* sample; 601 struct task_struct *task; 602 uint32_t zero = 0; 603 uint64_t ktime_ns; 604 void *sample_end; 605 606 sample = bpf_map_lookup_elem(&sample_heap, &zero); 607 if (!sample) 608 return 0; /* this will never happen */ 609 610 sample->pid = pid; 611 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); 612 ktime_ns = bpf_ktime_get_ns(); 613 sample->ktime = ktime_ns; 614 615 task = (struct task_struct *)bpf_get_current_task(); 616 sample_end = read_strobe_meta(task, &sample->metadata); 617 sample->has_meta = sample_end != NULL; 618 sample_end = sample_end ? : &sample->metadata; 619 620 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { 621 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); 622 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); 623 } else { 624 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); 625 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); 626 } 627 628 uint64_t sample_size = sample_end - (void *)sample; 629 /* should always be true */ 630 if (sample_size < sizeof(struct strobelight_bpf_sample)) 631 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); 632 return 0; 633 } 634 635 char _license[] SEC("license") = "GPL"; 636