1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 // Copyright (c) 2022 Google 3 #include "vmlinux.h" 4 #include <bpf/bpf_helpers.h> 5 #include <bpf/bpf_tracing.h> 6 #include <bpf/bpf_core_read.h> 7 #include <asm-generic/errno-base.h> 8 9 #include "lock_data.h" 10 11 /* for collect_lock_syms(). 4096 was rejected by the verifier */ 12 #define MAX_CPUS 1024 13 14 /* for collect_zone_lock(). It should be more than the actual zones. */ 15 #define MAX_ZONES 10 16 17 /* for do_lock_delay(). Arbitrarily set to 1 million. */ 18 #define MAX_LOOP (1U << 20) 19 20 /* lock contention flags from include/trace/events/lock.h */ 21 #define LCB_F_SPIN (1U << 0) 22 #define LCB_F_READ (1U << 1) 23 #define LCB_F_WRITE (1U << 2) 24 #define LCB_F_RT (1U << 3) 25 #define LCB_F_PERCPU (1U << 4) 26 #define LCB_F_MUTEX (1U << 5) 27 28 /* callstack storage */ 29 struct { 30 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 31 __uint(key_size, sizeof(__u32)); 32 __uint(value_size, sizeof(__u64)); 33 __uint(max_entries, MAX_ENTRIES); 34 } stacks SEC(".maps"); 35 36 /* buffer for owner stacktrace */ 37 struct { 38 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 39 __uint(key_size, sizeof(__u32)); 40 __uint(value_size, sizeof(__u64)); 41 __uint(max_entries, 1); 42 } stack_buf SEC(".maps"); 43 44 /* a map for tracing owner stacktrace to owner stack id */ 45 struct { 46 __uint(type, BPF_MAP_TYPE_HASH); 47 __uint(key_size, sizeof(__u64)); // owner stacktrace 48 __uint(value_size, sizeof(__s32)); // owner stack id 49 __uint(max_entries, 1); 50 } owner_stacks SEC(".maps"); 51 52 /* a map for tracing lock address to owner data */ 53 struct { 54 __uint(type, BPF_MAP_TYPE_HASH); 55 __uint(key_size, sizeof(__u64)); // lock address 56 __uint(value_size, sizeof(struct owner_tracing_data)); 57 __uint(max_entries, 1); 58 } owner_data SEC(".maps"); 59 60 /* a map for contention_key (stores owner stack id) to contention data */ 61 struct { 62 __uint(type, BPF_MAP_TYPE_HASH); 63 __uint(key_size, sizeof(struct contention_key)); 64 __uint(value_size, sizeof(struct contention_data)); 65 __uint(max_entries, 1); 66 } owner_stat SEC(".maps"); 67 68 /* maintain timestamp at the beginning of contention */ 69 struct { 70 __uint(type, BPF_MAP_TYPE_HASH); 71 __type(key, int); 72 __type(value, struct tstamp_data); 73 __uint(max_entries, MAX_ENTRIES); 74 } tstamp SEC(".maps"); 75 76 /* maintain per-CPU timestamp at the beginning of contention */ 77 struct { 78 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 79 __uint(key_size, sizeof(__u32)); 80 __uint(value_size, sizeof(struct tstamp_data)); 81 __uint(max_entries, 1); 82 } tstamp_cpu SEC(".maps"); 83 84 /* actual lock contention statistics */ 85 struct { 86 __uint(type, BPF_MAP_TYPE_HASH); 87 __uint(key_size, sizeof(struct contention_key)); 88 __uint(value_size, sizeof(struct contention_data)); 89 __uint(max_entries, MAX_ENTRIES); 90 } lock_stat SEC(".maps"); 91 92 struct { 93 __uint(type, BPF_MAP_TYPE_HASH); 94 __uint(key_size, sizeof(__u32)); 95 __uint(value_size, sizeof(struct contention_task_data)); 96 __uint(max_entries, MAX_ENTRIES); 97 } task_data SEC(".maps"); 98 99 struct { 100 __uint(type, BPF_MAP_TYPE_HASH); 101 __uint(key_size, sizeof(__u64)); 102 __uint(value_size, sizeof(__u32)); 103 __uint(max_entries, MAX_ENTRIES); 104 } lock_syms SEC(".maps"); 105 106 struct { 107 __uint(type, BPF_MAP_TYPE_HASH); 108 __uint(key_size, sizeof(__u32)); 109 __uint(value_size, sizeof(__u8)); 110 __uint(max_entries, 1); 111 } cpu_filter SEC(".maps"); 112 113 struct { 114 __uint(type, BPF_MAP_TYPE_HASH); 115 __uint(key_size, sizeof(__u32)); 116 __uint(value_size, sizeof(__u8)); 117 __uint(max_entries, 1); 118 } task_filter SEC(".maps"); 119 120 struct { 121 __uint(type, BPF_MAP_TYPE_HASH); 122 __uint(key_size, sizeof(__u32)); 123 __uint(value_size, sizeof(__u8)); 124 __uint(max_entries, 1); 125 } type_filter SEC(".maps"); 126 127 struct { 128 __uint(type, BPF_MAP_TYPE_HASH); 129 __uint(key_size, sizeof(__u64)); 130 __uint(value_size, sizeof(__u8)); 131 __uint(max_entries, 1); 132 } addr_filter SEC(".maps"); 133 134 struct { 135 __uint(type, BPF_MAP_TYPE_HASH); 136 __uint(key_size, sizeof(__u64)); 137 __uint(value_size, sizeof(__u8)); 138 __uint(max_entries, 1); 139 } cgroup_filter SEC(".maps"); 140 141 struct { 142 __uint(type, BPF_MAP_TYPE_HASH); 143 __uint(key_size, sizeof(long)); 144 __uint(value_size, sizeof(__u8)); 145 __uint(max_entries, 1); 146 } slab_filter SEC(".maps"); 147 148 struct { 149 __uint(type, BPF_MAP_TYPE_HASH); 150 __uint(key_size, sizeof(long)); 151 __uint(value_size, sizeof(struct slab_cache_data)); 152 __uint(max_entries, 1); 153 } slab_caches SEC(".maps"); 154 155 struct { 156 __uint(type, BPF_MAP_TYPE_HASH); 157 __uint(key_size, sizeof(__u64)); 158 __uint(value_size, sizeof(__u64)); 159 __uint(max_entries, 1); 160 } lock_delays SEC(".maps"); 161 162 struct rw_semaphore___old { 163 struct task_struct *owner; 164 } __attribute__((preserve_access_index)); 165 166 struct rw_semaphore___new { 167 atomic_long_t owner; 168 } __attribute__((preserve_access_index)); 169 170 struct mm_struct___old { 171 struct rw_semaphore mmap_sem; 172 } __attribute__((preserve_access_index)); 173 174 struct mm_struct___new { 175 struct rw_semaphore mmap_lock; 176 } __attribute__((preserve_access_index)); 177 178 struct cas_ctx { 179 struct contention_data *data; 180 u64 duration; 181 int max_done; 182 int min_done; 183 }; 184 185 extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak; 186 187 /* control flags */ 188 const volatile int has_cpu; 189 const volatile int has_task; 190 const volatile int has_type; 191 const volatile int has_addr; 192 const volatile int has_cgroup; 193 const volatile int has_slab; 194 const volatile int has_mmap_lock; 195 const volatile int needs_callstack; 196 const volatile int stack_skip; 197 const volatile int lock_owner; 198 const volatile int use_cgroup_v2; 199 const volatile int max_stack; 200 const volatile int lock_delay; 201 202 /* determine the key of lock stat */ 203 const volatile int aggr_mode; 204 205 int enabled; 206 207 int perf_subsys_id = -1; 208 209 __u64 end_ts; 210 211 __u32 slab_cache_id; 212 213 /* error stat */ 214 int task_fail; 215 int stack_fail; 216 int time_fail; 217 int data_fail; 218 219 int task_map_full; 220 int data_map_full; 221 222 struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak; 223 void bpf_task_release(struct task_struct *p) __ksym __weak; 224 225 static inline __u32 check_lock_type(__u64 lock, __u32 flags); 226 227 static inline __u64 get_current_cgroup_id(void) 228 { 229 struct task_struct *task; 230 struct cgroup *cgrp; 231 232 if (use_cgroup_v2) 233 return bpf_get_current_cgroup_id(); 234 235 task = bpf_get_current_task_btf(); 236 237 if (perf_subsys_id == -1) { 238 #if __has_builtin(__builtin_preserve_enum_value) 239 perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, 240 perf_event_cgrp_id); 241 #else 242 perf_subsys_id = perf_event_cgrp_id; 243 #endif 244 } 245 246 cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup); 247 return BPF_CORE_READ(cgrp, kn, id); 248 } 249 250 static inline int can_record(u64 *ctx) 251 { 252 bool is_addr_ok = false; 253 254 if (has_cpu) { 255 __u32 cpu = bpf_get_smp_processor_id(); 256 __u8 *ok; 257 258 ok = bpf_map_lookup_elem(&cpu_filter, &cpu); 259 if (!ok) 260 return 0; 261 } 262 263 if (has_task) { 264 __u8 *ok; 265 __u32 pid = bpf_get_current_pid_tgid(); 266 267 ok = bpf_map_lookup_elem(&task_filter, &pid); 268 if (!ok) 269 return 0; 270 } 271 272 if (has_type) { 273 __u8 *ok; 274 __u32 flags = (__u32)ctx[1]; 275 276 ok = bpf_map_lookup_elem(&type_filter, &flags); 277 if (!ok) 278 return 0; 279 } 280 281 if (has_addr) { 282 __u8 *ok; 283 __u64 addr = ctx[0]; 284 285 ok = bpf_map_lookup_elem(&addr_filter, &addr); 286 if (!ok && !has_slab && !has_mmap_lock) 287 return 0; 288 289 is_addr_ok = !!ok; 290 } 291 292 if (has_cgroup) { 293 __u8 *ok; 294 __u64 cgrp = get_current_cgroup_id(); 295 296 ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp); 297 if (!ok) 298 return 0; 299 } 300 301 if (is_addr_ok) 302 return 1; 303 304 /* slab and mmap_lock are part of the addr_filter */ 305 if (has_slab && bpf_get_kmem_cache) { 306 __u8 *ok; 307 __u64 addr = ctx[0]; 308 long kmem_cache_addr; 309 310 kmem_cache_addr = (long)bpf_get_kmem_cache(addr); 311 ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr); 312 if (ok) 313 return 1; 314 else if (!has_mmap_lock) 315 return 0; 316 } 317 318 if (has_mmap_lock) { 319 __u64 lock = ctx[0]; 320 __u32 flag = ctx[1]; 321 322 if (check_lock_type(lock, flag) != LCD_F_MMAP_LOCK) 323 return 0; 324 } 325 326 return 1; 327 } 328 329 static inline int update_task_data(struct task_struct *task) 330 { 331 struct contention_task_data *p; 332 int pid, err; 333 334 err = bpf_core_read(&pid, sizeof(pid), &task->pid); 335 if (err) 336 return -1; 337 338 p = bpf_map_lookup_elem(&task_data, &pid); 339 if (p == NULL && !task_map_full) { 340 struct contention_task_data data = {}; 341 342 BPF_CORE_READ_STR_INTO(&data.comm, task, comm); 343 if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG) 344 task_map_full = 1; 345 } 346 347 return 0; 348 } 349 350 #ifndef __has_builtin 351 # define __has_builtin(x) 0 352 #endif 353 354 static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags) 355 { 356 struct task_struct *task; 357 __u64 owner = 0; 358 359 if (flags & LCB_F_MUTEX) { 360 struct mutex *mutex = (void *)lock; 361 owner = BPF_CORE_READ(mutex, owner.counter); 362 } else if (flags == LCB_F_READ || flags == LCB_F_WRITE) { 363 /* 364 * Support for the BPF_TYPE_MATCHES argument to the 365 * __builtin_preserve_type_info builtin was added at some point during 366 * development of clang 15 and it's what is needed for 367 * bpf_core_type_matches. 368 */ 369 #if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15 370 if (bpf_core_type_matches(struct rw_semaphore___old)) { 371 struct rw_semaphore___old *rwsem = (void *)lock; 372 owner = (unsigned long)BPF_CORE_READ(rwsem, owner); 373 } else if (bpf_core_type_matches(struct rw_semaphore___new)) { 374 struct rw_semaphore___new *rwsem = (void *)lock; 375 owner = BPF_CORE_READ(rwsem, owner.counter); 376 } 377 #else 378 /* assume new struct */ 379 struct rw_semaphore *rwsem = (void *)lock; 380 owner = BPF_CORE_READ(rwsem, owner.counter); 381 #endif 382 } 383 384 if (!owner) 385 return NULL; 386 387 task = (void *)(owner & ~7UL); 388 return task; 389 } 390 391 static inline __u32 check_lock_type(__u64 lock, __u32 flags) 392 { 393 struct task_struct *curr; 394 struct mm_struct___old *mm_old; 395 struct mm_struct___new *mm_new; 396 struct sighand_struct *sighand; 397 398 switch (flags) { 399 case LCB_F_READ: /* rwsem */ 400 case LCB_F_WRITE: 401 curr = bpf_get_current_task_btf(); 402 if (curr->mm == NULL) 403 break; 404 mm_new = (void *)curr->mm; 405 if (bpf_core_field_exists(mm_new->mmap_lock)) { 406 if (&mm_new->mmap_lock == (void *)lock) 407 return LCD_F_MMAP_LOCK; 408 break; 409 } 410 mm_old = (void *)curr->mm; 411 if (bpf_core_field_exists(mm_old->mmap_sem)) { 412 if (&mm_old->mmap_sem == (void *)lock) 413 return LCD_F_MMAP_LOCK; 414 } 415 break; 416 case LCB_F_SPIN: /* spinlock */ 417 curr = bpf_get_current_task_btf(); 418 sighand = curr->sighand; 419 420 if (sighand && &sighand->siglock == (void *)lock) 421 return LCD_F_SIGHAND_LOCK; 422 break; 423 default: 424 break; 425 } 426 return 0; 427 } 428 429 static inline long delay_callback(__u64 idx, void *arg) 430 { 431 __u64 target = *(__u64 *)arg; 432 433 if (target <= bpf_ktime_get_ns()) 434 return 1; 435 436 /* just to kill time */ 437 (void)bpf_get_prandom_u32(); 438 439 return 0; 440 } 441 442 static inline void do_lock_delay(__u64 duration) 443 { 444 __u64 target = bpf_ktime_get_ns() + duration; 445 446 bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0); 447 } 448 449 static inline void check_lock_delay(__u64 lock) 450 { 451 __u64 *delay; 452 453 delay = bpf_map_lookup_elem(&lock_delays, &lock); 454 if (delay) 455 do_lock_delay(*delay); 456 } 457 458 static inline struct tstamp_data *get_tstamp_elem(__u32 flags) 459 { 460 __u32 pid; 461 struct tstamp_data *pelem; 462 463 /* Use per-cpu array map for spinlock and rwlock */ 464 if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) { 465 __u32 idx = 0; 466 467 pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx); 468 /* Do not update the element for nested locks */ 469 if (pelem && pelem->lock) 470 pelem = NULL; 471 return pelem; 472 } 473 474 pid = bpf_get_current_pid_tgid(); 475 pelem = bpf_map_lookup_elem(&tstamp, &pid); 476 /* Do not update the element for nested locks */ 477 if (pelem && pelem->lock) 478 return NULL; 479 480 if (pelem == NULL) { 481 struct tstamp_data zero = {}; 482 483 if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) { 484 __sync_fetch_and_add(&task_fail, 1); 485 return NULL; 486 } 487 488 pelem = bpf_map_lookup_elem(&tstamp, &pid); 489 if (pelem == NULL) { 490 __sync_fetch_and_add(&task_fail, 1); 491 return NULL; 492 } 493 } 494 return pelem; 495 } 496 497 static inline s32 get_owner_stack_id(u64 *stacktrace) 498 { 499 s32 *id, new_id; 500 static s64 id_gen = 1; 501 502 id = bpf_map_lookup_elem(&owner_stacks, stacktrace); 503 if (id) 504 return *id; 505 506 new_id = (s32)__sync_fetch_and_add(&id_gen, 1); 507 508 bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST); 509 510 id = bpf_map_lookup_elem(&owner_stacks, stacktrace); 511 if (id) 512 return *id; 513 514 return -1; 515 } 516 517 static long cas_min_max_cb(u64 idx, void *arg) 518 { 519 struct cas_ctx *ctx = arg; 520 521 if (!ctx->max_done) { 522 u64 old_max = ctx->data->max_time; 523 if (old_max >= ctx->duration) { 524 ctx->max_done = 1; 525 } else { 526 u64 r = __sync_val_compare_and_swap( 527 &ctx->data->max_time, old_max, ctx->duration); 528 if (r == old_max) 529 ctx->max_done = 1; 530 } 531 } 532 533 if (!ctx->min_done) { 534 u64 old_min = ctx->data->min_time; 535 if (old_min <= ctx->duration) { 536 ctx->min_done = 1; 537 } else { 538 u64 r = __sync_val_compare_and_swap( 539 &ctx->data->min_time, old_min, ctx->duration); 540 if (r == old_min) 541 ctx->min_done = 1; 542 } 543 } 544 545 return (ctx->max_done && ctx->min_done) ? 1 : 0; 546 } 547 548 static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count) 549 { 550 __sync_fetch_and_add(&data->total_time, duration); 551 __sync_fetch_and_add(&data->count, count); 552 553 struct cas_ctx ctx = { 554 .data = data, 555 .duration = duration, 556 .max_done = 0, 557 .min_done = 0, 558 }; 559 bpf_loop(64, cas_min_max_cb, &ctx, 0); 560 } 561 562 static inline void update_owner_stat(u32 id, u64 duration, u32 flags) 563 { 564 struct contention_key key = { 565 .stack_id = id, 566 .pid = 0, 567 .lock_addr_or_cgroup = 0, 568 }; 569 struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key); 570 571 if (!data) { 572 struct contention_data first = { 573 .total_time = duration, 574 .max_time = duration, 575 .min_time = duration, 576 .count = 1, 577 .flags = flags, 578 }; 579 bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST); 580 } else { 581 update_contention_data(data, duration, 1); 582 } 583 } 584 585 SEC("tp_btf/contention_begin") 586 int contention_begin(u64 *ctx) 587 { 588 struct tstamp_data *pelem; 589 590 if (!enabled || !can_record(ctx)) 591 return 0; 592 593 pelem = get_tstamp_elem(ctx[1]); 594 if (pelem == NULL) 595 return 0; 596 597 pelem->timestamp = bpf_ktime_get_ns(); 598 pelem->lock = (__u64)ctx[0]; 599 pelem->flags = (__u32)ctx[1]; 600 if (aggr_mode == LOCK_AGGR_CGROUP) 601 pelem->cgroup_id = get_current_cgroup_id(); 602 603 if (needs_callstack) { 604 u32 i = 0; 605 u32 id = 0; 606 int owner_pid; 607 u64 *buf; 608 struct task_struct *task; 609 struct owner_tracing_data *otdata; 610 611 if (!lock_owner) 612 goto skip_owner; 613 614 task = get_lock_owner(pelem->lock, pelem->flags); 615 if (!task) 616 goto skip_owner; 617 618 owner_pid = BPF_CORE_READ(task, pid); 619 620 buf = bpf_map_lookup_elem(&stack_buf, &i); 621 if (!buf) 622 goto skip_owner; 623 for (i = 0; i < max_stack; i++) 624 buf[i] = 0x0; 625 626 if (!bpf_task_from_pid) 627 goto skip_owner; 628 629 task = bpf_task_from_pid(owner_pid); 630 if (!task) 631 goto skip_owner; 632 633 bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0); 634 bpf_task_release(task); 635 636 otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock); 637 id = get_owner_stack_id(buf); 638 639 /* 640 * Contention just happens, or corner case `lock` is owned by process not 641 * `owner_pid`. For the corner case we treat it as unexpected internal error and 642 * just ignore the precvious tracing record. 643 */ 644 if (!otdata || otdata->pid != owner_pid) { 645 struct owner_tracing_data first = { 646 .pid = owner_pid, 647 .timestamp = pelem->timestamp, 648 .count = 1, 649 .stack_id = id, 650 }; 651 bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY); 652 } 653 /* Contention is ongoing and new waiter joins */ 654 else { 655 __sync_fetch_and_add(&otdata->count, 1); 656 657 /* 658 * The owner is the same, but stacktrace might be changed. In this case we 659 * store/update `owner_stat` based on current owner stack id. 660 */ 661 if (id != otdata->stack_id) { 662 update_owner_stat(id, pelem->timestamp - otdata->timestamp, 663 pelem->flags); 664 665 otdata->timestamp = pelem->timestamp; 666 otdata->stack_id = id; 667 } 668 } 669 skip_owner: 670 pelem->stack_id = bpf_get_stackid(ctx, &stacks, 671 BPF_F_FAST_STACK_CMP | stack_skip); 672 if (pelem->stack_id < 0) 673 __sync_fetch_and_add(&stack_fail, 1); 674 } else if (aggr_mode == LOCK_AGGR_TASK) { 675 struct task_struct *task; 676 677 if (lock_owner) { 678 task = get_lock_owner(pelem->lock, pelem->flags); 679 680 /* The flags is not used anymore. Pass the owner pid. */ 681 if (task) 682 pelem->flags = BPF_CORE_READ(task, pid); 683 else 684 pelem->flags = -1U; 685 686 } else { 687 task = bpf_get_current_task_btf(); 688 } 689 690 if (task) { 691 if (update_task_data(task) < 0 && lock_owner) 692 pelem->flags = -1U; 693 } 694 } 695 696 return 0; 697 } 698 699 SEC("tp_btf/contention_end") 700 int contention_end(u64 *ctx) 701 { 702 __u32 pid = 0, idx = 0; 703 struct tstamp_data *pelem; 704 struct contention_key key = {}; 705 struct contention_data *data; 706 __u64 timestamp; 707 __u64 duration; 708 bool need_delete = false; 709 710 if (!enabled) 711 return 0; 712 713 /* 714 * For spinlock and rwlock, it needs to get the timestamp for the 715 * per-cpu map. However, contention_end does not have the flags 716 * so it cannot know whether it reads percpu or hash map. 717 * 718 * Try per-cpu map first and check if there's active contention. 719 * If it is, do not read hash map because it cannot go to sleeping 720 * locks before releasing the spinning locks. 721 */ 722 pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx); 723 if (pelem && pelem->lock) { 724 if (pelem->lock != ctx[0]) 725 return 0; 726 } else { 727 pid = bpf_get_current_pid_tgid(); 728 pelem = bpf_map_lookup_elem(&tstamp, &pid); 729 if (!pelem || pelem->lock != ctx[0]) 730 return 0; 731 need_delete = true; 732 } 733 734 timestamp = bpf_ktime_get_ns(); 735 duration = timestamp - pelem->timestamp; 736 if ((__s64)duration < 0) { 737 __sync_fetch_and_add(&time_fail, 1); 738 goto out; 739 } 740 741 if (needs_callstack && lock_owner) { 742 struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock); 743 744 if (!otdata) 745 goto skip_owner; 746 747 /* Update `owner_stat` */ 748 update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags); 749 750 /* No contention is occurring, delete `lock` entry in `owner_data` */ 751 if (otdata->count <= 1) 752 bpf_map_delete_elem(&owner_data, &pelem->lock); 753 /* 754 * Contention is still ongoing, with a new owner (current task). `owner_data` 755 * should be updated accordingly. 756 */ 757 else { 758 u32 i = 0; 759 s32 ret = (s32)ctx[1]; 760 u64 *buf; 761 762 otdata->timestamp = timestamp; 763 __sync_fetch_and_add(&otdata->count, -1); 764 765 buf = bpf_map_lookup_elem(&stack_buf, &i); 766 if (!buf) 767 goto skip_owner; 768 for (i = 0; i < (u32)max_stack; i++) 769 buf[i] = 0x0; 770 771 /* 772 * `ret` has the return code of the lock function. 773 * If `ret` is negative, the current task terminates lock waiting without 774 * acquiring it. Owner is not changed, but we still need to update the owner 775 * stack. 776 */ 777 if (ret < 0) { 778 s32 id = 0; 779 struct task_struct *task; 780 781 if (!bpf_task_from_pid) 782 goto skip_owner; 783 784 task = bpf_task_from_pid(otdata->pid); 785 if (!task) 786 goto skip_owner; 787 788 bpf_get_task_stack(task, buf, 789 max_stack * sizeof(unsigned long), 0); 790 bpf_task_release(task); 791 792 id = get_owner_stack_id(buf); 793 794 /* 795 * If owner stack is changed, update owner stack id for this lock. 796 */ 797 if (id != otdata->stack_id) 798 otdata->stack_id = id; 799 } 800 /* 801 * Otherwise, update tracing data with the current task, which is the new 802 * owner. 803 */ 804 else { 805 otdata->pid = pid; 806 /* 807 * We don't want to retrieve callstack here, since it is where the 808 * current task acquires the lock and provides no additional 809 * information. We simply assign -1 to invalidate it. 810 */ 811 otdata->stack_id = -1; 812 } 813 } 814 } 815 skip_owner: 816 switch (aggr_mode) { 817 case LOCK_AGGR_CALLER: 818 key.stack_id = pelem->stack_id; 819 break; 820 case LOCK_AGGR_TASK: 821 if (lock_owner) 822 key.pid = pelem->flags; 823 else { 824 if (!need_delete) 825 pid = bpf_get_current_pid_tgid(); 826 key.pid = pid; 827 } 828 if (needs_callstack) 829 key.stack_id = pelem->stack_id; 830 break; 831 case LOCK_AGGR_ADDR: 832 key.lock_addr_or_cgroup = pelem->lock; 833 if (needs_callstack) 834 key.stack_id = pelem->stack_id; 835 break; 836 case LOCK_AGGR_CGROUP: 837 key.lock_addr_or_cgroup = pelem->cgroup_id; 838 break; 839 default: 840 /* should not happen */ 841 return 0; 842 } 843 844 data = bpf_map_lookup_elem(&lock_stat, &key); 845 if (!data) { 846 if (data_map_full) { 847 __sync_fetch_and_add(&data_fail, 1); 848 goto out; 849 } 850 851 struct contention_data first = { 852 .total_time = duration, 853 .max_time = duration, 854 .min_time = duration, 855 .count = 1, 856 .flags = pelem->flags, 857 }; 858 int err; 859 860 if (aggr_mode == LOCK_AGGR_ADDR) { 861 first.flags |= check_lock_type(pelem->lock, 862 pelem->flags & LCB_F_TYPE_MASK); 863 864 /* Check if it's from a slab object */ 865 if (bpf_get_kmem_cache) { 866 struct kmem_cache *s; 867 struct slab_cache_data *d; 868 869 s = bpf_get_kmem_cache(pelem->lock); 870 if (s != NULL) { 871 /* 872 * Save the ID of the slab cache in the flags 873 * (instead of full address) to reduce the 874 * space in the contention_data. 875 */ 876 d = bpf_map_lookup_elem(&slab_caches, &s); 877 if (d != NULL) 878 first.flags |= d->id; 879 } 880 } 881 } 882 883 err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST); 884 if (err < 0) { 885 if (err == -EEXIST) { 886 /* it lost the race, try to get it again */ 887 data = bpf_map_lookup_elem(&lock_stat, &key); 888 if (data != NULL) 889 goto found; 890 } 891 if (err == -E2BIG) 892 data_map_full = 1; 893 __sync_fetch_and_add(&data_fail, 1); 894 } 895 goto out; 896 } 897 898 found: 899 update_contention_data(data, duration, 1); 900 901 out: 902 if (lock_delay) 903 check_lock_delay(pelem->lock); 904 905 pelem->lock = 0; 906 if (need_delete) 907 bpf_map_delete_elem(&tstamp, &pid); 908 return 0; 909 } 910 911 extern struct rq runqueues __ksym; 912 913 const volatile __u64 contig_page_data_addr; 914 const volatile __u64 node_data_addr; 915 const volatile int nr_nodes; 916 const volatile int sizeof_zone; 917 918 struct rq___old { 919 raw_spinlock_t lock; 920 } __attribute__((preserve_access_index)); 921 922 struct rq___new { 923 raw_spinlock_t __lock; 924 } __attribute__((preserve_access_index)); 925 926 static void collect_zone_lock(void) 927 { 928 __u64 nr_zones, zone_off; 929 __u64 lock_addr, lock_off; 930 __u32 lock_flag = LOCK_CLASS_ZONE_LOCK; 931 932 zone_off = offsetof(struct pglist_data, node_zones); 933 lock_off = offsetof(struct zone, lock); 934 935 if (contig_page_data_addr) { 936 struct pglist_data *contig_page_data; 937 938 contig_page_data = (void *)(long)contig_page_data_addr; 939 nr_zones = BPF_CORE_READ(contig_page_data, nr_zones); 940 941 for (int i = 0; i < MAX_ZONES; i++) { 942 __u64 zone_addr; 943 944 if (i >= nr_zones) 945 break; 946 947 zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off; 948 lock_addr = zone_addr + lock_off; 949 950 bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); 951 } 952 } else if (nr_nodes > 0) { 953 struct pglist_data **node_data = (void *)(long)node_data_addr; 954 955 for (int i = 0; i < nr_nodes; i++) { 956 struct pglist_data *pgdat = NULL; 957 int err; 958 959 err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]); 960 if (err < 0 || pgdat == NULL) 961 break; 962 963 nr_zones = BPF_CORE_READ(pgdat, nr_zones); 964 for (int k = 0; k < MAX_ZONES; k++) { 965 __u64 zone_addr; 966 967 if (k >= nr_zones) 968 break; 969 970 zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off; 971 lock_addr = zone_addr + lock_off; 972 973 bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); 974 } 975 } 976 } 977 } 978 979 SEC("raw_tp/bpf_test_finish") 980 int BPF_PROG(collect_lock_syms) 981 { 982 __u64 lock_addr, lock_off; 983 __u32 lock_flag; 984 985 if (bpf_core_field_exists(struct rq___new, __lock)) 986 lock_off = offsetof(struct rq___new, __lock); 987 else 988 lock_off = offsetof(struct rq___old, lock); 989 990 for (int i = 0; i < MAX_CPUS; i++) { 991 struct rq *rq = bpf_per_cpu_ptr(&runqueues, i); 992 993 if (rq == NULL) 994 break; 995 996 lock_addr = (__u64)(void *)rq + lock_off; 997 lock_flag = LOCK_CLASS_RQLOCK; 998 bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); 999 } 1000 1001 collect_zone_lock(); 1002 1003 return 0; 1004 } 1005 1006 SEC("raw_tp/bpf_test_finish") 1007 int BPF_PROG(end_timestamp) 1008 { 1009 end_ts = bpf_ktime_get_ns(); 1010 return 0; 1011 } 1012 1013 /* 1014 * bpf_iter__kmem_cache added recently so old kernels don't have it in the 1015 * vmlinux.h. But we cannot add it here since it will cause a compiler error 1016 * due to redefinition of the struct on later kernels. 1017 * 1018 * So it uses a CO-RE trick to access the member only if it has the type. 1019 * This will support both old and new kernels without compiler errors. 1020 */ 1021 struct bpf_iter__kmem_cache___new { 1022 struct kmem_cache *s; 1023 } __attribute__((preserve_access_index)); 1024 1025 SEC("iter/kmem_cache") 1026 int slab_cache_iter(void *ctx) 1027 { 1028 struct kmem_cache *s = NULL; 1029 struct slab_cache_data d; 1030 const char *nameptr; 1031 1032 if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) { 1033 struct bpf_iter__kmem_cache___new *iter = ctx; 1034 1035 s = iter->s; 1036 } 1037 1038 if (s == NULL) 1039 return 0; 1040 1041 nameptr = s->name; 1042 bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr); 1043 1044 d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT; 1045 if (d.id >= LCB_F_SLAB_ID_END) 1046 return 0; 1047 1048 bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST); 1049 return 0; 1050 } 1051 1052 char LICENSE[] SEC("license") = "Dual BSD/GPL"; 1053