// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* for collect_zone_lock().  It should be more than the actual zones. */
#define MAX_ZONES  10

/* for do_lock_delay().  Arbitrarily set to 1 million. */
#define MAX_LOOP  (1U << 20)

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* buffer for owner stacktrace */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, 1);
} stack_buf SEC(".maps");

/* a map translating an owner stacktrace to an owner stack id */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));	// owner stacktrace
	__uint(value_size, sizeof(__s32));	// owner stack id
	__uint(max_entries, 1);
} owner_stacks SEC(".maps");

/* a map from lock address to owner tracing data */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));	// lock address
	__uint(value_size, sizeof(struct owner_tracing_data));
	__uint(max_entries, 1);
} owner_data SEC(".maps");

/* a map from contention_key (stores the owner stack id) to contention data */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, 1);
} owner_stat SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct tstamp_data));
	__uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

/* per-task data (command name), keyed by pid */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

/* well-known lock addresses mapped to lock class flags (see collect_lock_syms) */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

/* filter maps: the presence of a key means it passes the corresponding filter */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(long));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} slab_filter SEC(".maps");

/* slab cache info (id and name), keyed by kmem_cache address */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(long));
	__uint(value_size, sizeof(struct slab_cache_data));
	__uint(max_entries, 1);
} slab_caches SEC(".maps");

/* artificial delay (in nsec) to inject for a given lock address */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, 1);
} lock_delays SEC(".maps");

/* alternative layouts of kernel structs, resolved with CO-RE at load time */
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;

/* control flags */
const volatile int has_cpu;
const volatile int has_task;
const volatile int has_type;
const volatile int has_addr;
const volatile int has_cgroup;
const volatile int has_slab;
const volatile int needs_callstack;
const volatile int stack_skip;
const volatile int lock_owner;
const volatile int use_cgroup_v2;
const volatile int max_stack;
const volatile int lock_delay;

/* determine the key of lock stat */
const volatile int aggr_mode;

int enabled;

int perf_subsys_id = -1;

__u64 end_ts;

__u32 slab_cache_id;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
void bpf_task_release(struct task_struct *p) __ksym __weak;

static inline __u64 get_current_cgroup_id(void)
{
	struct task_struct *task;
	struct cgroup *cgrp;

	if (use_cgroup_v2)
		return bpf_get_current_cgroup_id();

	task = bpf_get_current_task_btf();

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok && !has_slab)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp = get_current_cgroup_id();

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
		if (!ok)
			return 0;
	}

	if (has_slab && bpf_get_kmem_cache) {
		__u8 *ok;
		__u64 addr = ctx[0];
		long kmem_cache_addr;

		kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
		ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
		if (!ok)
			return 0;
	}

	return 1;
}

static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL && !task_map_full) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
			task_map_full = 1;
	}

	return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;

		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
		/*
		 * Support for the BPF_TYPE_MATCHES argument to the
		 * __builtin_preserve_type_info builtin was added at some point during
		 * development of clang 15 and it's what is needed for
		 * bpf_core_type_matches.
		 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;

			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;

			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;

		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;
	struct sighand_struct *sighand;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		sighand = curr->sighand;

		if (sighand && &sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

static inline long delay_callback(__u64 idx, void *arg)
{
	__u64 target = *(__u64 *)arg;

	if (target <= bpf_ktime_get_ns())
		return 1;

	/* just to kill time */
	(void)bpf_get_prandom_u32();

	return 0;
}

static inline void do_lock_delay(__u64 duration)
{
	__u64 target = bpf_ktime_get_ns() + duration;

	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
}

static inline void check_lock_delay(__u64 lock)
{
	__u64 *delay;

	delay = bpf_map_lookup_elem(&lock_delays, &lock);
	if (delay)
		do_lock_delay(*delay);
}

static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
	__u32 pid;
	struct tstamp_data *pelem;

	/* Use per-cpu array map for spinlock and rwlock */
	if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
		__u32 idx = 0;

		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
		/* Do not update the element for nested locks */
		if (pelem && pelem->lock)
			pelem = NULL;
		return pelem;
	}

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	/* Do not update the element for nested locks */
	if (pelem && pelem->lock)
		return NULL;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}

		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}
	}
	return pelem;
}

static inline s32 get_owner_stack_id(u64 *stacktrace)
{
	s32 *id, new_id;
	static s64 id_gen = 1;

	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
	if (id)
		return *id;

	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);

	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);

	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
	if (id)
		return *id;

	return -1;
}

static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
{
	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, count);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;
}

static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
{
	struct contention_key key = {
		.stack_id = id,
		.pid = 0,
		.lock_addr_or_cgroup = 0,
	};
	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);

	if (!data) {
		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = flags,
		};

		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
	} else {
		update_contention_data(data, duration, 1);
	}
}

SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pelem = get_tstamp_elem(ctx[1]);
	if (pelem == NULL)
		return 0;

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		u32 i = 0;
		u32 id = 0;
		int owner_pid;
		u64 *buf;
		struct task_struct *task;
		struct owner_tracing_data *otdata;

		if (!lock_owner)
			goto skip_owner;

		task = get_lock_owner(pelem->lock, pelem->flags);
		if (!task)
			goto skip_owner;

		owner_pid = BPF_CORE_READ(task, pid);

		buf = bpf_map_lookup_elem(&stack_buf, &i);
		if (!buf)
			goto skip_owner;
		for (i = 0; i < max_stack; i++)
			buf[i] = 0x0;

		if (!bpf_task_from_pid)
			goto skip_owner;

		task = bpf_task_from_pid(owner_pid);
		if (!task)
			goto skip_owner;

		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
		bpf_task_release(task);

		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
		id = get_owner_stack_id(buf);

		/*
		 * Contention has just started, or (corner case) the lock is owned by a
		 * process other than `owner_pid`.  For the corner case we treat it as an
		 * unexpected internal error and just ignore the previous tracing record.
		 */
		if (!otdata || otdata->pid != owner_pid) {
			struct owner_tracing_data first = {
				.pid = owner_pid,
				.timestamp = pelem->timestamp,
				.count = 1,
				.stack_id = id,
			};

			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
		}
		/* Contention is ongoing and a new waiter joins */
		else {
			__sync_fetch_and_add(&otdata->count, 1);

			/*
			 * The owner is the same, but the stacktrace might have changed.
			 * In this case we store/update `owner_stat` based on the current
			 * owner stack id.
			 */
			if (id != otdata->stack_id) {
				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
						  pelem->flags);

				otdata->timestamp = pelem->timestamp;
				otdata->stack_id = id;
			}
		}
skip_owner:
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			__sync_fetch_and_add(&stack_fail, 1);
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore.  Pass the owner pid instead. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid = 0, idx = 0;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 timestamp;
	__u64 duration;
	bool need_delete = false;

	if (!enabled)
		return 0;

	/*
	 * For spinlock and rwlock, the timestamp is kept in the per-cpu map.
	 * However, contention_end does not have the flags, so it cannot know
	 * whether to read the per-cpu or the hash map.
	 *
	 * Try the per-cpu map first and check if there's active contention.
	 * If there is, do not read the hash map because a task cannot go to
	 * sleeping locks before releasing its spinning locks.
	 */
	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
	if (pelem && pelem->lock) {
		if (pelem->lock != ctx[0])
			return 0;
	} else {
		pid = bpf_get_current_pid_tgid();
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (!pelem || pelem->lock != ctx[0])
			return 0;
		need_delete = true;
	}

	timestamp = bpf_ktime_get_ns();
	duration = timestamp - pelem->timestamp;
	if ((__s64)duration < 0) {
		__sync_fetch_and_add(&time_fail, 1);
		goto out;
	}

	if (needs_callstack && lock_owner) {
		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);

		if (!otdata)
			goto skip_owner;

		/* Update `owner_stat` */
		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);

		/* No contention is ongoing anymore; delete the `lock` entry in `owner_data` */
		if (otdata->count <= 1)
			bpf_map_delete_elem(&owner_data, &pelem->lock);
		/*
		 * Contention is still ongoing, with a new owner (the current task).
		 * `owner_data` should be updated accordingly.
		 */
		else {
			u32 i = 0;
			s32 ret = (s32)ctx[1];
			u64 *buf;

			otdata->timestamp = timestamp;
			__sync_fetch_and_add(&otdata->count, -1);

			buf = bpf_map_lookup_elem(&stack_buf, &i);
			if (!buf)
				goto skip_owner;
			for (i = 0; i < (u32)max_stack; i++)
				buf[i] = 0x0;

			/*
			 * `ret` has the return code of the lock function.
			 * If `ret` is negative, the current task terminated lock waiting
			 * without acquiring it.  The owner is not changed, but we still
			 * need to update the owner stack.
			 */
			if (ret < 0) {
				s32 id = 0;
				struct task_struct *task;

				if (!bpf_task_from_pid)
					goto skip_owner;

				task = bpf_task_from_pid(otdata->pid);
				if (!task)
					goto skip_owner;

				bpf_get_task_stack(task, buf,
						   max_stack * sizeof(unsigned long), 0);
				bpf_task_release(task);

				id = get_owner_stack_id(buf);

				/*
				 * If the owner stack has changed, update the owner stack id
				 * for this lock.
				 */
				if (id != otdata->stack_id)
					otdata->stack_id = id;
			}
			/*
			 * Otherwise, update the tracing data with the current task, which
			 * is the new owner.
			 */
			else {
				otdata->pid = pid;
				/*
				 * We don't want to retrieve the callstack here, since this is
				 * where the current task acquires the lock and it provides no
				 * additional information.  We simply assign -1 to invalidate it.
				 */
				otdata->stack_id = -1;
			}
		}
	}
skip_owner:
	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else {
			if (!need_delete)
				pid = bpf_get_current_pid_tgid();
			key.pid = pid;
		}
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr_or_cgroup = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_CGROUP:
		key.lock_addr_or_cgroup = get_current_cgroup_id();
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		if (data_map_full) {
			__sync_fetch_and_add(&data_fail, 1);
			goto out;
		}

		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};
		int err;

		if (aggr_mode == LOCK_AGGR_ADDR) {
			first.flags |= check_lock_type(pelem->lock,
						       pelem->flags & LCB_F_TYPE_MASK);

			/* Check if it's from a slab object */
			if (bpf_get_kmem_cache) {
				struct kmem_cache *s;
				struct slab_cache_data *d;

				s = bpf_get_kmem_cache(pelem->lock);
				if (s != NULL) {
					/*
					 * Save the ID of the slab cache in the flags
					 * (instead of the full address) to reduce the
					 * space needed in the contention_data.
					 */
					d = bpf_map_lookup_elem(&slab_caches, &s);
					if (d != NULL)
						first.flags |= d->id;
				}
			}
		}

		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		if (err < 0) {
			if (err == -EEXIST) {
				/* it lost the race, try to get it again */
				data = bpf_map_lookup_elem(&lock_stat, &key);
				if (data != NULL)
					goto found;
			}
			if (err == -E2BIG)
				data_map_full = 1;
			__sync_fetch_and_add(&data_fail, 1);
		}
		goto out;
	}

found:
	update_contention_data(data, duration, 1);

out:
	if (lock_delay)
		check_lock_delay(pelem->lock);

	pelem->lock = 0;
	if (need_delete)
		bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

extern struct rq runqueues __ksym;

const volatile __u64 contig_page_data_addr;
const volatile __u64 node_data_addr;
const volatile int nr_nodes;
const volatile int sizeof_zone;

struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

static void collect_zone_lock(void)
{
	__u64 nr_zones, zone_off;
	__u64 lock_addr, lock_off;
	__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;

	zone_off = offsetof(struct pglist_data, node_zones);
	lock_off = offsetof(struct zone, lock);

	if (contig_page_data_addr) {
		struct pglist_data *contig_page_data;

		contig_page_data = (void *)(long)contig_page_data_addr;
		nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);

		for (int i = 0; i < MAX_ZONES; i++) {
			__u64 zone_addr;

			if (i >= nr_zones)
				break;

			zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
			lock_addr = zone_addr + lock_off;

			bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
		}
	} else if (nr_nodes > 0) {
		struct pglist_data **node_data = (void *)(long)node_data_addr;

		for (int i = 0; i < nr_nodes; i++) {
			struct pglist_data *pgdat = NULL;
			int err;

			err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
			if (err < 0 || pgdat == NULL)
				break;

			nr_zones = BPF_CORE_READ(pgdat, nr_zones);
			for (int k = 0; k < MAX_ZONES; k++) {
				__u64 zone_addr;

				if (k >= nr_zones)
					break;

				zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
				lock_addr = zone_addr + lock_off;

				bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
			}
		}
	}
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr, lock_off;
	__u32 lock_flag;

	if (bpf_core_field_exists(struct rq___new, __lock))
		lock_off = offsetof(struct rq___new, __lock);
	else
		lock_off = offsetof(struct rq___old, lock);

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)(void *)rq + lock_off;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}

	collect_zone_lock();

	return 0;
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
	end_ts = bpf_ktime_get_ns();
	return 0;
}

/*
 * bpf_iter__kmem_cache was added recently, so old kernels don't have it in
 * their vmlinux.h.  But we cannot add it here since it would cause a compiler
 * error due to redefinition of the struct on later kernels.
 *
 * So it uses a CO-RE trick to access the member only if it has the type.
 * This will support both old and new kernels without compiler errors.
 */
struct bpf_iter__kmem_cache___new {
	struct kmem_cache *s;
} __attribute__((preserve_access_index));

SEC("iter/kmem_cache")
int slab_cache_iter(void *ctx)
{
	struct kmem_cache *s = NULL;
	struct slab_cache_data d;
	const char *nameptr;

	if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
		struct bpf_iter__kmem_cache___new *iter = ctx;

		s = iter->s;
	}

	if (s == NULL)
		return 0;

	nameptr = s->name;
	bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);

	d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
	if (d.id >= LCB_F_SLAB_ID_END)
		return 0;

	bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";