// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

/* callstack storage */
struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, int);
        __type(value, struct tstamp_data);
        __uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct tstamp_data));
        __uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct contention_key));
        __uint(value_size, sizeof(struct contention_data));
        __uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct contention_task_data));
        __uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u32));
        __uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cgroup_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(long));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} slab_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(long));
        __uint(value_size, sizeof(struct slab_cache_data));
        __uint(max_entries, 1);
} slab_caches SEC(".maps");

struct rw_semaphore___old {
        struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
        atomic_long_t owner;
} __attribute__((preserve_access_index));
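
/*
 * The ___old/___new struct pairs (rw_semaphore above, mm_struct below)
 * describe both layouts of fields that changed across kernel versions.
 * CO-RE relocation picks the matching one on the running kernel:
 * rw_semaphore::owner (task_struct * vs. atomic_long_t) is probed in
 * get_lock_owner(), and the mmap_sem vs. mmap_lock rename is probed in
 * check_lock_type().
 */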

struct mm_struct___old {
        struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
        struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;

/* control flags */
const volatile int has_cpu;
const volatile int has_task;
const volatile int has_type;
const volatile int has_addr;
const volatile int has_cgroup;
const volatile int has_slab;
const volatile int needs_callstack;
const volatile int stack_skip;
const volatile int lock_owner;
const volatile int use_cgroup_v2;

/* determine the key of lock stat */
const volatile int aggr_mode;

int enabled;

int perf_subsys_id = -1;

__u64 end_ts;

__u32 slab_cache_id;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

static inline __u64 get_current_cgroup_id(void)
{
        struct task_struct *task;
        struct cgroup *cgrp;

        if (use_cgroup_v2)
                return bpf_get_current_cgroup_id();

        task = bpf_get_current_task_btf();

        if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
                perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                                     perf_event_cgrp_id);
#else
                perf_subsys_id = perf_event_cgrp_id;
#endif
        }

        cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
        return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(u64 *ctx)
{
        if (has_cpu) {
                __u32 cpu = bpf_get_smp_processor_id();
                __u8 *ok;

                ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
                if (!ok)
                        return 0;
        }

        if (has_task) {
                __u8 *ok;
                __u32 pid = bpf_get_current_pid_tgid();

                ok = bpf_map_lookup_elem(&task_filter, &pid);
                if (!ok)
                        return 0;
        }

        if (has_type) {
                __u8 *ok;
                __u32 flags = (__u32)ctx[1];

                ok = bpf_map_lookup_elem(&type_filter, &flags);
                if (!ok)
                        return 0;
        }

        if (has_addr) {
                __u8 *ok;
                __u64 addr = ctx[0];

                ok = bpf_map_lookup_elem(&addr_filter, &addr);
                if (!ok && !has_slab)
                        return 0;
        }

        if (has_cgroup) {
                __u8 *ok;
                __u64 cgrp = get_current_cgroup_id();

                ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
                if (!ok)
                        return 0;
        }

        if (has_slab && bpf_get_kmem_cache) {
                __u8 *ok;
                __u64 addr = ctx[0];
                long kmem_cache_addr;

                kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
                ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
                if (!ok)
                        return 0;
        }

        return 1;
}

static inline int update_task_data(struct task_struct *task)
{
        struct contention_task_data *p;
        int pid, err;

        err = bpf_core_read(&pid, sizeof(pid), &task->pid);
        if (err)
                return -1;

        p = bpf_map_lookup_elem(&task_data, &pid);
        if (p == NULL && !task_map_full) {
                struct contention_task_data data = {};

                BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
                if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
                        task_map_full = 1;
        }

        return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
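
/*
 * Return the task that currently owns the lock, or NULL if it cannot be
 * determined.  Both mutex and rw_semaphore keep the owner task pointer in
 * the 'owner' word with flag bits in the low bits, so those bits are
 * cleared with ~7UL before it is used as a task_struct pointer.
 */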
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
        struct task_struct *task;
        __u64 owner = 0;

        if (flags & LCB_F_MUTEX) {
                struct mutex *mutex = (void *)lock;
                owner = BPF_CORE_READ(mutex, owner.counter);
        } else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
        /*
         * Support for the BPF_TYPE_MATCHES argument to the
         * __builtin_preserve_type_info builtin was added at some point during
         * development of clang 15 and it's what is needed for
         * bpf_core_type_matches.
         */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
                if (bpf_core_type_matches(struct rw_semaphore___old)) {
                        struct rw_semaphore___old *rwsem = (void *)lock;
                        owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
                } else if (bpf_core_type_matches(struct rw_semaphore___new)) {
                        struct rw_semaphore___new *rwsem = (void *)lock;
                        owner = BPF_CORE_READ(rwsem, owner.counter);
                }
#else
                /* assume new struct */
                struct rw_semaphore *rwsem = (void *)lock;
                owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
        }

        if (!owner)
                return NULL;

        task = (void *)(owner & ~7UL);
        return task;
}

static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
        struct task_struct *curr;
        struct mm_struct___old *mm_old;
        struct mm_struct___new *mm_new;
        struct sighand_struct *sighand;

        switch (flags) {
        case LCB_F_READ:  /* rwsem */
        case LCB_F_WRITE:
                curr = bpf_get_current_task_btf();
                if (curr->mm == NULL)
                        break;
                mm_new = (void *)curr->mm;
                if (bpf_core_field_exists(mm_new->mmap_lock)) {
                        if (&mm_new->mmap_lock == (void *)lock)
                                return LCD_F_MMAP_LOCK;
                        break;
                }
                mm_old = (void *)curr->mm;
                if (bpf_core_field_exists(mm_old->mmap_sem)) {
                        if (&mm_old->mmap_sem == (void *)lock)
                                return LCD_F_MMAP_LOCK;
                }
                break;
        case LCB_F_SPIN:  /* spinlock */
                curr = bpf_get_current_task_btf();
                sighand = curr->sighand;

                if (sighand && &sighand->siglock == (void *)lock)
                        return LCD_F_SIGHAND_LOCK;
                break;
        default:
                break;
        }
        return 0;
}

static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
        __u32 pid;
        struct tstamp_data *pelem;

        /* Use per-cpu array map for spinlock and rwlock */
        if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
                __u32 idx = 0;

                pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
                /* Do not update the element for nested locks */
                if (pelem && pelem->lock)
                        pelem = NULL;
                return pelem;
        }

        pid = bpf_get_current_pid_tgid();
        pelem = bpf_map_lookup_elem(&tstamp, &pid);
        /* Do not update the element for nested locks */
        if (pelem && pelem->lock)
                return NULL;

        if (pelem == NULL) {
                struct tstamp_data zero = {};

                if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
                        __sync_fetch_and_add(&task_fail, 1);
                        return NULL;
                }

                pelem = bpf_map_lookup_elem(&tstamp, &pid);
                if (pelem == NULL) {
                        __sync_fetch_and_add(&task_fail, 1);
                        return NULL;
                }
        }
        return pelem;
}

SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
        struct tstamp_data *pelem;

        if (!enabled || !can_record(ctx))
                return 0;

        pelem = get_tstamp_elem(ctx[1]);
        if (pelem == NULL)
                return 0;

        pelem->timestamp = bpf_ktime_get_ns();
        pelem->lock = (__u64)ctx[0];
        pelem->flags = (__u32)ctx[1];

        if (needs_callstack) {
                pelem->stack_id = bpf_get_stackid(ctx, &stacks,
                                                  BPF_F_FAST_STACK_CMP | stack_skip);
                if (pelem->stack_id < 0)
                        __sync_fetch_and_add(&stack_fail, 1);
        } else if (aggr_mode == LOCK_AGGR_TASK) {
                struct task_struct *task;

                if (lock_owner) {
                        task = get_lock_owner(pelem->lock, pelem->flags);

                        /* The flags field is not used anymore.  Pass the owner pid instead. */
                        if (task)
                                pelem->flags = BPF_CORE_READ(task, pid);
                        else
                                pelem->flags = -1U;

                } else {
                        task = bpf_get_current_task_btf();
                }

                if (task) {
                        if (update_task_data(task) < 0 && lock_owner)
                                pelem->flags = -1U;
                }
        }

        return 0;
}
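
/*
 * contention_end pairs with contention_begin above: look up the saved
 * timestamp, compute the contention duration, and accumulate it into
 * lock_stat under the key selected by aggr_mode.
 */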
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
        __u32 pid = 0, idx = 0;
        struct tstamp_data *pelem;
        struct contention_key key = {};
        struct contention_data *data;
        __u64 duration;
        bool need_delete = false;

        if (!enabled)
                return 0;

        /*
         * For spinlocks and rwlocks, the timestamp was saved in the per-cpu
         * map.  However, contention_end does not receive the flags, so it
         * cannot tell whether to read the per-cpu or the hash map.
         *
         * Try the per-cpu map first and check if there is an active
         * contention.  If there is, skip the hash map, because the task
         * cannot block on a sleeping lock before releasing its spinning locks.
         */
        pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
        if (pelem && pelem->lock) {
                if (pelem->lock != ctx[0])
                        return 0;
        } else {
                pid = bpf_get_current_pid_tgid();
                pelem = bpf_map_lookup_elem(&tstamp, &pid);
                if (!pelem || pelem->lock != ctx[0])
                        return 0;
                need_delete = true;
        }

        duration = bpf_ktime_get_ns() - pelem->timestamp;
        if ((__s64)duration < 0) {
                __sync_fetch_and_add(&time_fail, 1);
                goto out;
        }

        switch (aggr_mode) {
        case LOCK_AGGR_CALLER:
                key.stack_id = pelem->stack_id;
                break;
        case LOCK_AGGR_TASK:
                if (lock_owner)
                        key.pid = pelem->flags;
                else {
                        if (!need_delete)
                                pid = bpf_get_current_pid_tgid();
                        key.pid = pid;
                }
                if (needs_callstack)
                        key.stack_id = pelem->stack_id;
                break;
        case LOCK_AGGR_ADDR:
                key.lock_addr_or_cgroup = pelem->lock;
                if (needs_callstack)
                        key.stack_id = pelem->stack_id;
                break;
        case LOCK_AGGR_CGROUP:
                key.lock_addr_or_cgroup = get_current_cgroup_id();
                break;
        default:
                /* should not happen */
                return 0;
        }

        data = bpf_map_lookup_elem(&lock_stat, &key);
        if (!data) {
                if (data_map_full) {
                        __sync_fetch_and_add(&data_fail, 1);
                        goto out;
                }

                struct contention_data first = {
                        .total_time = duration,
                        .max_time = duration,
                        .min_time = duration,
                        .count = 1,
                        .flags = pelem->flags,
                };
                int err;

                if (aggr_mode == LOCK_AGGR_ADDR) {
                        first.flags |= check_lock_type(pelem->lock,
                                                       pelem->flags & LCB_F_TYPE_MASK);

                        /* Check if it's from a slab object */
                        if (bpf_get_kmem_cache) {
                                struct kmem_cache *s;
                                struct slab_cache_data *d;

                                s = bpf_get_kmem_cache(pelem->lock);
                                if (s != NULL) {
                                        /*
                                         * Save the ID of the slab cache in the
                                         * flags (instead of the full address)
                                         * to save space in contention_data.
                                         */
                                        d = bpf_map_lookup_elem(&slab_caches, &s);
                                        if (d != NULL)
                                                first.flags |= d->id;
                                }
                        }
                }

                err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
                if (err < 0) {
                        if (err == -EEXIST) {
                                /* it lost the race, try to get it again */
                                data = bpf_map_lookup_elem(&lock_stat, &key);
                                if (data != NULL)
                                        goto found;
                        }
                        if (err == -E2BIG)
                                data_map_full = 1;
                        __sync_fetch_and_add(&data_fail, 1);
                }
                goto out;
        }

found:
        __sync_fetch_and_add(&data->total_time, duration);
        __sync_fetch_and_add(&data->count, 1);

        /* FIXME: need atomic operations */
        if (data->max_time < duration)
                data->max_time = duration;
        if (data->min_time > duration)
                data->min_time = duration;

out:
        pelem->lock = 0;
        if (need_delete)
                bpf_map_delete_elem(&tstamp, &pid);
        return 0;
}
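
/*
 * Collect the address of each CPU's runqueue lock into lock_syms so the
 * addresses can be recognized as rq locks.  The field was renamed from
 * 'lock' to '__lock' at some point, hence the two rq variants below.
 */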
extern struct rq runqueues __ksym;

struct rq___old {
        raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
        raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
        __u64 lock_addr, lock_off;
        __u32 lock_flag;

        if (bpf_core_field_exists(struct rq___new, __lock))
                lock_off = offsetof(struct rq___new, __lock);
        else
                lock_off = offsetof(struct rq___old, lock);

        for (int i = 0; i < MAX_CPUS; i++) {
                struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

                if (rq == NULL)
                        break;

                lock_addr = (__u64)(void *)rq + lock_off;
                lock_flag = LOCK_CLASS_RQLOCK;
                bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
        }
        return 0;
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
        end_ts = bpf_ktime_get_ns();
        return 0;
}

/*
 * struct bpf_iter__kmem_cache was added recently, so old kernels don't have
 * it in their vmlinux.h.  It cannot simply be defined here either, because
 * that would redefine the struct and break the build on newer kernels.
 *
 * Instead, use a CO-RE trick to access the member only if the running kernel
 * has the type.  This supports both old and new kernels without compiler
 * errors.
 */
struct bpf_iter__kmem_cache___new {
        struct kmem_cache *s;
} __attribute__((preserve_access_index));

SEC("iter/kmem_cache")
int slab_cache_iter(void *ctx)
{
        struct kmem_cache *s = NULL;
        struct slab_cache_data d;
        const char *nameptr;

        if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
                struct bpf_iter__kmem_cache___new *iter = ctx;

                s = iter->s;
        }

        if (s == NULL)
                return 0;

        nameptr = s->name;
        bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);

        d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
        if (d.id >= LCB_F_SLAB_ID_END)
                return 0;

        bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
        return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";