// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* for collect_zone_lock().  It should be more than the actual zones. */
#define MAX_ZONES  10

/* for do_lock_delay().  Arbitrarily set to 1 million. */
#define MAX_LOOP  (1U << 20)

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN    (1U << 0)
#define LCB_F_READ    (1U << 1)
#define LCB_F_WRITE   (1U << 2)
#define LCB_F_RT      (1U << 3)
#define LCB_F_PERCPU  (1U << 4)
#define LCB_F_MUTEX   (1U << 5)

/* callstack storage */
struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* buffer for owner stacktrace */
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, 1);
} stack_buf SEC(".maps");

/* a map for tracing owner stacktrace to owner stack id */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64)); // owner stacktrace
        __uint(value_size, sizeof(__s32)); // owner stack id
        __uint(max_entries, 1);
} owner_stacks SEC(".maps");

/* a map for tracing lock address to owner data */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64)); // lock address
        __uint(value_size, sizeof(struct owner_tracing_data));
        __uint(max_entries, 1);
} owner_data SEC(".maps");

/* a map for contention_key (stores owner stack id) to contention data */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct contention_key));
        __uint(value_size, sizeof(struct contention_data));
        __uint(max_entries, 1);
} owner_stat SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __type(key, int);
        __type(value, struct tstamp_data);
        __uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct tstamp_data));
        __uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct contention_key));
        __uint(value_size, sizeof(struct contention_data));
        __uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

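/* pid to task name (comm), used by the per-task aggregation mode */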
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct contention_task_data));
        __uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

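/* lock address to lock class (e.g. rq lock, zone lock), filled by collect_lock_syms() */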
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u32));
        __uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

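/*
 * Filter maps checked in can_record().  Only the presence of a key matters;
 * the __u8 value is a dummy.
 */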
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cgroup_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(long));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} slab_filter SEC(".maps");

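/* slab cache address to cache name and id, filled by slab_cache_iter() */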
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(long));
        __uint(value_size, sizeof(struct slab_cache_data));
        __uint(max_entries, 1);
} slab_caches SEC(".maps");

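/* lock address to extra delay in nsec, applied at contention_end (see check_lock_delay()) */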
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, 1);
} lock_delays SEC(".maps");

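/*
 * Old and new layouts of kernel structs.  CO-RE (preserve_access_index)
 * resolves the one that matches the running kernel at load time.
 */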
struct rw_semaphore___old {
        struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
        atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
        struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
        struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;

/* control flags */
const volatile int has_cpu;
const volatile int has_task;
const volatile int has_type;
const volatile int has_addr;
const volatile int has_cgroup;
const volatile int has_slab;
const volatile int needs_callstack;
const volatile int stack_skip;
const volatile int lock_owner;
const volatile int use_cgroup_v2;
const volatile int max_stack;
const volatile int lock_delay;

/* determine the key of lock stat */
const volatile int aggr_mode;

int enabled;

int perf_subsys_id = -1;

__u64 end_ts;

__u32 slab_cache_id;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
void bpf_task_release(struct task_struct *p) __ksym __weak;

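/*
 * Return the cgroup id of the current task: the cgroup v2 id, or the id of
 * the perf_event subsystem cgroup on cgroup v1.
 */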
static inline __u64 get_current_cgroup_id(void)
{
        struct task_struct *task;
        struct cgroup *cgrp;

        if (use_cgroup_v2)
                return bpf_get_current_cgroup_id();

        task = bpf_get_current_task_btf();

        if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
                perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                                     perf_event_cgrp_id);
#else
                perf_subsys_id = perf_event_cgrp_id;
#endif
        }

        cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
        return BPF_CORE_READ(cgrp, kn, id);
}

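/* apply the cpu/task/type/addr/cgroup/slab filters; return 1 if the event should be recorded */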
static inline int can_record(u64 *ctx)
{
        if (has_cpu) {
                __u32 cpu = bpf_get_smp_processor_id();
                __u8 *ok;

                ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
                if (!ok)
                        return 0;
        }

        if (has_task) {
                __u8 *ok;
                __u32 pid = bpf_get_current_pid_tgid();

                ok = bpf_map_lookup_elem(&task_filter, &pid);
                if (!ok)
                        return 0;
        }

        if (has_type) {
                __u8 *ok;
                __u32 flags = (__u32)ctx[1];

                ok = bpf_map_lookup_elem(&type_filter, &flags);
                if (!ok)
                        return 0;
        }

        if (has_addr) {
                __u8 *ok;
                __u64 addr = ctx[0];

                ok = bpf_map_lookup_elem(&addr_filter, &addr);
                if (!ok && !has_slab)
                        return 0;
        }

        if (has_cgroup) {
                __u8 *ok;
                __u64 cgrp = get_current_cgroup_id();

                ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
                if (!ok)
                        return 0;
        }

        if (has_slab && bpf_get_kmem_cache) {
                __u8 *ok;
                __u64 addr = ctx[0];
                long kmem_cache_addr;

                kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
                ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
                if (!ok)
                        return 0;
        }

        return 1;
}

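/* save the task's comm (keyed by pid) in task_data unless the map is already full */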
static inline int update_task_data(struct task_struct *task)
{
        struct contention_task_data *p;
        int pid, err;

        err = bpf_core_read(&pid, sizeof(pid), &task->pid);
        if (err)
                return -1;

        p = bpf_map_lookup_elem(&task_data, &pid);
        if (p == NULL && !task_map_full) {
                struct contention_task_data data = {};

                BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
                if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
                        task_map_full = 1;
        }

        return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

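/*
 * Read the owner task of a mutex or rw_semaphore.  The owner word stores the
 * task pointer with flag bits in the low bits, which are masked off.
 */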
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
        struct task_struct *task;
        __u64 owner = 0;

        if (flags & LCB_F_MUTEX) {
                struct mutex *mutex = (void *)lock;
                owner = BPF_CORE_READ(mutex, owner.counter);
        } else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
                /*
                 * Support for the BPF_TYPE_MATCHES argument to the
                 * __builtin_preserve_type_info builtin was added at some point during
                 * development of clang 15 and it's what is needed for
                 * bpf_core_type_matches.
                 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
                if (bpf_core_type_matches(struct rw_semaphore___old)) {
                        struct rw_semaphore___old *rwsem = (void *)lock;
                        owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
                } else if (bpf_core_type_matches(struct rw_semaphore___new)) {
                        struct rw_semaphore___new *rwsem = (void *)lock;
                        owner = BPF_CORE_READ(rwsem, owner.counter);
                }
#else
                /* assume new struct */
                struct rw_semaphore *rwsem = (void *)lock;
                owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
        }

        if (!owner)
                return NULL;

        task = (void *)(owner & ~7UL);
        return task;
}

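/* classify well-known locks (the current task's mmap lock or siglock) by address */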
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
        struct task_struct *curr;
        struct mm_struct___old *mm_old;
        struct mm_struct___new *mm_new;
        struct sighand_struct *sighand;

        switch (flags) {
        case LCB_F_READ:  /* rwsem */
        case LCB_F_WRITE:
                curr = bpf_get_current_task_btf();
                if (curr->mm == NULL)
                        break;
                mm_new = (void *)curr->mm;
                if (bpf_core_field_exists(mm_new->mmap_lock)) {
                        if (&mm_new->mmap_lock == (void *)lock)
                                return LCD_F_MMAP_LOCK;
                        break;
                }
                mm_old = (void *)curr->mm;
                if (bpf_core_field_exists(mm_old->mmap_sem)) {
                        if (&mm_old->mmap_sem == (void *)lock)
                                return LCD_F_MMAP_LOCK;
                }
                break;
        case LCB_F_SPIN:  /* spinlock */
                curr = bpf_get_current_task_btf();
                sighand = curr->sighand;

                if (sighand && &sighand->siglock == (void *)lock)
                        return LCD_F_SIGHAND_LOCK;
                break;
        default:
                break;
        }
        return 0;
}

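/*
 * Artificial delay injection: bpf_loop() keeps calling delay_callback()
 * until the target timestamp is reached.
 */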
static inline long delay_callback(__u64 idx, void *arg)
{
        __u64 target = *(__u64 *)arg;

        if (target <= bpf_ktime_get_ns())
                return 1;

        /* just to kill time */
        (void)bpf_get_prandom_u32();

        return 0;
}

static inline void do_lock_delay(__u64 duration)
{
        __u64 target = bpf_ktime_get_ns() + duration;

        bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
}

static inline void check_lock_delay(__u64 lock)
{
        __u64 *delay;

        delay = bpf_map_lookup_elem(&lock_delays, &lock);
        if (delay)
                do_lock_delay(*delay);
}

static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
        __u32 pid;
        struct tstamp_data *pelem;

        /* Use per-cpu array map for spinlock and rwlock */
        if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
                __u32 idx = 0;

                pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
                /* Do not update the element for nested locks */
                if (pelem && pelem->lock)
                        pelem = NULL;
                return pelem;
        }

        pid = bpf_get_current_pid_tgid();
        pelem = bpf_map_lookup_elem(&tstamp, &pid);
        /* Do not update the element for nested locks */
        if (pelem && pelem->lock)
                return NULL;

        if (pelem == NULL) {
                struct tstamp_data zero = {};

                if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
                        __sync_fetch_and_add(&task_fail, 1);
                        return NULL;
                }

                pelem = bpf_map_lookup_elem(&tstamp, &pid);
                if (pelem == NULL) {
                        __sync_fetch_and_add(&task_fail, 1);
                        return NULL;
                }
        }
        return pelem;
}

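/* map an owner stacktrace to a small integer id, allocating a new one on first use */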
static inline s32 get_owner_stack_id(u64 *stacktrace)
{
        s32 *id, new_id;
        static s64 id_gen = 1;

        id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
        if (id)
                return *id;

        new_id = (s32)__sync_fetch_and_add(&id_gen, 1);

        bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);

        id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
        if (id)
                return *id;

        return -1;
}

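/* accumulate total time and count for an existing entry, tracking max/min as well */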
static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
{
        __sync_fetch_and_add(&data->total_time, duration);
        __sync_fetch_and_add(&data->count, count);

        /* FIXME: need atomic operations */
        if (data->max_time < duration)
                data->max_time = duration;
        if (data->min_time > duration)
                data->min_time = duration;
}

static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
{
        struct contention_key key = {
                .stack_id = id,
                .pid = 0,
                .lock_addr_or_cgroup = 0,
        };
        struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);

        if (!data) {
                struct contention_data first = {
                        .total_time = duration,
                        .max_time = duration,
                        .min_time = duration,
                        .count = 1,
                        .flags = flags,
                };
                bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
        } else {
                update_contention_data(data, duration, 1);
        }
}

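/*
 * Handler for the contention_begin tracepoint: ctx[0] is the lock address and
 * ctx[1] carries the LCB_F_* flags.  Saves a timestamp and, depending on the
 * mode, the waiter callstack and owner-tracking data.
 */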
SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
        struct tstamp_data *pelem;

        if (!enabled || !can_record(ctx))
                return 0;

        pelem = get_tstamp_elem(ctx[1]);
        if (pelem == NULL)
                return 0;

        pelem->timestamp = bpf_ktime_get_ns();
        pelem->lock = (__u64)ctx[0];
        pelem->flags = (__u32)ctx[1];

        if (needs_callstack) {
                u32 i = 0;
                u32 id = 0;
                int owner_pid;
                u64 *buf;
                struct task_struct *task;
                struct owner_tracing_data *otdata;

                if (!lock_owner)
                        goto skip_owner;

                task = get_lock_owner(pelem->lock, pelem->flags);
                if (!task)
                        goto skip_owner;

                owner_pid = BPF_CORE_READ(task, pid);

                buf = bpf_map_lookup_elem(&stack_buf, &i);
                if (!buf)
                        goto skip_owner;
                for (i = 0; i < max_stack; i++)
                        buf[i] = 0x0;

                if (!bpf_task_from_pid)
                        goto skip_owner;

                task = bpf_task_from_pid(owner_pid);
                if (!task)
                        goto skip_owner;

                bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
                bpf_task_release(task);

                otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
                id = get_owner_stack_id(buf);

                /*
                 * Contention has just begun, or (corner case) `lock` is owned by a
                 * process other than `owner_pid`.  For the corner case, treat it as an
                 * unexpected internal error and just ignore the previous tracing record.
                 */
                if (!otdata || otdata->pid != owner_pid) {
                        struct owner_tracing_data first = {
                                .pid = owner_pid,
                                .timestamp = pelem->timestamp,
                                .count = 1,
                                .stack_id = id,
                        };
                        bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
                }
                /* Contention is ongoing and a new waiter joins */
                else {
                        __sync_fetch_and_add(&otdata->count, 1);

                        /*
                         * The owner is the same, but the stacktrace might have changed.
                         * In this case we store/update `owner_stat` based on the current
                         * owner stack id.
                         */
                        if (id != otdata->stack_id) {
                                update_owner_stat(id, pelem->timestamp - otdata->timestamp,
                                                  pelem->flags);

                                otdata->timestamp = pelem->timestamp;
                                otdata->stack_id = id;
                        }
                }
skip_owner:
                pelem->stack_id = bpf_get_stackid(ctx, &stacks,
                                                  BPF_F_FAST_STACK_CMP | stack_skip);
                if (pelem->stack_id < 0)
                        __sync_fetch_and_add(&stack_fail, 1);
        } else if (aggr_mode == LOCK_AGGR_TASK) {
                struct task_struct *task;

                if (lock_owner) {
                        task = get_lock_owner(pelem->lock, pelem->flags);

                        /* The flags field is not used anymore.  Pass the owner pid instead. */
                        if (task)
                                pelem->flags = BPF_CORE_READ(task, pid);
                        else
                                pelem->flags = -1U;

                } else {
                        task = bpf_get_current_task_btf();
                }

                if (task) {
                        if (update_task_data(task) < 0 && lock_owner)
                                pelem->flags = -1U;
                }
        }

        return 0;
}

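/*
 * Handler for the contention_end tracepoint: computes the wait duration and
 * updates the lock_stat map (and, if enabled, the owner statistics).
 */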
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
        __u32 pid = 0, idx = 0;
        struct tstamp_data *pelem;
        struct contention_key key = {};
        struct contention_data *data;
        __u64 timestamp;
        __u64 duration;
        bool need_delete = false;

        if (!enabled)
                return 0;

        /*
         * For spinlocks and rwlocks, the timestamp lives in the per-cpu map.
         * However, contention_end does not carry the flags, so it cannot tell
         * whether to read the per-cpu map or the hash map.
         *
         * Try the per-cpu map first and check if there is active contention.
         * If there is, do not read the hash map, because a task cannot wait on
         * sleeping locks before releasing its spinning locks.
         */
        pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
        if (pelem && pelem->lock) {
                if (pelem->lock != ctx[0])
                        return 0;
        } else {
                pid = bpf_get_current_pid_tgid();
                pelem = bpf_map_lookup_elem(&tstamp, &pid);
                if (!pelem || pelem->lock != ctx[0])
                        return 0;
                need_delete = true;
        }

        timestamp = bpf_ktime_get_ns();
        duration = timestamp - pelem->timestamp;
        if ((__s64)duration < 0) {
                __sync_fetch_and_add(&time_fail, 1);
                goto out;
        }

        if (needs_callstack && lock_owner) {
                struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);

                if (!otdata)
                        goto skip_owner;

                /* Update `owner_stat` */
                update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);

                /* No contention is occurring, delete `lock` entry in `owner_data` */
                if (otdata->count <= 1)
                        bpf_map_delete_elem(&owner_data, &pelem->lock);
                /*
                 * Contention is still ongoing, with a new owner (current task). `owner_data`
                 * should be updated accordingly.
                 */
                else {
                        u32 i = 0;
                        s32 ret = (s32)ctx[1];
                        u64 *buf;

                        otdata->timestamp = timestamp;
                        __sync_fetch_and_add(&otdata->count, -1);

                        buf = bpf_map_lookup_elem(&stack_buf, &i);
                        if (!buf)
                                goto skip_owner;
                        for (i = 0; i < (u32)max_stack; i++)
                                buf[i] = 0x0;

                        /*
                         * `ret` has the return code of the lock function.
                         * If `ret` is negative, the current task terminates lock waiting without
                         * acquiring it. Owner is not changed, but we still need to update the owner
                         * stack.
                         */
                        if (ret < 0) {
                                s32 id = 0;
                                struct task_struct *task;

                                if (!bpf_task_from_pid)
                                        goto skip_owner;

                                task = bpf_task_from_pid(otdata->pid);
                                if (!task)
                                        goto skip_owner;

                                bpf_get_task_stack(task, buf,
                                                   max_stack * sizeof(unsigned long), 0);
                                bpf_task_release(task);

                                id = get_owner_stack_id(buf);

                                /*
                                 * If owner stack is changed, update owner stack id for this lock.
                                 */
                                if (id != otdata->stack_id)
                                        otdata->stack_id = id;
                        }
                        /*
                         * Otherwise, update tracing data with the current task, which is the new
                         * owner.
                         */
                        else {
                                otdata->pid = pid;
                                /*
                                 * We don't want to retrieve callstack here, since it is where the
                                 * current task acquires the lock and provides no additional
                                 * information. We simply assign -1 to invalidate it.
                                 */
                                otdata->stack_id = -1;
                        }
                }
        }
skip_owner:
        switch (aggr_mode) {
        case LOCK_AGGR_CALLER:
                key.stack_id = pelem->stack_id;
                break;
        case LOCK_AGGR_TASK:
                if (lock_owner)
                        key.pid = pelem->flags;
                else {
                        if (!need_delete)
                                pid = bpf_get_current_pid_tgid();
                        key.pid = pid;
                }
                if (needs_callstack)
                        key.stack_id = pelem->stack_id;
                break;
        case LOCK_AGGR_ADDR:
                key.lock_addr_or_cgroup = pelem->lock;
                if (needs_callstack)
                        key.stack_id = pelem->stack_id;
                break;
        case LOCK_AGGR_CGROUP:
                key.lock_addr_or_cgroup = get_current_cgroup_id();
                break;
        default:
                /* should not happen */
                return 0;
        }

        data = bpf_map_lookup_elem(&lock_stat, &key);
        if (!data) {
                if (data_map_full) {
                        __sync_fetch_and_add(&data_fail, 1);
                        goto out;
                }

                struct contention_data first = {
                        .total_time = duration,
                        .max_time = duration,
                        .min_time = duration,
                        .count = 1,
                        .flags = pelem->flags,
                };
                int err;

                if (aggr_mode == LOCK_AGGR_ADDR) {
                        first.flags |= check_lock_type(pelem->lock,
                                                       pelem->flags & LCB_F_TYPE_MASK);

                        /* Check if it's from a slab object */
                        if (bpf_get_kmem_cache) {
                                struct kmem_cache *s;
                                struct slab_cache_data *d;

                                s = bpf_get_kmem_cache(pelem->lock);
                                if (s != NULL) {
                                        /*
                                         * Save the ID of the slab cache in the flags
                                         * (instead of full address) to reduce the
                                         * space in the contention_data.
                                         */
                                        d = bpf_map_lookup_elem(&slab_caches, &s);
                                        if (d != NULL)
                                                first.flags |= d->id;
                                }
                        }
                }

                err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
                if (err < 0) {
                        if (err == -EEXIST) {
                                /* it lost the race, try to get it again */
                                data = bpf_map_lookup_elem(&lock_stat, &key);
                                if (data != NULL)
                                        goto found;
                        }
                        if (err == -E2BIG)
                                data_map_full = 1;
                        __sync_fetch_and_add(&data_fail, 1);
                }
                goto out;
        }

found:
        update_contention_data(data, duration, 1);

out:
        if (lock_delay)
                check_lock_delay(pelem->lock);

        pelem->lock = 0;
        if (need_delete)
                bpf_map_delete_elem(&tstamp, &pid);
        return 0;
}

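/*
 * Data for collect_lock_syms(): the per-cpu runqueues ksym plus node/zone
 * layout information that user space fills in before loading the skeleton.
 */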
extern struct rq runqueues __ksym;

const volatile __u64 contig_page_data_addr;
const volatile __u64 node_data_addr;
const volatile int nr_nodes;
const volatile int sizeof_zone;

struct rq___old {
        raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
        raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

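/* record the address of each zone->lock in lock_syms for symbolization */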
static void collect_zone_lock(void)
{
        __u64 nr_zones, zone_off;
        __u64 lock_addr, lock_off;
        __u32 lock_flag = LOCK_CLASS_ZONE_LOCK;

        zone_off = offsetof(struct pglist_data, node_zones);
        lock_off = offsetof(struct zone, lock);

        if (contig_page_data_addr) {
                struct pglist_data *contig_page_data;

                contig_page_data = (void *)(long)contig_page_data_addr;
                nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);

                for (int i = 0; i < MAX_ZONES; i++) {
                        __u64 zone_addr;

                        if (i >= nr_zones)
                                break;

                        zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
                        lock_addr = zone_addr + lock_off;

                        bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
                }
        } else if (nr_nodes > 0) {
                struct pglist_data **node_data = (void *)(long)node_data_addr;

                for (int i = 0; i < nr_nodes; i++) {
                        struct pglist_data *pgdat = NULL;
                        int err;

                        err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
                        if (err < 0 || pgdat == NULL)
                                break;

                        nr_zones = BPF_CORE_READ(pgdat, nr_zones);
                        for (int k = 0; k < MAX_ZONES; k++) {
                                __u64 zone_addr;

                                if (k >= nr_zones)
                                        break;

                                zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
                                lock_addr = zone_addr + lock_off;

                                bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
                        }
                }
        }
}

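/*
 * Record the address of each CPU's runqueue lock (and the zone locks) in
 * lock_syms.  The raw tracepoint section serves as an attach point; the
 * program is presumably triggered on demand by the perf tool.
 */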
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
        __u64 lock_addr, lock_off;
        __u32 lock_flag;

        if (bpf_core_field_exists(struct rq___new, __lock))
                lock_off = offsetof(struct rq___new, __lock);
        else
                lock_off = offsetof(struct rq___old, lock);

        for (int i = 0; i < MAX_CPUS; i++) {
                struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

                if (rq == NULL)
                        break;

                lock_addr = (__u64)(void *)rq + lock_off;
                lock_flag = LOCK_CLASS_RQLOCK;
                bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
        }

        collect_zone_lock();

        return 0;
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
        end_ts = bpf_ktime_get_ns();
        return 0;
}

/*
 * bpf_iter__kmem_cache was added recently, so old kernels don't have it in
 * their vmlinux.h.  But we cannot simply add it here since that would cause
 * a compiler error due to a redefinition of the struct on later kernels.
 *
 * So use a CO-RE trick to access the member only if the type exists.
 * This supports both old and new kernels without compiler errors.
 */
struct bpf_iter__kmem_cache___new {
        struct kmem_cache *s;
} __attribute__((preserve_access_index));

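/* iterate over all slab caches and record each cache's name and id, keyed by its address */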
SEC("iter/kmem_cache")
int slab_cache_iter(void *ctx)
{
        struct kmem_cache *s = NULL;
        struct slab_cache_data d;
        const char *nameptr;

        if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
                struct bpf_iter__kmem_cache___new *iter = ctx;

                s = iter->s;
        }

        if (s == NULL)
                return 0;

        nameptr = s->name;
        bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);

        d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
        if (d.id >= LCB_F_SLAB_ID_END)
                return 0;

        bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
        return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";