1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 // Copyright (c) 2022 Google
3 #include "vmlinux.h"
4 #include <bpf/bpf_helpers.h>
5 #include <bpf/bpf_tracing.h>
6 #include <bpf/bpf_core_read.h>
7 #include <asm-generic/errno-base.h>
8 
9 #include "lock_data.h"
10 
11 /* for collect_lock_syms().  4096 was rejected by the verifier */
12 #define MAX_CPUS  1024
13 
14 /* for collect_zone_lock().  It should be more than the actual number of zones. */
15 #define MAX_ZONES  10
16 
17 /* for do_lock_delay().  Arbitrarily set to 1 million. */
18 #define MAX_LOOP  (1U << 20)
19 
20 /* lock contention flags from include/trace/events/lock.h */
21 #define LCB_F_SPIN	(1U << 0)
22 #define LCB_F_READ	(1U << 1)
23 #define LCB_F_WRITE	(1U << 2)
24 #define LCB_F_RT	(1U << 3)
25 #define LCB_F_PERCPU	(1U << 4)
26 #define LCB_F_MUTEX	(1U << 5)
27 
28 /* callstack storage  */
29 struct {
30 	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
31 	__uint(key_size, sizeof(__u32));
32 	__uint(value_size, sizeof(__u64));
33 	__uint(max_entries, MAX_ENTRIES);
34 } stacks SEC(".maps");
35 
36 /* buffer for owner stacktrace */
37 struct {
38 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
39 	__uint(key_size, sizeof(__u32));
40 	__uint(value_size, sizeof(__u64));
41 	__uint(max_entries, 1);
42 } stack_buf SEC(".maps");
43 
44 /* a map translating an owner stacktrace to an owner stack id */
45 struct {
46 	__uint(type, BPF_MAP_TYPE_HASH);
47 	__uint(key_size, sizeof(__u64)); // owner stacktrace
48 	__uint(value_size, sizeof(__s32)); // owner stack id
49 	__uint(max_entries, 1);
50 } owner_stacks SEC(".maps");
51 
52 /* a map from lock address to owner tracing data */
53 struct {
54 	__uint(type, BPF_MAP_TYPE_HASH);
55 	__uint(key_size, sizeof(__u64)); // lock address
56 	__uint(value_size, sizeof(struct owner_tracing_data));
57 	__uint(max_entries, 1);
58 } owner_data SEC(".maps");
59 
60 /* a map from contention_key (storing the owner stack id) to contention data */
61 struct {
62 	__uint(type, BPF_MAP_TYPE_HASH);
63 	__uint(key_size, sizeof(struct contention_key));
64 	__uint(value_size, sizeof(struct contention_data));
65 	__uint(max_entries, 1);
66 } owner_stat SEC(".maps");
67 
68 /* maintain timestamp at the beginning of contention */
69 struct {
70 	__uint(type, BPF_MAP_TYPE_HASH);
71 	__type(key, int);
72 	__type(value, struct tstamp_data);
73 	__uint(max_entries, MAX_ENTRIES);
74 } tstamp SEC(".maps");
75 
76 /* maintain per-CPU timestamp at the beginning of contention */
77 struct {
78 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
79 	__uint(key_size, sizeof(__u32));
80 	__uint(value_size, sizeof(struct tstamp_data));
81 	__uint(max_entries, 1);
82 } tstamp_cpu SEC(".maps");
83 
84 /* actual lock contention statistics */
85 struct {
86 	__uint(type, BPF_MAP_TYPE_HASH);
87 	__uint(key_size, sizeof(struct contention_key));
88 	__uint(value_size, sizeof(struct contention_data));
89 	__uint(max_entries, MAX_ENTRIES);
90 } lock_stat SEC(".maps");
91 
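/* task pid -> task comm, filled on the first contention seen for each task */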
92 struct {
93 	__uint(type, BPF_MAP_TYPE_HASH);
94 	__uint(key_size, sizeof(__u32));
95 	__uint(value_size, sizeof(struct contention_task_data));
96 	__uint(max_entries, MAX_ENTRIES);
97 } task_data SEC(".maps");
98 
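/* lock address -> lock class flag; filled by collect_lock_syms() for rq and zone locks (user space may add more) */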
99 struct {
100 	__uint(type, BPF_MAP_TYPE_HASH);
101 	__uint(key_size, sizeof(__u64));
102 	__uint(value_size, sizeof(__u32));
103 	__uint(max_entries, MAX_ENTRIES);
104 } lock_syms SEC(".maps");
105 
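/*
 * The following filter maps act as allow-lists: can_record() only accepts an
 * event if the corresponding key (CPU, pid, lock type flags, lock address,
 * cgroup id or kmem_cache address) is present in the respective map.
 */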
106 struct {
107 	__uint(type, BPF_MAP_TYPE_HASH);
108 	__uint(key_size, sizeof(__u32));
109 	__uint(value_size, sizeof(__u8));
110 	__uint(max_entries, 1);
111 } cpu_filter SEC(".maps");
112 
113 struct {
114 	__uint(type, BPF_MAP_TYPE_HASH);
115 	__uint(key_size, sizeof(__u32));
116 	__uint(value_size, sizeof(__u8));
117 	__uint(max_entries, 1);
118 } task_filter SEC(".maps");
119 
120 struct {
121 	__uint(type, BPF_MAP_TYPE_HASH);
122 	__uint(key_size, sizeof(__u32));
123 	__uint(value_size, sizeof(__u8));
124 	__uint(max_entries, 1);
125 } type_filter SEC(".maps");
126 
127 struct {
128 	__uint(type, BPF_MAP_TYPE_HASH);
129 	__uint(key_size, sizeof(__u64));
130 	__uint(value_size, sizeof(__u8));
131 	__uint(max_entries, 1);
132 } addr_filter SEC(".maps");
133 
134 struct {
135 	__uint(type, BPF_MAP_TYPE_HASH);
136 	__uint(key_size, sizeof(__u64));
137 	__uint(value_size, sizeof(__u8));
138 	__uint(max_entries, 1);
139 } cgroup_filter SEC(".maps");
140 
141 struct {
142 	__uint(type, BPF_MAP_TYPE_HASH);
143 	__uint(key_size, sizeof(long));
144 	__uint(value_size, sizeof(__u8));
145 	__uint(max_entries, 1);
146 } slab_filter SEC(".maps");
147 
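/* kmem_cache address -> slab cache id and name, filled by slab_cache_iter() */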
148 struct {
149 	__uint(type, BPF_MAP_TYPE_HASH);
150 	__uint(key_size, sizeof(long));
151 	__uint(value_size, sizeof(struct slab_cache_data));
152 	__uint(max_entries, 1);
153 } slab_caches SEC(".maps");
154 
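/* lock address -> artificial delay (in ns) injected by contention_end() when lock_delay is set */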
155 struct {
156 	__uint(type, BPF_MAP_TYPE_HASH);
157 	__uint(key_size, sizeof(__u64));
158 	__uint(value_size, sizeof(__u64));
159 	__uint(max_entries, 1);
160 } lock_delays SEC(".maps");
161 
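/*
 * CO-RE struct flavors: the rw_semaphore owner field and the mm_struct mmap
 * lock changed across kernel versions, so both layouts are declared here and
 * the matching one is picked at load time.
 */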
162 struct rw_semaphore___old {
163 	struct task_struct *owner;
164 } __attribute__((preserve_access_index));
165 
166 struct rw_semaphore___new {
167 	atomic_long_t owner;
168 } __attribute__((preserve_access_index));
169 
170 struct mm_struct___old {
171 	struct rw_semaphore mmap_sem;
172 } __attribute__((preserve_access_index));
173 
174 struct mm_struct___new {
175 	struct rw_semaphore mmap_lock;
176 } __attribute__((preserve_access_index));
177 
178 extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;
179 
180 /* control flags */
181 const volatile int has_cpu;
182 const volatile int has_task;
183 const volatile int has_type;
184 const volatile int has_addr;
185 const volatile int has_cgroup;
186 const volatile int has_slab;
187 const volatile int needs_callstack;
188 const volatile int stack_skip;
189 const volatile int lock_owner;
190 const volatile int use_cgroup_v2;
191 const volatile int max_stack;
192 const volatile int lock_delay;
193 
194 /* determine the key of lock stat */
195 const volatile int aggr_mode;
196 
197 int enabled;
198 
199 int perf_subsys_id = -1;
200 
201 __u64 end_ts;
202 
203 __u32 slab_cache_id;
204 
205 /* error stat */
206 int task_fail;
207 int stack_fail;
208 int time_fail;
209 int data_fail;
210 
211 int task_map_full;
212 int data_map_full;
213 
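/* optional kfuncs: these weak ksyms resolve to NULL on kernels that do not provide them */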
214 struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
215 void bpf_task_release(struct task_struct *p) __ksym __weak;
216 
217 static inline __u64 get_current_cgroup_id(void)
218 {
219 	struct task_struct *task;
220 	struct cgroup *cgrp;
221 
222 	if (use_cgroup_v2)
223 		return bpf_get_current_cgroup_id();
224 
225 	task = bpf_get_current_task_btf();
226 
227 	if (perf_subsys_id == -1) {
228 #if __has_builtin(__builtin_preserve_enum_value)
229 		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
230 						     perf_event_cgrp_id);
231 #else
232 		perf_subsys_id = perf_event_cgrp_id;
233 #endif
234 	}
235 
236 	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
237 	return BPF_CORE_READ(cgrp, kn, id);
238 }
239 
240 static inline int can_record(u64 *ctx)
241 {
242 	if (has_cpu) {
243 		__u32 cpu = bpf_get_smp_processor_id();
244 		__u8 *ok;
245 
246 		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
247 		if (!ok)
248 			return 0;
249 	}
250 
251 	if (has_task) {
252 		__u8 *ok;
253 		__u32 pid = bpf_get_current_pid_tgid();
254 
255 		ok = bpf_map_lookup_elem(&task_filter, &pid);
256 		if (!ok)
257 			return 0;
258 	}
259 
260 	if (has_type) {
261 		__u8 *ok;
262 		__u32 flags = (__u32)ctx[1];
263 
264 		ok = bpf_map_lookup_elem(&type_filter, &flags);
265 		if (!ok)
266 			return 0;
267 	}
268 
269 	if (has_addr) {
270 		__u8 *ok;
271 		__u64 addr = ctx[0];
272 
273 		ok = bpf_map_lookup_elem(&addr_filter, &addr);
274 		if (!ok && !has_slab)
275 			return 0;
276 	}
277 
278 	if (has_cgroup) {
279 		__u8 *ok;
280 		__u64 cgrp = get_current_cgroup_id();
281 
282 		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
283 		if (!ok)
284 			return 0;
285 	}
286 
287 	if (has_slab && bpf_get_kmem_cache) {
288 		__u8 *ok;
289 		__u64 addr = ctx[0];
290 		long kmem_cache_addr;
291 
292 		kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
293 		ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
294 		if (!ok)
295 			return 0;
296 	}
297 
298 	return 1;
299 }
300 
301 static inline int update_task_data(struct task_struct *task)
302 {
303 	struct contention_task_data *p;
304 	int pid, err;
305 
306 	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
307 	if (err)
308 		return -1;
309 
310 	p = bpf_map_lookup_elem(&task_data, &pid);
311 	if (p == NULL && !task_map_full) {
312 		struct contention_task_data data = {};
313 
314 		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
315 		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
316 			task_map_full = 1;
317 	}
318 
319 	return 0;
320 }
321 
322 #ifndef __has_builtin
323 # define __has_builtin(x) 0
324 #endif
325 
326 static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
327 {
328 	struct task_struct *task;
329 	__u64 owner = 0;
330 
331 	if (flags & LCB_F_MUTEX) {
332 		struct mutex *mutex = (void *)lock;
333 		owner = BPF_CORE_READ(mutex, owner.counter);
334 	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
335 	/*
336 	 * Support for the BPF_TYPE_MATCHES argument to the
337 	 * __builtin_preserve_type_info builtin was added at some point during
338 	 * development of clang 15 and it's what is needed for
339 	 * bpf_core_type_matches.
340 	 */
341 #if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
342 		if (bpf_core_type_matches(struct rw_semaphore___old)) {
343 			struct rw_semaphore___old *rwsem = (void *)lock;
344 			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
345 		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
346 			struct rw_semaphore___new *rwsem = (void *)lock;
347 			owner = BPF_CORE_READ(rwsem, owner.counter);
348 		}
349 #else
350 		/* assume new struct */
351 		struct rw_semaphore *rwsem = (void *)lock;
352 		owner = BPF_CORE_READ(rwsem, owner.counter);
353 #endif
354 	}
355 
356 	if (!owner)
357 		return NULL;
358 
359 	task = (void *)(owner & ~7UL);
360 	return task;
361 }
362 
363 static inline __u32 check_lock_type(__u64 lock, __u32 flags)
364 {
365 	struct task_struct *curr;
366 	struct mm_struct___old *mm_old;
367 	struct mm_struct___new *mm_new;
368 	struct sighand_struct *sighand;
369 
370 	switch (flags) {
371 	case LCB_F_READ:  /* rwsem */
372 	case LCB_F_WRITE:
373 		curr = bpf_get_current_task_btf();
374 		if (curr->mm == NULL)
375 			break;
376 		mm_new = (void *)curr->mm;
377 		if (bpf_core_field_exists(mm_new->mmap_lock)) {
378 			if (&mm_new->mmap_lock == (void *)lock)
379 				return LCD_F_MMAP_LOCK;
380 			break;
381 		}
382 		mm_old = (void *)curr->mm;
383 		if (bpf_core_field_exists(mm_old->mmap_sem)) {
384 			if (&mm_old->mmap_sem == (void *)lock)
385 				return LCD_F_MMAP_LOCK;
386 		}
387 		break;
388 	case LCB_F_SPIN:  /* spinlock */
389 		curr = bpf_get_current_task_btf();
390 		sighand = curr->sighand;
391 
392 		if (sighand && &sighand->siglock == (void *)lock)
393 			return LCD_F_SIGHAND_LOCK;
394 		break;
395 	default:
396 		break;
397 	}
398 	return 0;
399 }
400 
401 static inline long delay_callback(__u64 idx, void *arg)
402 {
403 	__u64 target = *(__u64 *)arg;
404 
405 	if (target <= bpf_ktime_get_ns())
406 		return 1;
407 
408 	/* just to kill time */
409 	(void)bpf_get_prandom_u32();
410 
411 	return 0;
412 }
413 
414 static inline void do_lock_delay(__u64 duration)
415 {
416 	__u64 target = bpf_ktime_get_ns() + duration;
417 
418 	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
419 }
420 
421 static inline void check_lock_delay(__u64 lock)
422 {
423 	__u64 *delay;
424 
425 	delay = bpf_map_lookup_elem(&lock_delays, &lock);
426 	if (delay)
427 		do_lock_delay(*delay);
428 }
429 
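/* find (or create) the tstamp slot where contention_begin() records the start time */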
430 static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
431 {
432 	__u32 pid;
433 	struct tstamp_data *pelem;
434 
435 	/* Use per-cpu array map for spinlock and rwlock */
436 	if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
437 		__u32 idx = 0;
438 
439 		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
440 		/* Do not update the element for nested locks */
441 		if (pelem && pelem->lock)
442 			pelem = NULL;
443 		return pelem;
444 	}
445 
446 	pid = bpf_get_current_pid_tgid();
447 	pelem = bpf_map_lookup_elem(&tstamp, &pid);
448 	/* Do not update the element for nested locks */
449 	if (pelem && pelem->lock)
450 		return NULL;
451 
452 	if (pelem == NULL) {
453 		struct tstamp_data zero = {};
454 
455 		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
456 			__sync_fetch_and_add(&task_fail, 1);
457 			return NULL;
458 		}
459 
460 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
461 		if (pelem == NULL) {
462 			__sync_fetch_and_add(&task_fail, 1);
463 			return NULL;
464 		}
465 	}
466 	return pelem;
467 }
468 
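/* translate an owner stack trace to a small integer id, allocating a new id on first use */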
469 static inline s32 get_owner_stack_id(u64 *stacktrace)
470 {
471 	s32 *id, new_id;
472 	static s64 id_gen = 1;
473 
474 	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
475 	if (id)
476 		return *id;
477 
478 	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);
479 
480 	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);
481 
482 	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
483 	if (id)
484 		return *id;
485 
486 	return -1;
487 }
488 
489 static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
490 {
491 	__sync_fetch_and_add(&data->total_time, duration);
492 	__sync_fetch_and_add(&data->count, count);
493 
494 	/* FIXME: need atomic operations */
495 	if (data->max_time < duration)
496 		data->max_time = duration;
497 	if (data->min_time > duration)
498 		data->min_time = duration;
499 }
500 
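/* accumulate contention time attributed to the given owner stack id into owner_stat */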
501 static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
502 {
503 	struct contention_key key = {
504 		.stack_id = id,
505 		.pid = 0,
506 		.lock_addr_or_cgroup = 0,
507 	};
508 	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);
509 
510 	if (!data) {
511 		struct contention_data first = {
512 			.total_time = duration,
513 			.max_time = duration,
514 			.min_time = duration,
515 			.count = 1,
516 			.flags = flags,
517 		};
518 		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
519 	} else {
520 		update_contention_data(data, duration, 1);
521 	}
522 }
523 
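/*
 * Entry point at the contention_begin tracepoint: record the start timestamp
 * and flags, and optionally the callstack and the current lock owner.
 */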
524 SEC("tp_btf/contention_begin")
525 int contention_begin(u64 *ctx)
526 {
527 	struct tstamp_data *pelem;
528 
529 	if (!enabled || !can_record(ctx))
530 		return 0;
531 
532 	pelem = get_tstamp_elem(ctx[1]);
533 	if (pelem == NULL)
534 		return 0;
535 
536 	pelem->timestamp = bpf_ktime_get_ns();
537 	pelem->lock = (__u64)ctx[0];
538 	pelem->flags = (__u32)ctx[1];
539 
540 	if (needs_callstack) {
541 		u32 i = 0;
542 		u32 id = 0;
543 		int owner_pid;
544 		u64 *buf;
545 		struct task_struct *task;
546 		struct owner_tracing_data *otdata;
547 
548 		if (!lock_owner)
549 			goto skip_owner;
550 
551 		task = get_lock_owner(pelem->lock, pelem->flags);
552 		if (!task)
553 			goto skip_owner;
554 
555 		owner_pid = BPF_CORE_READ(task, pid);
556 
557 		buf = bpf_map_lookup_elem(&stack_buf, &i);
558 		if (!buf)
559 			goto skip_owner;
560 		for (i = 0; i < max_stack; i++)
561 			buf[i] = 0x0;
562 
563 		if (!bpf_task_from_pid)
564 			goto skip_owner;
565 
566 		task = bpf_task_from_pid(owner_pid);
567 		if (!task)
568 			goto skip_owner;
569 
570 		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
571 		bpf_task_release(task);
572 
573 		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
574 		id = get_owner_stack_id(buf);
575 
576 		/*
577 		 * Contention has just started, or (corner case) `lock` is owned by a
578 		 * process other than `owner_pid`.  Treat the corner case as an
579 		 * unexpected internal error and ignore the previous tracing record.
580 		 */
581 		if (!otdata || otdata->pid != owner_pid) {
582 			struct owner_tracing_data first = {
583 				.pid = owner_pid,
584 				.timestamp = pelem->timestamp,
585 				.count = 1,
586 				.stack_id = id,
587 			};
588 			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
589 		}
590 		/* Contention is ongoing and a new waiter joins */
591 		else {
592 			__sync_fetch_and_add(&otdata->count, 1);
593 
594 			/*
595 			 * The owner is the same, but stacktrace might be changed. In this case we
596 			 * store/update `owner_stat` based on current owner stack id.
597 			 */
598 			if (id != otdata->stack_id) {
599 				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
600 						  pelem->flags);
601 
602 				otdata->timestamp = pelem->timestamp;
603 				otdata->stack_id = id;
604 			}
605 		}
606 skip_owner:
607 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
608 						  BPF_F_FAST_STACK_CMP | stack_skip);
609 		if (pelem->stack_id < 0)
610 			__sync_fetch_and_add(&stack_fail, 1);
611 	} else if (aggr_mode == LOCK_AGGR_TASK) {
612 		struct task_struct *task;
613 
614 		if (lock_owner) {
615 			task = get_lock_owner(pelem->lock, pelem->flags);
616 
617 			/* The flags field is not used anymore.  Pass the owner pid instead. */
618 			if (task)
619 				pelem->flags = BPF_CORE_READ(task, pid);
620 			else
621 				pelem->flags = -1U;
622 
623 		} else {
624 			task = bpf_get_current_task_btf();
625 		}
626 
627 		if (task) {
628 			if (update_task_data(task) < 0 && lock_owner)
629 				pelem->flags = -1U;
630 		}
631 	}
632 
633 	return 0;
634 }
635 
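/*
 * Entry point at the contention_end tracepoint: compute the wait duration,
 * update the per-key statistics in lock_stat and the owner bookkeeping, then
 * clear the timestamp entry.
 */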
636 SEC("tp_btf/contention_end")
637 int contention_end(u64 *ctx)
638 {
639 	__u32 pid = 0, idx = 0;
640 	struct tstamp_data *pelem;
641 	struct contention_key key = {};
642 	struct contention_data *data;
643 	__u64 timestamp;
644 	__u64 duration;
645 	bool need_delete = false;
646 
647 	if (!enabled)
648 		return 0;
649 
650 	/*
651 	 * For spinlocks and rwlocks, the timestamp is kept in the per-cpu
652 	 * map.  However, contention_end does not receive the flags, so it
653 	 * cannot tell whether to read the per-cpu map or the hash map.
654 	 *
655 	 * Try the per-cpu map first and check whether there is an active
656 	 * contention.  If there is, do not read the hash map: a task cannot
657 	 * block on sleeping locks before releasing its spinning locks.
658 	 */
659 	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
660 	if (pelem && pelem->lock) {
661 		if (pelem->lock != ctx[0])
662 			return 0;
663 	} else {
664 		pid = bpf_get_current_pid_tgid();
665 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
666 		if (!pelem || pelem->lock != ctx[0])
667 			return 0;
668 		need_delete = true;
669 	}
670 
671 	timestamp = bpf_ktime_get_ns();
672 	duration = timestamp - pelem->timestamp;
673 	if ((__s64)duration < 0) {
674 		__sync_fetch_and_add(&time_fail, 1);
675 		goto out;
676 	}
677 
678 	if (needs_callstack && lock_owner) {
679 		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
680 
681 		if (!otdata)
682 			goto skip_owner;
683 
684 		/* Update `owner_stat` */
685 		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);
686 
687 		/* No contention is occurring, delete the `lock` entry from `owner_data` */
688 		if (otdata->count <= 1)
689 			bpf_map_delete_elem(&owner_data, &pelem->lock);
690 		/*
691 		 * Contention is still ongoing, with a new owner (current task). `owner_data`
692 		 * should be updated accordingly.
693 		 */
694 		else {
695 			u32 i = 0;
696 			s32 ret = (s32)ctx[1];
697 			u64 *buf;
698 
699 			otdata->timestamp = timestamp;
700 			__sync_fetch_and_add(&otdata->count, -1);
701 
702 			buf = bpf_map_lookup_elem(&stack_buf, &i);
703 			if (!buf)
704 				goto skip_owner;
705 			for (i = 0; i < (u32)max_stack; i++)
706 				buf[i] = 0x0;
707 
708 			/*
709 			 * `ret` holds the return code of the lock function.
710 			 * If `ret` is negative, the current task gave up waiting without
711 			 * acquiring the lock.  The owner is unchanged, but we still need
712 			 * to update the owner stack.
713 			 */
714 			if (ret < 0) {
715 				s32 id = 0;
716 				struct task_struct *task;
717 
718 				if (!bpf_task_from_pid)
719 					goto skip_owner;
720 
721 				task = bpf_task_from_pid(otdata->pid);
722 				if (!task)
723 					goto skip_owner;
724 
725 				bpf_get_task_stack(task, buf,
726 						   max_stack * sizeof(unsigned long), 0);
727 				bpf_task_release(task);
728 
729 				id = get_owner_stack_id(buf);
730 
731 				/*
732 				 * If owner stack is changed, update owner stack id for this lock.
733 				 */
734 				if (id != otdata->stack_id)
735 					otdata->stack_id = id;
736 			}
737 			/*
738 			 * Otherwise, update tracing data with the current task, which is the new
739 			 * owner.
740 			 */
741 			else {
742 				otdata->pid = pid;
743 				/*
744 				 * We don't want to retrieve the callstack here, since this is the
745 				 * point where the current task acquires the lock and it provides
746 				 * no additional information.  Simply assign -1 to invalidate it.
747 				 */
748 				otdata->stack_id = -1;
749 			}
750 		}
751 	}
752 skip_owner:
753 	switch (aggr_mode) {
754 	case LOCK_AGGR_CALLER:
755 		key.stack_id = pelem->stack_id;
756 		break;
757 	case LOCK_AGGR_TASK:
758 		if (lock_owner)
759 			key.pid = pelem->flags;
760 		else {
761 			if (!need_delete)
762 				pid = bpf_get_current_pid_tgid();
763 			key.pid = pid;
764 		}
765 		if (needs_callstack)
766 			key.stack_id = pelem->stack_id;
767 		break;
768 	case LOCK_AGGR_ADDR:
769 		key.lock_addr_or_cgroup = pelem->lock;
770 		if (needs_callstack)
771 			key.stack_id = pelem->stack_id;
772 		break;
773 	case LOCK_AGGR_CGROUP:
774 		key.lock_addr_or_cgroup = get_current_cgroup_id();
775 		break;
776 	default:
777 		/* should not happen */
778 		return 0;
779 	}
780 
781 	data = bpf_map_lookup_elem(&lock_stat, &key);
782 	if (!data) {
783 		if (data_map_full) {
784 			__sync_fetch_and_add(&data_fail, 1);
785 			goto out;
786 		}
787 
788 		struct contention_data first = {
789 			.total_time = duration,
790 			.max_time = duration,
791 			.min_time = duration,
792 			.count = 1,
793 			.flags = pelem->flags,
794 		};
795 		int err;
796 
797 		if (aggr_mode == LOCK_AGGR_ADDR) {
798 			first.flags |= check_lock_type(pelem->lock,
799 						       pelem->flags & LCB_F_TYPE_MASK);
800 
801 			/* Check if it's from a slab object */
802 			if (bpf_get_kmem_cache) {
803 				struct kmem_cache *s;
804 				struct slab_cache_data *d;
805 
806 				s = bpf_get_kmem_cache(pelem->lock);
807 				if (s != NULL) {
808 					/*
809 					 * Save the ID of the slab cache in the flags
810 					 * (instead of the full address) to save space
811 					 * in the contention_data.
812 					 */
813 					d = bpf_map_lookup_elem(&slab_caches, &s);
814 					if (d != NULL)
815 						first.flags |= d->id;
816 				}
817 			}
818 		}
819 
820 		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
821 		if (err < 0) {
822 			if (err == -EEXIST) {
823 				/* it lost the race, try to get it again */
824 				data = bpf_map_lookup_elem(&lock_stat, &key);
825 				if (data != NULL)
826 					goto found;
827 			}
828 			if (err == -E2BIG)
829 				data_map_full = 1;
830 			__sync_fetch_and_add(&data_fail, 1);
831 		}
832 		goto out;
833 	}
834 
835 found:
836 	update_contention_data(data, duration, 1);
837 
838 out:
839 	if (lock_delay)
840 		check_lock_delay(pelem->lock);
841 
842 	pelem->lock = 0;
843 	if (need_delete)
844 		bpf_map_delete_elem(&tstamp, &pid);
845 	return 0;
846 }
847 
848 extern struct rq runqueues __ksym;
849 
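/* filled in by the loader: pglist_data/zone layout used by collect_zone_lock() */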
850 const volatile __u64 contig_page_data_addr;
851 const volatile __u64 node_data_addr;
852 const volatile int nr_nodes;
853 const volatile int sizeof_zone;
854 
855 struct rq___old {
856 	raw_spinlock_t lock;
857 } __attribute__((preserve_access_index));
858 
859 struct rq___new {
860 	raw_spinlock_t __lock;
861 } __attribute__((preserve_access_index));
862 
863 static void collect_zone_lock(void)
864 {
865 	__u64 nr_zones, zone_off;
866 	__u64 lock_addr, lock_off;
867 	__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
868 
869 	zone_off = offsetof(struct pglist_data, node_zones);
870 	lock_off = offsetof(struct zone, lock);
871 
872 	if (contig_page_data_addr) {
873 		struct pglist_data *contig_page_data;
874 
875 		contig_page_data = (void *)(long)contig_page_data_addr;
876 		nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
877 
878 		for (int i = 0; i < MAX_ZONES; i++) {
879 			__u64 zone_addr;
880 
881 			if (i >= nr_zones)
882 				break;
883 
884 			zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
885 			lock_addr = zone_addr + lock_off;
886 
887 			bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
888 		}
889 	} else if (nr_nodes > 0) {
890 		struct pglist_data **node_data = (void *)(long)node_data_addr;
891 
892 		for (int i = 0; i < nr_nodes; i++) {
893 			struct pglist_data *pgdat = NULL;
894 			int err;
895 
896 			err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
897 			if (err < 0 || pgdat == NULL)
898 				break;
899 
900 			nr_zones = BPF_CORE_READ(pgdat, nr_zones);
901 			for (int k = 0; k < MAX_ZONES; k++) {
902 				__u64 zone_addr;
903 
904 				if (k >= nr_zones)
905 					break;
906 
907 				zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
908 				lock_addr = zone_addr + lock_off;
909 
910 				bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
911 			}
912 		}
913 	}
914 }
915 
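/*
 * Attached to the bpf_test_finish raw tracepoint so it can be triggered on
 * demand from user space; records well-known lock addresses (rq locks, zone
 * locks) in lock_syms.
 */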
916 SEC("raw_tp/bpf_test_finish")
917 int BPF_PROG(collect_lock_syms)
918 {
919 	__u64 lock_addr, lock_off;
920 	__u32 lock_flag;
921 
922 	if (bpf_core_field_exists(struct rq___new, __lock))
923 		lock_off = offsetof(struct rq___new, __lock);
924 	else
925 		lock_off = offsetof(struct rq___old, lock);
926 
927 	for (int i = 0; i < MAX_CPUS; i++) {
928 		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
929 
930 		if (rq == NULL)
931 			break;
932 
933 		lock_addr = (__u64)(void *)rq + lock_off;
934 		lock_flag = LOCK_CLASS_RQLOCK;
935 		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
936 	}
937 
938 	collect_zone_lock();
939 
940 	return 0;
941 }
942 
943 SEC("raw_tp/bpf_test_finish")
944 int BPF_PROG(end_timestamp)
945 {
946 	end_ts = bpf_ktime_get_ns();
947 	return 0;
948 }
949 
950 /*
951  * bpf_iter__kmem_cache was added recently, so old kernels don't have it in
952  * vmlinux.h.  But we cannot define it here either, since that would cause a
953  * compiler error due to struct redefinition on newer kernels.
954  *
955  * So use a CO-RE trick to access the member only if the type exists.
956  * This supports both old and new kernels without compiler errors.
957  */
958 struct bpf_iter__kmem_cache___new {
959 	struct kmem_cache *s;
960 } __attribute__((preserve_access_index));
961 
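/* kmem_cache iterator: assign each slab cache an id and record its name in slab_caches */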
962 SEC("iter/kmem_cache")
963 int slab_cache_iter(void *ctx)
964 {
965 	struct kmem_cache *s = NULL;
966 	struct slab_cache_data d;
967 	const char *nameptr;
968 
969 	if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
970 		struct bpf_iter__kmem_cache___new *iter = ctx;
971 
972 		s = iter->s;
973 	}
974 
975 	if (s == NULL)
976 		return 0;
977 
978 	nameptr = s->name;
979 	bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);
980 
981 	d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
982 	if (d.id >= LCB_F_SLAB_ID_END)
983 		return 0;
984 
985 	bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
986 	return 0;
987 }
988 
989 char LICENSE[] SEC("license") = "Dual BSD/GPL";
990