xref: /linux/tools/perf/util/bpf_skel/lock_contention.bpf.c (revision 824b18f607d82503a956d3e00f9e9c0b24efcbca)
1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 // Copyright (c) 2022 Google
3 #include "vmlinux.h"
4 #include <bpf/bpf_helpers.h>
5 #include <bpf/bpf_tracing.h>
6 #include <bpf/bpf_core_read.h>
7 #include <asm-generic/errno-base.h>
8 
9 #include "lock_data.h"
10 
11 /* for collect_lock_syms().  4096 was rejected by the verifier */
12 #define MAX_CPUS  1024
13 
14 /* for collect_zone_lock().  It should be more than the actual zones. */
15 #define MAX_ZONES  10
16 
17 /* for do_lock_delay().  Arbitrarily set to 1 million. */
18 #define MAX_LOOP  (1U << 20)
19 
20 /* lock contention flags from include/trace/events/lock.h */
21 #define LCB_F_SPIN	(1U << 0)
22 #define LCB_F_READ	(1U << 1)
23 #define LCB_F_WRITE	(1U << 2)
24 #define LCB_F_RT	(1U << 3)
25 #define LCB_F_PERCPU	(1U << 4)
26 #define LCB_F_MUTEX	(1U << 5)
27 
28 /* callstack storage  */
29 struct {
30 	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
31 	__uint(key_size, sizeof(__u32));
32 	__uint(value_size, sizeof(__u64));
33 	__uint(max_entries, MAX_ENTRIES);
34 } stacks SEC(".maps");
35 
36 /* buffer for owner stacktrace */
37 struct {
38 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
39 	__uint(key_size, sizeof(__u32));
40 	__uint(value_size, sizeof(__u64));
41 	__uint(max_entries, 1);
42 } stack_buf SEC(".maps");
43 
44 /* a map for tracing owner stacktrace to owner stack id */
45 struct {
46 	__uint(type, BPF_MAP_TYPE_HASH);
47 	__uint(key_size, sizeof(__u64)); // owner stacktrace
48 	__uint(value_size, sizeof(__s32)); // owner stack id
49 	__uint(max_entries, 1);
50 } owner_stacks SEC(".maps");
51 
52 /* a map for tracing lock address to owner data */
53 struct {
54 	__uint(type, BPF_MAP_TYPE_HASH);
55 	__uint(key_size, sizeof(__u64)); // lock address
56 	__uint(value_size, sizeof(struct owner_tracing_data));
57 	__uint(max_entries, 1);
58 } owner_data SEC(".maps");
59 
60 /* a map for contention_key (stores owner stack id) to contention data */
61 struct {
62 	__uint(type, BPF_MAP_TYPE_HASH);
63 	__uint(key_size, sizeof(struct contention_key));
64 	__uint(value_size, sizeof(struct contention_data));
65 	__uint(max_entries, 1);
66 } owner_stat SEC(".maps");
67 
68 /* maintain timestamp at the beginning of contention */
69 struct {
70 	__uint(type, BPF_MAP_TYPE_HASH);
71 	__type(key, int);
72 	__type(value, struct tstamp_data);
73 	__uint(max_entries, MAX_ENTRIES);
74 } tstamp SEC(".maps");
75 
76 /* maintain per-CPU timestamp at the beginning of contention */
77 struct {
78 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
79 	__uint(key_size, sizeof(__u32));
80 	__uint(value_size, sizeof(struct tstamp_data));
81 	__uint(max_entries, 1);
82 } tstamp_cpu SEC(".maps");
83 
84 /* actual lock contention statistics */
85 struct {
86 	__uint(type, BPF_MAP_TYPE_HASH);
87 	__uint(key_size, sizeof(struct contention_key));
88 	__uint(value_size, sizeof(struct contention_data));
89 	__uint(max_entries, MAX_ENTRIES);
90 } lock_stat SEC(".maps");
91 
92 struct {
93 	__uint(type, BPF_MAP_TYPE_HASH);
94 	__uint(key_size, sizeof(__u32));
95 	__uint(value_size, sizeof(struct contention_task_data));
96 	__uint(max_entries, MAX_ENTRIES);
97 } task_data SEC(".maps");
98 
99 struct {
100 	__uint(type, BPF_MAP_TYPE_HASH);
101 	__uint(key_size, sizeof(__u64));
102 	__uint(value_size, sizeof(__u32));
103 	__uint(max_entries, MAX_ENTRIES);
104 } lock_syms SEC(".maps");
105 
106 struct {
107 	__uint(type, BPF_MAP_TYPE_HASH);
108 	__uint(key_size, sizeof(__u32));
109 	__uint(value_size, sizeof(__u8));
110 	__uint(max_entries, 1);
111 } cpu_filter SEC(".maps");
112 
113 struct {
114 	__uint(type, BPF_MAP_TYPE_HASH);
115 	__uint(key_size, sizeof(__u32));
116 	__uint(value_size, sizeof(__u8));
117 	__uint(max_entries, 1);
118 } task_filter SEC(".maps");
119 
120 struct {
121 	__uint(type, BPF_MAP_TYPE_HASH);
122 	__uint(key_size, sizeof(__u32));
123 	__uint(value_size, sizeof(__u8));
124 	__uint(max_entries, 1);
125 } type_filter SEC(".maps");
126 
127 struct {
128 	__uint(type, BPF_MAP_TYPE_HASH);
129 	__uint(key_size, sizeof(__u64));
130 	__uint(value_size, sizeof(__u8));
131 	__uint(max_entries, 1);
132 } addr_filter SEC(".maps");
133 
134 struct {
135 	__uint(type, BPF_MAP_TYPE_HASH);
136 	__uint(key_size, sizeof(__u64));
137 	__uint(value_size, sizeof(__u8));
138 	__uint(max_entries, 1);
139 } cgroup_filter SEC(".maps");
140 
141 struct {
142 	__uint(type, BPF_MAP_TYPE_HASH);
143 	__uint(key_size, sizeof(long));
144 	__uint(value_size, sizeof(__u8));
145 	__uint(max_entries, 1);
146 } slab_filter SEC(".maps");
147 
148 struct {
149 	__uint(type, BPF_MAP_TYPE_HASH);
150 	__uint(key_size, sizeof(long));
151 	__uint(value_size, sizeof(struct slab_cache_data));
152 	__uint(max_entries, 1);
153 } slab_caches SEC(".maps");
154 
155 struct {
156 	__uint(type, BPF_MAP_TYPE_HASH);
157 	__uint(key_size, sizeof(__u64));
158 	__uint(value_size, sizeof(__u64));
159 	__uint(max_entries, 1);
160 } lock_delays SEC(".maps");
161 
162 struct rw_semaphore___old {
163 	struct task_struct *owner;
164 } __attribute__((preserve_access_index));
165 
166 struct rw_semaphore___new {
167 	atomic_long_t owner;
168 } __attribute__((preserve_access_index));
169 
170 struct mm_struct___old {
171 	struct rw_semaphore mmap_sem;
172 } __attribute__((preserve_access_index));
173 
174 struct mm_struct___new {
175 	struct rw_semaphore mmap_lock;
176 } __attribute__((preserve_access_index));
177 
178 struct cas_ctx {
179 	struct contention_data *data;
180 	u64 duration;
181 	int max_done;
182 	int min_done;
183 };
184 
185 extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;
186 
187 /* control flags */
188 const volatile int has_cpu;
189 const volatile int has_task;
190 const volatile int has_type;
191 const volatile int has_addr;
192 const volatile int has_cgroup;
193 const volatile int has_slab;
194 const volatile int has_mmap_lock;
195 const volatile int needs_callstack;
196 const volatile int stack_skip;
197 const volatile int lock_owner;
198 const volatile int use_cgroup_v2;
199 const volatile int max_stack;
200 const volatile int lock_delay;
201 
202 /* determine the key of lock stat */
203 const volatile int aggr_mode;
204 
205 int enabled;
206 
207 int perf_subsys_id = -1;
208 
209 __u64 end_ts;
210 
211 __u32 slab_cache_id;
212 
213 /* error stat */
214 int task_fail;
215 int stack_fail;
216 int time_fail;
217 int data_fail;
218 
219 int task_map_full;
220 int data_map_full;
221 
222 struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
223 void bpf_task_release(struct task_struct *p) __ksym __weak;
224 
225 static inline __u32 check_lock_type(__u64 lock, __u32 flags);
226 
227 static inline __u64 get_current_cgroup_id(void)
228 {
229 	struct task_struct *task;
230 	struct cgroup *cgrp;
231 
232 	if (use_cgroup_v2)
233 		return bpf_get_current_cgroup_id();
234 
235 	task = bpf_get_current_task_btf();
236 
237 	if (perf_subsys_id == -1) {
238 #if __has_builtin(__builtin_preserve_enum_value)
239 		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
240 						     perf_event_cgrp_id);
241 #else
242 		perf_subsys_id = perf_event_cgrp_id;
243 #endif
244 	}
245 
246 	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
247 	return BPF_CORE_READ(cgrp, kn, id);
248 }
249 
250 static inline int can_record(u64 *ctx)
251 {
252 	bool is_addr_ok = false;
253 
254 	if (has_cpu) {
255 		__u32 cpu = bpf_get_smp_processor_id();
256 		__u8 *ok;
257 
258 		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
259 		if (!ok)
260 			return 0;
261 	}
262 
263 	if (has_task) {
264 		__u8 *ok;
265 		__u32 pid = bpf_get_current_pid_tgid();
266 
267 		ok = bpf_map_lookup_elem(&task_filter, &pid);
268 		if (!ok)
269 			return 0;
270 	}
271 
272 	if (has_type) {
273 		__u8 *ok;
274 		__u32 flags = (__u32)ctx[1];
275 
276 		ok = bpf_map_lookup_elem(&type_filter, &flags);
277 		if (!ok)
278 			return 0;
279 	}
280 
281 	if (has_addr) {
282 		__u8 *ok;
283 		__u64 addr = ctx[0];
284 
285 		ok = bpf_map_lookup_elem(&addr_filter, &addr);
286 		if (!ok && !has_slab && !has_mmap_lock)
287 			return 0;
288 
289 		is_addr_ok = !!ok;
290 	}
291 
292 	if (has_cgroup) {
293 		__u8 *ok;
294 		__u64 cgrp = get_current_cgroup_id();
295 
296 		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
297 		if (!ok)
298 			return 0;
299 	}
300 
301 	if (is_addr_ok)
302 		return 1;
303 
304 	/* slab and mmap_lock are part of the addr_filter */
305 	if (has_slab && bpf_get_kmem_cache) {
306 		__u8 *ok;
307 		__u64 addr = ctx[0];
308 		long kmem_cache_addr;
309 
310 		kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
311 		ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
312 		if (ok)
313 			return 1;
314 		else if (!has_mmap_lock)
315 			return 0;
316 	}
317 
318 	if (has_mmap_lock) {
319 		__u64 lock = ctx[0];
320 		__u32 flag = ctx[1];
321 
322 		if (check_lock_type(lock, flag) != LCD_F_MMAP_LOCK)
323 			return 0;
324 	}
325 
326 	return 1;
327 }
328 
329 static inline int update_task_data(struct task_struct *task)
330 {
331 	struct contention_task_data *p;
332 	int pid, err;
333 
334 	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
335 	if (err)
336 		return -1;
337 
338 	p = bpf_map_lookup_elem(&task_data, &pid);
339 	if (p == NULL && !task_map_full) {
340 		struct contention_task_data data = {};
341 
342 		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
343 		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
344 			task_map_full = 1;
345 	}
346 
347 	return 0;
348 }
349 
350 #ifndef __has_builtin
351 # define __has_builtin(x) 0
352 #endif
353 
354 static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
355 {
356 	struct task_struct *task;
357 	__u64 owner = 0;
358 
359 	if (flags & LCB_F_MUTEX) {
360 		struct mutex *mutex = (void *)lock;
361 		owner = BPF_CORE_READ(mutex, owner.counter);
362 	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
363 	/*
364 	 * Support for the BPF_TYPE_MATCHES argument to the
365 	 * __builtin_preserve_type_info builtin was added at some point during
366 	 * development of clang 15 and it's what is needed for
367 	 * bpf_core_type_matches.
368 	 */
369 #if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
370 		if (bpf_core_type_matches(struct rw_semaphore___old)) {
371 			struct rw_semaphore___old *rwsem = (void *)lock;
372 			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
373 		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
374 			struct rw_semaphore___new *rwsem = (void *)lock;
375 			owner = BPF_CORE_READ(rwsem, owner.counter);
376 		}
377 #else
378 		/* assume new struct */
379 		struct rw_semaphore *rwsem = (void *)lock;
380 		owner = BPF_CORE_READ(rwsem, owner.counter);
381 #endif
382 	}
383 
384 	if (!owner)
385 		return NULL;
386 
387 	task = (void *)(owner & ~7UL);
388 	return task;
389 }
390 
391 static inline __u32 check_lock_type(__u64 lock, __u32 flags)
392 {
393 	struct task_struct *curr;
394 	struct mm_struct___old *mm_old;
395 	struct mm_struct___new *mm_new;
396 	struct sighand_struct *sighand;
397 
398 	switch (flags) {
399 	case LCB_F_READ:  /* rwsem */
400 	case LCB_F_WRITE:
401 		curr = bpf_get_current_task_btf();
402 		if (curr->mm == NULL)
403 			break;
404 		mm_new = (void *)curr->mm;
405 		if (bpf_core_field_exists(mm_new->mmap_lock)) {
406 			if (&mm_new->mmap_lock == (void *)lock)
407 				return LCD_F_MMAP_LOCK;
408 			break;
409 		}
410 		mm_old = (void *)curr->mm;
411 		if (bpf_core_field_exists(mm_old->mmap_sem)) {
412 			if (&mm_old->mmap_sem == (void *)lock)
413 				return LCD_F_MMAP_LOCK;
414 		}
415 		break;
416 	case LCB_F_SPIN:  /* spinlock */
417 		curr = bpf_get_current_task_btf();
418 		sighand = curr->sighand;
419 
420 		if (sighand && &sighand->siglock == (void *)lock)
421 			return LCD_F_SIGHAND_LOCK;
422 		break;
423 	default:
424 		break;
425 	}
426 	return 0;
427 }
428 
429 static inline long delay_callback(__u64 idx, void *arg)
430 {
431 	__u64 target = *(__u64 *)arg;
432 
433 	if (target <= bpf_ktime_get_ns())
434 		return 1;
435 
436 	/* just to kill time */
437 	(void)bpf_get_prandom_u32();
438 
439 	return 0;
440 }
441 
442 static inline void do_lock_delay(__u64 duration)
443 {
444 	__u64 target = bpf_ktime_get_ns() + duration;
445 
446 	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
447 }
448 
449 static inline void check_lock_delay(__u64 lock)
450 {
451 	__u64 *delay;
452 
453 	delay = bpf_map_lookup_elem(&lock_delays, &lock);
454 	if (delay)
455 		do_lock_delay(*delay);
456 }
457 
458 static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
459 {
460 	__u32 pid;
461 	struct tstamp_data *pelem;
462 
463 	/* Use per-cpu array map for spinlock and rwlock */
464 	if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
465 		__u32 idx = 0;
466 
467 		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
468 		/* Do not update the element for nested locks */
469 		if (pelem && pelem->lock)
470 			pelem = NULL;
471 		return pelem;
472 	}
473 
474 	pid = bpf_get_current_pid_tgid();
475 	pelem = bpf_map_lookup_elem(&tstamp, &pid);
476 	/* Do not update the element for nested locks */
477 	if (pelem && pelem->lock)
478 		return NULL;
479 
480 	if (pelem == NULL) {
481 		struct tstamp_data zero = {};
482 
483 		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
484 			__sync_fetch_and_add(&task_fail, 1);
485 			return NULL;
486 		}
487 
488 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
489 		if (pelem == NULL) {
490 			__sync_fetch_and_add(&task_fail, 1);
491 			return NULL;
492 		}
493 	}
494 	return pelem;
495 }
496 
497 static inline s32 get_owner_stack_id(u64 *stacktrace)
498 {
499 	s32 *id, new_id;
500 	static s64 id_gen = 1;
501 
502 	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
503 	if (id)
504 		return *id;
505 
506 	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);
507 
508 	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);
509 
510 	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
511 	if (id)
512 		return *id;
513 
514 	return -1;
515 }
516 
517 static long cas_min_max_cb(u64 idx, void *arg)
518 {
519 	struct cas_ctx *ctx = arg;
520 
521 	if (!ctx->max_done) {
522 		u64 old_max = ctx->data->max_time;
523 		if (old_max >= ctx->duration) {
524 			ctx->max_done = 1;
525 		} else {
526 			u64 r = __sync_val_compare_and_swap(
527 				&ctx->data->max_time, old_max, ctx->duration);
528 			if (r == old_max)
529 				ctx->max_done = 1;
530 		}
531 	}
532 
533 	if (!ctx->min_done) {
534 		u64 old_min = ctx->data->min_time;
535 		if (old_min <= ctx->duration) {
536 			ctx->min_done = 1;
537 		} else {
538 			u64 r = __sync_val_compare_and_swap(
539 				&ctx->data->min_time, old_min, ctx->duration);
540 			if (r == old_min)
541 				ctx->min_done = 1;
542 		}
543 	}
544 
545 	return (ctx->max_done && ctx->min_done) ? 1 : 0;
546 }
547 
548 static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
549 {
550 	__sync_fetch_and_add(&data->total_time, duration);
551 	__sync_fetch_and_add(&data->count, count);
552 
553 	struct cas_ctx ctx = {
554 		.data     = data,
555 		.duration = duration,
556 		.max_done = 0,
557 		.min_done = 0,
558 	};
559 	bpf_loop(64, cas_min_max_cb, &ctx, 0);
560 }
561 
562 static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
563 {
564 	struct contention_key key = {
565 		.stack_id = id,
566 		.pid = 0,
567 		.lock_addr_or_cgroup = 0,
568 	};
569 	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);
570 
571 	if (!data) {
572 		struct contention_data first = {
573 			.total_time = duration,
574 			.max_time = duration,
575 			.min_time = duration,
576 			.count = 1,
577 			.flags = flags,
578 		};
579 		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
580 	} else {
581 		update_contention_data(data, duration, 1);
582 	}
583 }
584 
585 SEC("tp_btf/contention_begin")
586 int contention_begin(u64 *ctx)
587 {
588 	struct tstamp_data *pelem;
589 
590 	if (!enabled || !can_record(ctx))
591 		return 0;
592 
593 	pelem = get_tstamp_elem(ctx[1]);
594 	if (pelem == NULL)
595 		return 0;
596 
597 	pelem->timestamp = bpf_ktime_get_ns();
598 	pelem->lock = (__u64)ctx[0];
599 	pelem->flags = (__u32)ctx[1];
600 	if (aggr_mode == LOCK_AGGR_CGROUP)
601 		pelem->cgroup_id = get_current_cgroup_id();
602 
603 	if (needs_callstack) {
604 		u32 i = 0;
605 		u32 id = 0;
606 		int owner_pid;
607 		u64 *buf;
608 		struct task_struct *task;
609 		struct owner_tracing_data *otdata;
610 
611 		if (!lock_owner)
612 			goto skip_owner;
613 
614 		task = get_lock_owner(pelem->lock, pelem->flags);
615 		if (!task)
616 			goto skip_owner;
617 
618 		owner_pid = BPF_CORE_READ(task, pid);
619 
620 		buf = bpf_map_lookup_elem(&stack_buf, &i);
621 		if (!buf)
622 			goto skip_owner;
623 		for (i = 0; i < max_stack; i++)
624 			buf[i] = 0x0;
625 
626 		if (!bpf_task_from_pid)
627 			goto skip_owner;
628 
629 		task = bpf_task_from_pid(owner_pid);
630 		if (!task)
631 			goto skip_owner;
632 
633 		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
634 		bpf_task_release(task);
635 
636 		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
637 		id = get_owner_stack_id(buf);
638 
639 		/*
640 		 * Contention just happens, or corner case `lock` is owned by process not
641 		 * `owner_pid`. For the corner case we treat it as unexpected internal error and
642 		 * just ignore the precvious tracing record.
643 		 */
644 		if (!otdata || otdata->pid != owner_pid) {
645 			struct owner_tracing_data first = {
646 				.pid = owner_pid,
647 				.timestamp = pelem->timestamp,
648 				.count = 1,
649 				.stack_id = id,
650 			};
651 			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
652 		}
653 		/* Contention is ongoing and new waiter joins */
654 		else {
655 			__sync_fetch_and_add(&otdata->count, 1);
656 
657 			/*
658 			 * The owner is the same, but stacktrace might be changed. In this case we
659 			 * store/update `owner_stat` based on current owner stack id.
660 			 */
661 			if (id != otdata->stack_id) {
662 				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
663 						  pelem->flags);
664 
665 				otdata->timestamp = pelem->timestamp;
666 				otdata->stack_id = id;
667 			}
668 		}
669 skip_owner:
670 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
671 						  BPF_F_FAST_STACK_CMP | stack_skip);
672 		if (pelem->stack_id < 0)
673 			__sync_fetch_and_add(&stack_fail, 1);
674 	} else if (aggr_mode == LOCK_AGGR_TASK) {
675 		struct task_struct *task;
676 
677 		if (lock_owner) {
678 			task = get_lock_owner(pelem->lock, pelem->flags);
679 
680 			/* The flags is not used anymore.  Pass the owner pid. */
681 			if (task)
682 				pelem->flags = BPF_CORE_READ(task, pid);
683 			else
684 				pelem->flags = -1U;
685 
686 		} else {
687 			task = bpf_get_current_task_btf();
688 		}
689 
690 		if (task) {
691 			if (update_task_data(task) < 0 && lock_owner)
692 				pelem->flags = -1U;
693 		}
694 	}
695 
696 	return 0;
697 }
698 
699 SEC("tp_btf/contention_end")
700 int contention_end(u64 *ctx)
701 {
702 	__u32 pid = 0, idx = 0;
703 	struct tstamp_data *pelem;
704 	struct contention_key key = {};
705 	struct contention_data *data;
706 	__u64 timestamp;
707 	__u64 duration;
708 	bool need_delete = false;
709 
710 	if (!enabled)
711 		return 0;
712 
713 	/*
714 	 * For spinlock and rwlock, it needs to get the timestamp for the
715 	 * per-cpu map.  However, contention_end does not have the flags
716 	 * so it cannot know whether it reads percpu or hash map.
717 	 *
718 	 * Try per-cpu map first and check if there's active contention.
719 	 * If it is, do not read hash map because it cannot go to sleeping
720 	 * locks before releasing the spinning locks.
721 	 */
722 	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
723 	if (pelem && pelem->lock) {
724 		if (pelem->lock != ctx[0])
725 			return 0;
726 	} else {
727 		pid = bpf_get_current_pid_tgid();
728 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
729 		if (!pelem || pelem->lock != ctx[0])
730 			return 0;
731 		need_delete = true;
732 	}
733 
734 	timestamp = bpf_ktime_get_ns();
735 	duration = timestamp - pelem->timestamp;
736 	if ((__s64)duration < 0) {
737 		__sync_fetch_and_add(&time_fail, 1);
738 		goto out;
739 	}
740 
741 	if (needs_callstack && lock_owner) {
742 		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
743 
744 		if (!otdata)
745 			goto skip_owner;
746 
747 		/* Update `owner_stat` */
748 		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);
749 
750 		/* No contention is occurring, delete `lock` entry in `owner_data` */
751 		if (otdata->count <= 1)
752 			bpf_map_delete_elem(&owner_data, &pelem->lock);
753 		/*
754 		 * Contention is still ongoing, with a new owner (current task). `owner_data`
755 		 * should be updated accordingly.
756 		 */
757 		else {
758 			u32 i = 0;
759 			s32 ret = (s32)ctx[1];
760 			u64 *buf;
761 
762 			otdata->timestamp = timestamp;
763 			__sync_fetch_and_add(&otdata->count, -1);
764 
765 			buf = bpf_map_lookup_elem(&stack_buf, &i);
766 			if (!buf)
767 				goto skip_owner;
768 			for (i = 0; i < (u32)max_stack; i++)
769 				buf[i] = 0x0;
770 
771 			/*
772 			 * `ret` has the return code of the lock function.
773 			 * If `ret` is negative, the current task terminates lock waiting without
774 			 * acquiring it. Owner is not changed, but we still need to update the owner
775 			 * stack.
776 			 */
777 			if (ret < 0) {
778 				s32 id = 0;
779 				struct task_struct *task;
780 
781 				if (!bpf_task_from_pid)
782 					goto skip_owner;
783 
784 				task = bpf_task_from_pid(otdata->pid);
785 				if (!task)
786 					goto skip_owner;
787 
788 				bpf_get_task_stack(task, buf,
789 						   max_stack * sizeof(unsigned long), 0);
790 				bpf_task_release(task);
791 
792 				id = get_owner_stack_id(buf);
793 
794 				/*
795 				 * If owner stack is changed, update owner stack id for this lock.
796 				 */
797 				if (id != otdata->stack_id)
798 					otdata->stack_id = id;
799 			}
800 			/*
801 			 * Otherwise, update tracing data with the current task, which is the new
802 			 * owner.
803 			 */
804 			else {
805 				otdata->pid = pid;
806 				/*
807 				 * We don't want to retrieve callstack here, since it is where the
808 				 * current task acquires the lock and provides no additional
809 				 * information. We simply assign -1 to invalidate it.
810 				 */
811 				otdata->stack_id = -1;
812 			}
813 		}
814 	}
815 skip_owner:
816 	switch (aggr_mode) {
817 	case LOCK_AGGR_CALLER:
818 		key.stack_id = pelem->stack_id;
819 		break;
820 	case LOCK_AGGR_TASK:
821 		if (lock_owner)
822 			key.pid = pelem->flags;
823 		else {
824 			if (!need_delete)
825 				pid = bpf_get_current_pid_tgid();
826 			key.pid = pid;
827 		}
828 		if (needs_callstack)
829 			key.stack_id = pelem->stack_id;
830 		break;
831 	case LOCK_AGGR_ADDR:
832 		key.lock_addr_or_cgroup = pelem->lock;
833 		if (needs_callstack)
834 			key.stack_id = pelem->stack_id;
835 		break;
836 	case LOCK_AGGR_CGROUP:
837 		key.lock_addr_or_cgroup = pelem->cgroup_id;
838 		break;
839 	default:
840 		/* should not happen */
841 		return 0;
842 	}
843 
844 	data = bpf_map_lookup_elem(&lock_stat, &key);
845 	if (!data) {
846 		if (data_map_full) {
847 			__sync_fetch_and_add(&data_fail, 1);
848 			goto out;
849 		}
850 
851 		struct contention_data first = {
852 			.total_time = duration,
853 			.max_time = duration,
854 			.min_time = duration,
855 			.count = 1,
856 			.flags = pelem->flags,
857 		};
858 		int err;
859 
860 		if (aggr_mode == LOCK_AGGR_ADDR) {
861 			first.flags |= check_lock_type(pelem->lock,
862 						       pelem->flags & LCB_F_TYPE_MASK);
863 
864 			/* Check if it's from a slab object */
865 			if (bpf_get_kmem_cache) {
866 				struct kmem_cache *s;
867 				struct slab_cache_data *d;
868 
869 				s = bpf_get_kmem_cache(pelem->lock);
870 				if (s != NULL) {
871 					/*
872 					 * Save the ID of the slab cache in the flags
873 					 * (instead of full address) to reduce the
874 					 * space in the contention_data.
875 					 */
876 					d = bpf_map_lookup_elem(&slab_caches, &s);
877 					if (d != NULL)
878 						first.flags |= d->id;
879 				}
880 			}
881 		}
882 
883 		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
884 		if (err < 0) {
885 			if (err == -EEXIST) {
886 				/* it lost the race, try to get it again */
887 				data = bpf_map_lookup_elem(&lock_stat, &key);
888 				if (data != NULL)
889 					goto found;
890 			}
891 			if (err == -E2BIG)
892 				data_map_full = 1;
893 			__sync_fetch_and_add(&data_fail, 1);
894 		}
895 		goto out;
896 	}
897 
898 found:
899 	update_contention_data(data, duration, 1);
900 
901 out:
902 	if (lock_delay)
903 		check_lock_delay(pelem->lock);
904 
905 	pelem->lock = 0;
906 	if (need_delete)
907 		bpf_map_delete_elem(&tstamp, &pid);
908 	return 0;
909 }
910 
911 extern struct rq runqueues __ksym;
912 
913 const volatile __u64 contig_page_data_addr;
914 const volatile __u64 node_data_addr;
915 const volatile int nr_nodes;
916 const volatile int sizeof_zone;
917 
918 struct rq___old {
919 	raw_spinlock_t lock;
920 } __attribute__((preserve_access_index));
921 
922 struct rq___new {
923 	raw_spinlock_t __lock;
924 } __attribute__((preserve_access_index));
925 
926 static void collect_zone_lock(void)
927 {
928 	__u64 nr_zones, zone_off;
929 	__u64 lock_addr, lock_off;
930 	__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
931 
932 	zone_off = offsetof(struct pglist_data, node_zones);
933 	lock_off = offsetof(struct zone, lock);
934 
935 	if (contig_page_data_addr) {
936 		struct pglist_data *contig_page_data;
937 
938 		contig_page_data = (void *)(long)contig_page_data_addr;
939 		nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
940 
941 		for (int i = 0; i < MAX_ZONES; i++) {
942 			__u64 zone_addr;
943 
944 			if (i >= nr_zones)
945 				break;
946 
947 			zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
948 			lock_addr = zone_addr + lock_off;
949 
950 			bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
951 		}
952 	} else if (nr_nodes > 0) {
953 		struct pglist_data **node_data = (void *)(long)node_data_addr;
954 
955 		for (int i = 0; i < nr_nodes; i++) {
956 			struct pglist_data *pgdat = NULL;
957 			int err;
958 
959 			err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
960 			if (err < 0 || pgdat == NULL)
961 				break;
962 
963 			nr_zones = BPF_CORE_READ(pgdat, nr_zones);
964 			for (int k = 0; k < MAX_ZONES; k++) {
965 				__u64 zone_addr;
966 
967 				if (k >= nr_zones)
968 					break;
969 
970 				zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
971 				lock_addr = zone_addr + lock_off;
972 
973 				bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
974 			}
975 		}
976 	}
977 }
978 
979 SEC("raw_tp/bpf_test_finish")
980 int BPF_PROG(collect_lock_syms)
981 {
982 	__u64 lock_addr, lock_off;
983 	__u32 lock_flag;
984 
985 	if (bpf_core_field_exists(struct rq___new, __lock))
986 		lock_off = offsetof(struct rq___new, __lock);
987 	else
988 		lock_off = offsetof(struct rq___old, lock);
989 
990 	for (int i = 0; i < MAX_CPUS; i++) {
991 		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
992 
993 		if (rq == NULL)
994 			break;
995 
996 		lock_addr = (__u64)(void *)rq + lock_off;
997 		lock_flag = LOCK_CLASS_RQLOCK;
998 		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
999 	}
1000 
1001 	collect_zone_lock();
1002 
1003 	return 0;
1004 }
1005 
1006 SEC("raw_tp/bpf_test_finish")
1007 int BPF_PROG(end_timestamp)
1008 {
1009 	end_ts = bpf_ktime_get_ns();
1010 	return 0;
1011 }
1012 
1013 /*
1014  * bpf_iter__kmem_cache added recently so old kernels don't have it in the
1015  * vmlinux.h.  But we cannot add it here since it will cause a compiler error
1016  * due to redefinition of the struct on later kernels.
1017  *
1018  * So it uses a CO-RE trick to access the member only if it has the type.
1019  * This will support both old and new kernels without compiler errors.
1020  */
1021 struct bpf_iter__kmem_cache___new {
1022 	struct kmem_cache *s;
1023 } __attribute__((preserve_access_index));
1024 
1025 SEC("iter/kmem_cache")
1026 int slab_cache_iter(void *ctx)
1027 {
1028 	struct kmem_cache *s = NULL;
1029 	struct slab_cache_data d;
1030 	const char *nameptr;
1031 
1032 	if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
1033 		struct bpf_iter__kmem_cache___new *iter = ctx;
1034 
1035 		s = iter->s;
1036 	}
1037 
1038 	if (s == NULL)
1039 		return 0;
1040 
1041 	nameptr = s->name;
1042 	bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);
1043 
1044 	d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
1045 	if (d.id >= LCB_F_SLAB_ID_END)
1046 		return 0;
1047 
1048 	bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
1049 	return 0;
1050 }
1051 
1052 char LICENSE[] SEC("license") = "Dual BSD/GPL";
1053