xref: /linux/tools/perf/util/bpf_skel/off_cpu.bpf.c (revision e47a324d6f07c9ef252cfce1f14cfa5110cbed99)
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

#define MAX_CPUS  4096
#define MAX_OFFCPU_LEN 37

// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1,
// so use a different name here to avoid the conflict.
struct __stack {
	u64 array[MAX_STACKS];
};

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
	struct __stack stack;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

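/* user callchains of sleeping tasks, keyed by the id returned from bpf_get_stackid() */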
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

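/*
 * Buffer for one direct off-cpu sample as built by off_cpu_dump():
 * pid/tgid, off-cpu time, callchain nr, PERF_CONTEXT_USER plus up to
 * MAX_STACKS entries, and the cgroup id -- MAX_OFFCPU_LEN (37) u64s in total.
 */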
struct offcpu_data {
	u64 array[MAX_OFFCPU_LEN];
};

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__uint(max_entries, MAX_CPUS);
} offcpu_output SEC(".maps");

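/* per-CPU scratch buffer used to build a sample before sending it to offcpu_output */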
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct offcpu_data));
	__uint(max_entries, 1);
} offcpu_payload SEC(".maps");

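/* per-task storage recording when, why and where a task went off-cpu */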
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

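/* accumulated off-cpu time per offcpu_key, read out by user space at the end of the run */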
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

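/*
 * Allow-list maps consulted by can_record(); only the presence of a key
 * matters, the __u8 value is a dummy.
 */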
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;

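/* the switches below are set by user space before the skeleton is loaded */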
const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

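/*
 * Off-cpu periods of at least this length are emitted directly as raw samples;
 * shorter ones are only accumulated in the off_cpu map.
 */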
__u64 offcpu_thresh_ns;

/*
 * Old kernels called it task_struct->state; newer kernels renamed it to '__state'.
 * Use the BPF CO-RE "ignored suffix rule" to handle both, as described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

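/*
 * Return the cgroup id used for filtering and for the sample key: the cgroup
 * on the default (v2) hierarchy normally, or the perf_event controller's
 * cgroup when running with cgroup v1.
 */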
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

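	/* only record tasks that are going to sleep, not ones preempted while runnable */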
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

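/*
 * Copy the saved user stack into the sample buffer starting at index n + 2,
 * right after the callchain nr and PERF_CONTEXT_USER slots; return the number
 * of entries copied.
 */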
static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
{
	int len = 0;

	for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
		to->array[n + 2 + i] = from->array[i];

	return len;
}

/**
 * off_cpu_dump - dump off-cpu samples to ring buffer
 * @ctx: tracepoint context, passed through to bpf_perf_event_output()
 * @data: payload for dumping off-cpu samples
 * @key: off-cpu key identifying the task
 * @stack: stack trace of the task before being scheduled out
 * @delta: off-cpu time of the task in nanoseconds
 *
 * If the off-cpu time reaches the threshold, gather the tid, period, callchain,
 * and cgroup id of the task and dump them as a raw sample to the perf ring buffer.
 */
static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
			struct __stack *stack, __u64 delta)
{
	int n = 0, len = 0;

	data->array[n++] = (u64)key->tgid << 32 | key->pid;
	data->array[n++] = delta;

	/* data->array[n] is callchain->nr (updated later) */
	data->array[n + 1] = PERF_CONTEXT_USER;
	data->array[n + 2] = 0;
	len = copy_stack(stack, data, n);

	/* update length of callchain */
	data->array[n] = len + 1;
	n += len + 2;

	data->array[n++] = key->cgroup_id;

	return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
}

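/*
 * Called on every context switch: record the time, state and user stack of
 * prev as it goes off-cpu, and account the off-cpu period of next if one was
 * previously recorded -- either as a direct sample (>= offcpu_thresh_ns) or
 * as an entry in the off_cpu map.
 */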
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

	/*
	 * If bpf_get_stackid() collected the stack successfully, collect it once
	 * more into the task storage for direct off-cpu sample dumping.
	 */
	if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
		/*
		 * This empty if block avoids a 'result unused' warning from
		 * bpf_get_stack().  If the collection fails, continue with the
		 * logic for the next task.
		 */
	}
next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		if (delta >= offcpu_thresh_ns) {
			int zero = 0;
			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);

			if (data)
				off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
		} else {
			total = bpf_map_lookup_elem(&off_cpu, &key);
			if (total)
				*total += delta;
			else
				bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
		}

		/* prevent reusing the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

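/*
 * Keep the task filter up to date: when a filtered process forks a new child
 * process (rather than a new thread), add the child's tgid to task_filter too.
 */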
325 SEC("tp_btf/task_newtask")
326 int on_newtask(u64 *ctx)
327 {
328 	struct task_struct *task;
329 	u64 clone_flags;
330 	u32 pid;
331 	u8 val = 1;
332 
333 	if (!uses_tgid)
334 		return 0;
335 
336 	task = (struct task_struct *)bpf_get_current_task();
337 
338 	pid = BPF_CORE_READ(task, tgid);
339 	if (!bpf_map_lookup_elem(&task_filter, &pid))
340 		return 0;
341 
342 	task = (struct task_struct *)ctx[0];
343 	clone_flags = ctx[1];
344 
345 	pid = task->tgid;
346 	if (!(clone_flags & CLONE_THREAD))
347 		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
348 
349 	return 0;
350 }
351 
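/*
 * sched_switch handler: ctx[1] is prev and ctx[2] is next; ctx[3] is prev_state
 * on kernels whose tracepoint carries it (has_prev_state), otherwise the state
 * is read from the task_struct itself.
 */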
352 SEC("tp_btf/sched_switch")
353 int on_switch(u64 *ctx)
354 {
355 	struct task_struct *prev, *next;
356 	int prev_state;
357 
358 	if (!enabled)
359 		return 0;
360 
361 	prev = (struct task_struct *)ctx[1];
362 	next = (struct task_struct *)ctx[2];
363 
364 	if (has_prev_state)
365 		prev_state = (int)ctx[3];
366 	else
367 		prev_state = get_task_state(prev);
368 
369 	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
370 }
371 
372 char LICENSE[] SEC("license") = "Dual BSD/GPL";
373