/* xref: /linux/tools/perf/util/bpf_off_cpu.c (revision 68a052239fc4b351e961f698b824f7654a346091) */
// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>
#include <bpf/btf.h>
#include <internal/xyarray.h>
#include <linux/time64.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS  32
#define MAX_PROC  4096
/* we don't need an actual timestamp, we just want the samples sorted last */
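/* the low 32 bits are left clear; off_cpu_write() bumps them to keep the samples ordered */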
#define OFF_CPU_TIMESTAMP  (~0ull << 32)

static struct off_cpu_bpf *skel;

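/*
 * Key of the off_cpu BPF hash map, filled in by the BPF program for every
 * task that gets scheduled out long enough.  The layout must match the key
 * struct used in bpf_skel/off_cpu.bpf.c; on the BPF side the map looks
 * roughly like this (a sketch, not the exact definition):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_HASH);
 *		__type(key, struct off_cpu_key);
 *		__type(value, __u64);		// accumulated off-CPU time in ns
 *	} off_cpu SEC(".maps");
 */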
struct off_cpu_key {
	u32 pid;
	u32 tgid;
	u32 stack_id;
	u32 state;
	u64 cgroup_id;
};

union off_cpu_data {
	struct perf_event_header hdr;
	u64 array[1024 / sizeof(u64)];
};

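/*
 * Scratch buffer for the PERF_SAMPLE_RAW payload of a synthesized sample,
 * laid out by off_cpu_write() as:
 *
 *	[0]       pid << 32 | tgid
 *	[1]       accumulated off-CPU time (ns)
 *	[2]       callchain nr (stack length + 1)
 *	[3]       PERF_CONTEXT_USER
 *	[4...]    up to MAX_STACKS stack entries
 *	[last]    cgroup id
 *
 * i.e. at most MAX_STACKS + 5 u64 slots.
 */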
u64 off_cpu_raw[MAX_STACKS + 5];

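/*
 * Add the bpf-output event (OFFCPU_EVENT) under which the off-CPU samples
 * are recorded.  It is forced to be system-wide so that it gets opened on
 * every CPU; off_cpu_start() later hands its per-CPU fds to the BPF
 * program for direct sample output.
 */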
static int off_cpu_config(struct evlist *evlist)
{
	char off_cpu_event[64];
	struct evsel *evsel;

	scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/name=%s/", OFFCPU_EVENT);
	if (parse_event(evlist, off_cpu_event)) {
		pr_err("Failed to open off-cpu event\n");
		return -1;
	}

	evlist__for_each_entry(evlist, evsel) {
		if (evsel__is_offcpu_event(evsel)) {
			evsel->core.system_wide = true;
			break;
		}
	}

	return 0;
}

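/*
 * "record_start" perf hook: called when recording actually starts.  Add the
 * forked workload's PID to the task filter (when filtering by TGID), wire
 * each CPU's fd of the off-cpu event into the offcpu_output map so the BPF
 * program can emit samples directly, then enable the BPF program.
 */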
static void off_cpu_start(void *arg)
{
	struct evlist *evlist = arg;
	struct evsel *evsel;
	struct perf_cpu pcpu;
	int i;

	/* update task filter for the given workload */
	if (skel->rodata->has_task && skel->rodata->uses_tgid &&
	    perf_thread_map__pid(evlist->core.threads, 0) != -1) {
		int fd;
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);
		pid = perf_thread_map__pid(evlist->core.threads, 0);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	/* update BPF perf_event map */
	evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return;
	}

	perf_cpu_map__for_each_cpu(pcpu, i, evsel->core.cpus) {
		int err;
		int cpu_nr = pcpu.cpu;

		err = bpf_map__update_elem(skel->maps.offcpu_output, &cpu_nr, sizeof(int),
					   xyarray__entry(evsel->core.fd, cpu_nr, 0),
					   sizeof(int), BPF_ANY);
		if (err) {
			pr_err("Failed to update perf event map for direct off-cpu dumping\n");
			return;
		}
	}

	skel->bss->enabled = 1;
}

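/*
 * "record_end" perf hook: stop the BPF program from collecting more data
 * and tear down the skeleton.
 */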
static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}

/* the v5.18 kernel added a prev_state arg, so we need to check the tracepoint signature */
static void check_sched_switch_args(void)
{
	struct btf *btf = btf__load_vmlinux_btf();
	const struct btf_type *t1, *t2, *t3;
	u32 type_id;

	if (!btf) {
		pr_debug("Missing btf, check if CONFIG_DEBUG_INFO_BTF is enabled\n");
		goto cleanup;
	}

	type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
					 BTF_KIND_TYPEDEF);
	if ((s32)type_id < 0)
		goto cleanup;

	t1 = btf__type_by_id(btf, type_id);
	if (t1 == NULL)
		goto cleanup;

	t2 = btf__type_by_id(btf, t1->type);
	if (t2 == NULL || !btf_is_ptr(t2))
		goto cleanup;

	t3 = btf__type_by_id(btf, t2->type);
	/* btf_trace func proto has one more argument for the context */
	if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 5) {
		/* new format: pass prev_state as 4th arg */
		skel->rodata->has_prev_state = true;
	}
cleanup:
	btf__free(btf);
}

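/*
 * Set up everything needed for off-CPU profiling before recording starts:
 * add the off-cpu event to the evlist, open the BPF skeleton, size and
 * flag the cpu/task/cgroup filter maps according to the target, probe the
 * sched_switch signature, load the BPF program, fill the filter maps,
 * attach the program, and register the record_start/record_end hooks above.
 */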
int off_cpu_prepare(struct evlist *evlist, struct target *target,
		    struct record_opts *opts)
{
	int err, fd, i;
	int ncpus = 1, ntasks = 1, ncgrps = 1;
	struct strlist *pid_slist = NULL;
	struct str_node *pos;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to config off-cpu BPF event\n");
		return -1;
	}

	skel = off_cpu_bpf__open();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	/* don't need to set cpu filter for system-wide mode */
	if (target->cpu_list) {
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
		skel->rodata->has_cpu = 1;
	}

	if (target->pid) {
		pid_slist = strlist__new(target->pid, NULL);
		if (!pid_slist) {
			pr_err("Failed to create a strlist for pid\n");
			return -1;
		}

		ntasks = 0;
		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			ntasks++;
		}

		if (ntasks < MAX_PROC)
			ntasks = MAX_PROC;

		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	} else if (target__has_task(target)) {
		ntasks = perf_thread_map__nr(evlist->core.threads);
		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
	} else if (target__none(target)) {
		bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	}

	if (evlist__first(evlist)->cgrp) {
		ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
		bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
		skel->rodata->has_cgroup = 1;
	}

	if (opts->record_cgroup) {
		skel->rodata->needs_cgroup = true;

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
	}

	set_max_rlimit();
	check_sched_switch_args();

	err = off_cpu_bpf__load(skel);
	if (err) {
		pr_err("Failed to load off-cpu skeleton\n");
		goto out;
	}

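	/*
	 * The filter maps exist only after off_cpu_bpf__load() has created
	 * them, so they can be populated from the target only now.
	 */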
	if (target->cpu_list) {
		u32 cpu;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target->pid) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			u32 tgid;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			tgid = pid;
			bpf_map_update_elem(fd, &tgid, &val, BPF_ANY);
		}
	} else if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	if (evlist__first(evlist)->cgrp) {
		struct evsel *evsel;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cgroup_filter);

		evlist__for_each_entry(evlist, evsel) {
			struct cgroup *cgrp = evsel->cgrp;

			if (cgrp == NULL)
				continue;

			if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
				pr_err("Failed to read cgroup id of %s\n",
				       cgrp->name);
				goto out;
			}

			bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
		}
	}

	skel->bss->offcpu_thresh_ns = opts->off_cpu_thresh_ns;

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
		pr_err("Failed to set off-cpu record hooks\n");
		goto out;
	}

	return 0;

out:
	off_cpu_bpf__destroy(skel);
	return -1;
}

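/*
 * Drain the BPF maps once recording is done: for every entry in the
 * off_cpu map, synthesize a PERF_RECORD_SAMPLE for the off-cpu event and
 * append it to the perf data file.  Returns the number of bytes written.
 */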
int off_cpu_write(struct perf_session *session)
{
	int bytes = 0, size;
	int fd, stack;
	u32 raw_size;
	u64 sample_type, val, sid = 0;
	struct evsel *evsel;
	struct perf_data_file *file = &session->data->file;
	struct off_cpu_key prev, key;
	union off_cpu_data data = {
		.hdr = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_USER,
		},
	};
	u64 tstamp = OFF_CPU_TIMESTAMP;

	skel->bss->enabled = 0;

	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return 0;
	}

	sample_type = evsel->core.attr.sample_type;

	if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
		pr_err("unsupported sample type: %llx\n",
		       (unsigned long long)sample_type);
		return -1;
	}

	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
		if (evsel->core.id)
			sid = evsel->core.id[0];
	}

	fd = bpf_map__fd(skel->maps.off_cpu);
	stack = bpf_map__fd(skel->maps.stacks);
	memset(&prev, 0, sizeof(prev));

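	/*
	 * Walk every entry of the off_cpu map; 'prev' starts zeroed so the
	 * first call returns the first key, and each iteration advances
	 * from the previous one.
	 */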
	while (!bpf_map_get_next_key(fd, &prev, &key)) {
		int n = 1;  /* start from perf_event_header */

		bpf_map_lookup_elem(fd, &key, &val);

		/* zero-fill some of the fields; they will be overwritten from the raw data when parsing */
		if (sample_type & PERF_SAMPLE_IDENTIFIER)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_IP)
			data.array[n++] = 0;  /* will be updated */
		if (sample_type & PERF_SAMPLE_TID)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_TIME)
			data.array[n++] = tstamp;
		if (sample_type & PERF_SAMPLE_CPU)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_PERIOD)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_RAW) {
			/*
			 *  [ size ][ data ]
			 *  [     data     ]
			 *  [     data     ]
			 *  [     data     ]
			 *  [ data ][ empty]
			 */
			int len = 0, i = 0;
			void *raw_data = (void *)data.array + n * sizeof(u64);

			off_cpu_raw[i++] = (u64)key.pid << 32 | key.tgid;
			off_cpu_raw[i++] = val;

			/* off_cpu_raw[i] is callchain->nr (updated later) */
			off_cpu_raw[i + 1] = PERF_CONTEXT_USER;
			off_cpu_raw[i + 2] = 0;

			bpf_map_lookup_elem(stack, &key.stack_id, &off_cpu_raw[i + 2]);
			while (off_cpu_raw[i + 2 + len])
				len++;

			off_cpu_raw[i] = len + 1;
			i += len + 2;

			off_cpu_raw[i++] = key.cgroup_id;

			raw_size = i * sizeof(u64) + sizeof(u32); /* 4 bytes for alignment */
			memcpy(raw_data, &raw_size, sizeof(raw_size));
			memcpy(raw_data + sizeof(u32), off_cpu_raw, i * sizeof(u64));
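			/*
			 * The raw area holds a 4-byte size, i * 8 bytes of
			 * data and 4 bytes of alignment padding, i.e. i + 1
			 * u64 slots of data.array.
			 */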
			n += i + 1;
		}
		if (sample_type & PERF_SAMPLE_CGROUP)
			data.array[n++] = key.cgroup_id;

		size = n * sizeof(u64);
		data.hdr.size = size;
		bytes += size;

		if (perf_data_file__write(file, &data, size) < 0) {
			pr_err("failed to write perf data, error: %m\n");
			return bytes;
		}

		prev = key;
		/* increase dummy timestamp to sort later samples */
		tstamp++;
	}
	return bytes;
}