xref: /linux/tools/perf/util/bpf-trace-summary.c (revision 0939bd2fcf337243133b0271335a2838857c319f)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #include <inttypes.h>
3 #include <math.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 
7 #include "dwarf-regs.h" /* for EM_HOST */
8 #include "syscalltbl.h"
9 #include "util/cgroup.h"
10 #include "util/hashmap.h"
11 #include "util/trace.h"
12 #include "util/util.h"
13 #include <bpf/bpf.h>
14 #include <linux/rbtree.h>
15 #include <linux/time64.h>
16 #include <tools/libc_compat.h> /* reallocarray */
17 
18 #include "bpf_skel/syscall_summary.h"
19 #include "bpf_skel/syscall_summary.skel.h"
20 
21 
22 static struct syscall_summary_bpf *skel;
23 static struct rb_root cgroups = RB_ROOT;
24 
25 int trace_prepare_bpf_summary(enum trace_summary_mode mode)
26 {
27 	skel = syscall_summary_bpf__open();
28 	if (skel == NULL) {
29 		fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
30 		return -1;
31 	}
32 
33 	if (mode == SUMMARY__BY_THREAD)
34 		skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
35 	else if (mode == SUMMARY__BY_CGROUP)
36 		skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
37 	else
38 		skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
39 
40 	if (cgroup_is_v2("perf_event") > 0)
41 		skel->rodata->use_cgroup_v2 = 1;
42 
43 	if (syscall_summary_bpf__load(skel) < 0) {
44 		fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
45 		return -1;
46 	}
47 
48 	if (syscall_summary_bpf__attach(skel) < 0) {
49 		fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
50 		return -1;
51 	}
52 
53 	if (mode == SUMMARY__BY_CGROUP)
54 		read_all_cgroups(&cgroups);
55 
56 	return 0;
57 }
58 
/*
 * Set the 'enabled' flag in the skeleton's bss section.
 *
 * Dereferences the file-level 'skel', so it must have been set up by
 * trace_prepare_bpf_summary() first.
 * NOTE(review): presumably the BPF side only collects stats while this
 * flag is non-zero - confirm in the BPF program.
 */
void trace_start_bpf_summary(void)
{
	skel->bss->enabled = 1;
}
63 
/*
 * Clear the 'enabled' flag in the skeleton's bss section.
 *
 * Dereferences the file-level 'skel', so it must have been set up by
 * trace_prepare_bpf_summary() first.
 * NOTE(review): presumably this stops the BPF side from collecting any
 * further stats - confirm in the BPF program.
 */
void trace_end_bpf_summary(void)
{
	skel->bss->enabled = 0;
}
68 
/* Stats of a single syscall; syscall_data holds an array of these. */
struct syscall_node {
	int syscall_nr;			/* syscall number, resolved to a name when printing */
	struct syscall_stats stats;	/* counters copied or summed from the BPF map */
};
73 
74 static double rel_stddev(struct syscall_stats *stat)
75 {
76 	double variance, average;
77 
78 	if (stat->count < 2)
79 		return 0;
80 
81 	average = (double)stat->total_time / stat->count;
82 
83 	variance = stat->squared_sum;
84 	variance -= (stat->total_time * stat->total_time) / stat->count;
85 	variance /= stat->count - 1;
86 
87 	return 100 * sqrt(variance / stat->count) / average;
88 }
89 
/*
 * The syscall_data is to maintain syscall stats ordered by total time.
 * It supports different summary modes like per-thread or global.
 *
 * For per-thread stats, it uses a two-level data structure -
 * syscall_data is keyed by TID and has an array of nodes which
 * represent each syscall for the thread.
 *
 * For global stats, it's still two-level technically but we don't need
 * per-cpu analysis so it's keyed by the syscall number to combine stats
 * from different CPUs.  And syscall_data always has a single syscall_node
 * so it can effectively work as a flat hierarchy.
 *
 * For per-cgroup stats, it uses a two-level data structure like the
 * per-thread case - syscall_data is keyed by CGROUP and has an array of
 * nodes which represent each syscall for the cgroup.
 */
struct syscall_data {
	u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
	int nr_events;			/* sum of the call counts of all nodes */
	int nr_nodes;			/* number of elements in nodes[] */
	u64 total_time;			/* sum of the total time of all nodes, used as sort key */
	struct syscall_node *nodes;	/* heap-allocated array, freed when the summary is printed */
};
114 
115 static int datacmp(const void *a, const void *b)
116 {
117 	const struct syscall_data * const *sa = a;
118 	const struct syscall_data * const *sb = b;
119 
120 	return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
121 }
122 
123 static int nodecmp(const void *a, const void *b)
124 {
125 	const struct syscall_node *na = a;
126 	const struct syscall_node *nb = b;
127 
128 	return na->stats.total_time > nb->stats.total_time ? -1 : 1;
129 }
130 
/* Hash callback for the summary hashmap: the key itself is used as hash. */
static size_t sc_node_hash(long key, void *ctx __maybe_unused)
{
	return key;
}
135 
/* Equality callback for the summary hashmap: plain integer comparison of the keys. */
static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
{
	return key1 == key2;
}
140 
141 static int print_common_stats(struct syscall_data *data, FILE *fp)
142 {
143 	int printed = 0;
144 
145 	for (int i = 0; i < data->nr_nodes; i++) {
146 		struct syscall_node *node = &data->nodes[i];
147 		struct syscall_stats *stat = &node->stats;
148 		double total = (double)(stat->total_time) / NSEC_PER_MSEC;
149 		double min = (double)(stat->min_time) / NSEC_PER_MSEC;
150 		double max = (double)(stat->max_time) / NSEC_PER_MSEC;
151 		double avg = total / stat->count;
152 		const char *name;
153 
154 		/* TODO: support other ABIs */
155 		name = syscalltbl__name(EM_HOST, node->syscall_nr);
156 		if (name)
157 			printed += fprintf(fp, "   %-15s", name);
158 		else
159 			printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);
160 
161 		printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
162 				   stat->count, stat->error, total, min, avg, max,
163 				   rel_stddev(stat));
164 	}
165 	return printed;
166 }
167 
168 static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
169 			       struct syscall_stats *map_data)
170 {
171 	struct syscall_data *data;
172 	struct syscall_node *nodes;
173 
174 	if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
175 		data = zalloc(sizeof(*data));
176 		if (data == NULL)
177 			return -ENOMEM;
178 
179 		data->key = map_key->cpu_or_tid;
180 		if (hashmap__add(hash, data->key, data) < 0) {
181 			free(data);
182 			return -ENOMEM;
183 		}
184 	}
185 
186 	/* update thread total stats */
187 	data->nr_events += map_data->count;
188 	data->total_time += map_data->total_time;
189 
190 	nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
191 	if (nodes == NULL)
192 		return -ENOMEM;
193 
194 	data->nodes = nodes;
195 	nodes = &data->nodes[data->nr_nodes++];
196 	nodes->syscall_nr = map_key->nr;
197 
198 	/* each thread has an entry for each syscall, just use the stat */
199 	memcpy(&nodes->stats, map_data, sizeof(*map_data));
200 	return 0;
201 }
202 
203 static int print_thread_stat(struct syscall_data *data, FILE *fp)
204 {
205 	int printed = 0;
206 
207 	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
208 
209 	printed += fprintf(fp, " thread (%d), ", (int)data->key);
210 	printed += fprintf(fp, "%d events\n\n", data->nr_events);
211 
212 	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
213 	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
214 	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
215 
216 	printed += print_common_stats(data, fp);
217 	printed += fprintf(fp, "\n\n");
218 
219 	return printed;
220 }
221 
/*
 * Print the per-thread summary for each of the @nr_data entries in @data,
 * in array order.  Returns the accumulated output of print_thread_stat().
 */
static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int printed = 0;
	int idx = 0;

	while (idx < nr_data)
		printed += print_thread_stat(data[idx++], fp);

	return printed;
}
231 
232 static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
233 			      struct syscall_stats *map_data)
234 {
235 	struct syscall_data *data;
236 	struct syscall_stats *stat;
237 
238 	if (!hashmap__find(hash, map_key->nr, &data)) {
239 		data = zalloc(sizeof(*data));
240 		if (data == NULL)
241 			return -ENOMEM;
242 
243 		data->nodes = zalloc(sizeof(*data->nodes));
244 		if (data->nodes == NULL) {
245 			free(data);
246 			return -ENOMEM;
247 		}
248 
249 		data->nr_nodes = 1;
250 		data->key = map_key->nr;
251 		data->nodes->syscall_nr = data->key;
252 
253 		if (hashmap__add(hash, data->key, data) < 0) {
254 			free(data->nodes);
255 			free(data);
256 			return -ENOMEM;
257 		}
258 	}
259 
260 	/* update total stats for this syscall */
261 	data->nr_events += map_data->count;
262 	data->total_time += map_data->total_time;
263 
264 	/* This is sum of the same syscall from different CPUs */
265 	stat = &data->nodes->stats;
266 
267 	stat->total_time += map_data->total_time;
268 	stat->squared_sum += map_data->squared_sum;
269 	stat->count += map_data->count;
270 	stat->error += map_data->error;
271 
272 	if (stat->max_time < map_data->max_time)
273 		stat->max_time = map_data->max_time;
274 	if (stat->min_time > map_data->min_time || stat->min_time == 0)
275 		stat->min_time = map_data->min_time;
276 
277 	return 0;
278 }
279 
280 static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
281 {
282 	int printed = 0;
283 	int nr_events = 0;
284 
285 	for (int i = 0; i < nr_data; i++)
286 		nr_events += data[i]->nr_events;
287 
288 	printed += fprintf(fp, " total, %d events\n\n", nr_events);
289 
290 	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
291 	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
292 	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
293 
294 	for (int i = 0; i < nr_data; i++)
295 		printed += print_common_stats(data[i], fp);
296 
297 	printed += fprintf(fp, "\n\n");
298 	return printed;
299 }
300 
301 static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
302 			       struct syscall_stats *map_data)
303 {
304 	struct syscall_data *data;
305 	struct syscall_node *nodes;
306 
307 	if (!hashmap__find(hash, map_key->cgroup, &data)) {
308 		data = zalloc(sizeof(*data));
309 		if (data == NULL)
310 			return -ENOMEM;
311 
312 		data->key = map_key->cgroup;
313 		if (hashmap__add(hash, data->key, data) < 0) {
314 			free(data);
315 			return -ENOMEM;
316 		}
317 	}
318 
319 	/* update thread total stats */
320 	data->nr_events += map_data->count;
321 	data->total_time += map_data->total_time;
322 
323 	nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
324 	if (nodes == NULL)
325 		return -ENOMEM;
326 
327 	data->nodes = nodes;
328 	nodes = &data->nodes[data->nr_nodes++];
329 	nodes->syscall_nr = map_key->nr;
330 
331 	/* each thread has an entry for each syscall, just use the stat */
332 	memcpy(&nodes->stats, map_data, sizeof(*map_data));
333 	return 0;
334 }
335 
336 static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
337 {
338 	int printed = 0;
339 	struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
340 
341 	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
342 
343 	if (cgrp)
344 		printed += fprintf(fp, " cgroup %s,", cgrp->name);
345 	else
346 		printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
347 
348 	printed += fprintf(fp, " %d events\n\n", data->nr_events);
349 
350 	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
351 	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
352 	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
353 
354 	printed += print_common_stats(data, fp);
355 	printed += fprintf(fp, "\n\n");
356 
357 	return printed;
358 }
359 
/*
 * Print the per-cgroup summary for each of the @nr_data entries in @data,
 * in array order.  Returns the accumulated output of print_cgroup_stat().
 */
static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int printed = 0;
	int idx = 0;

	while (idx < nr_data)
		printed += print_cgroup_stat(data[idx++], fp);

	return printed;
}
369 
370 int trace_print_bpf_summary(FILE *fp)
371 {
372 	struct bpf_map *map = skel->maps.syscall_stats_map;
373 	struct syscall_key *prev_key, key;
374 	struct syscall_data **data = NULL;
375 	struct hashmap schash;
376 	struct hashmap_entry *entry;
377 	int nr_data = 0;
378 	int printed = 0;
379 	int i;
380 	size_t bkt;
381 
382 	hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
383 
384 	printed = fprintf(fp, "\n Summary of events:\n\n");
385 
386 	/* get stats from the bpf map */
387 	prev_key = NULL;
388 	while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
389 		struct syscall_stats stat;
390 
391 		if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
392 			switch (skel->rodata->aggr_mode) {
393 			case SYSCALL_AGGR_THREAD:
394 				update_thread_stats(&schash, &key, &stat);
395 				break;
396 			case SYSCALL_AGGR_CPU:
397 				update_total_stats(&schash, &key, &stat);
398 				break;
399 			case SYSCALL_AGGR_CGROUP:
400 				update_cgroup_stats(&schash, &key, &stat);
401 				break;
402 			default:
403 				break;
404 			}
405 		}
406 
407 		prev_key = &key;
408 	}
409 
410 	nr_data = hashmap__size(&schash);
411 	data = calloc(nr_data, sizeof(*data));
412 	if (data == NULL)
413 		goto out;
414 
415 	i = 0;
416 	hashmap__for_each_entry(&schash, entry, bkt)
417 		data[i++] = entry->pvalue;
418 
419 	qsort(data, nr_data, sizeof(*data), datacmp);
420 
421 	switch (skel->rodata->aggr_mode) {
422 	case SYSCALL_AGGR_THREAD:
423 		printed += print_thread_stats(data, nr_data, fp);
424 		break;
425 	case SYSCALL_AGGR_CPU:
426 		printed += print_total_stats(data, nr_data, fp);
427 		break;
428 	case SYSCALL_AGGR_CGROUP:
429 		printed += print_cgroup_stats(data, nr_data, fp);
430 		break;
431 	default:
432 		break;
433 	}
434 
435 	for (i = 0; i < nr_data && data; i++) {
436 		free(data[i]->nodes);
437 		free(data[i]);
438 	}
439 	free(data);
440 
441 out:
442 	hashmap__clear(&schash);
443 	return printed;
444 }
445 
446 void trace_cleanup_bpf_summary(void)
447 {
448 	if (!RB_EMPTY_ROOT(&cgroups)) {
449 		struct cgroup *cgrp, *tmp;
450 
451 		rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
452 			cgroup__put(cgrp);
453 
454 		cgroups = RB_ROOT;
455 	}
456 
457 	syscall_summary_bpf__destroy(skel);
458 }
459