1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #include <errno.h> 3 #include <inttypes.h> 4 #include <math.h> 5 #include <stdio.h> 6 #include <stdlib.h> 7 8 #include "dwarf-regs.h" /* for EM_HOST */ 9 #include "syscalltbl.h" 10 #include "util/cgroup.h" 11 #include "util/hashmap.h" 12 #include "util/trace.h" 13 #include "util/util.h" 14 #include <bpf/bpf.h> 15 #include <linux/rbtree.h> 16 #include <linux/time64.h> 17 #include <tools/libc_compat.h> /* reallocarray */ 18 19 #include "bpf_skel/syscall_summary.h" 20 #include "bpf_skel/syscall_summary.skel.h" 21 22 23 static struct syscall_summary_bpf *skel; 24 static struct rb_root cgroups = RB_ROOT; 25 26 int trace_prepare_bpf_summary(enum trace_summary_mode mode) 27 { 28 skel = syscall_summary_bpf__open(); 29 if (skel == NULL) { 30 fprintf(stderr, "failed to open syscall summary bpf skeleton\n"); 31 return -1; 32 } 33 34 if (mode == SUMMARY__BY_THREAD) 35 skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD; 36 else if (mode == SUMMARY__BY_CGROUP) 37 skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP; 38 else 39 skel->rodata->aggr_mode = SYSCALL_AGGR_CPU; 40 41 if (cgroup_is_v2("perf_event") > 0) 42 skel->rodata->use_cgroup_v2 = 1; 43 44 if (syscall_summary_bpf__load(skel) < 0) { 45 fprintf(stderr, "failed to load syscall summary bpf skeleton\n"); 46 return -1; 47 } 48 49 if (syscall_summary_bpf__attach(skel) < 0) { 50 fprintf(stderr, "failed to attach syscall summary bpf skeleton\n"); 51 return -1; 52 } 53 54 if (mode == SUMMARY__BY_CGROUP) 55 read_all_cgroups(&cgroups); 56 57 return 0; 58 } 59 60 void trace_start_bpf_summary(void) 61 { 62 skel->bss->enabled = 1; 63 } 64 65 void trace_end_bpf_summary(void) 66 { 67 skel->bss->enabled = 0; 68 } 69 70 struct syscall_node { 71 int syscall_nr; 72 struct syscall_stats stats; 73 }; 74 75 static double rel_stddev(struct syscall_stats *stat) 76 { 77 double variance, average; 78 79 if (stat->count < 2) 80 return 0; 81 82 average = (double)stat->total_time / stat->count; 83 84 variance = stat->squared_sum; 85 variance -= (stat->total_time * stat->total_time) / stat->count; 86 variance /= stat->count - 1; 87 88 return 100 * sqrt(variance / stat->count) / average; 89 } 90 91 /* 92 * The syscall_data is to maintain syscall stats ordered by total time. 93 * It supports different summary modes like per-thread or global. 94 * 95 * For per-thread stats, it uses two-level data strurcture - 96 * syscall_data is keyed by TID and has an array of nodes which 97 * represents each syscall for the thread. 98 * 99 * For global stats, it's still two-level technically but we don't need 100 * per-cpu analysis so it's keyed by the syscall number to combine stats 101 * from different CPUs. And syscall_data always has a syscall_node so 102 * it can effectively work as flat hierarchy. 103 * 104 * For per-cgroup stats, it uses two-level data structure like thread 105 * syscall_data is keyed by CGROUP and has an array of node which 106 * represents each syscall for the cgroup. 107 */ 108 struct syscall_data { 109 u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */ 110 int nr_events; 111 int nr_nodes; 112 u64 total_time; 113 struct syscall_node *nodes; 114 }; 115 116 static int datacmp(const void *a, const void *b) 117 { 118 const struct syscall_data * const *sa = a; 119 const struct syscall_data * const *sb = b; 120 121 return (*sa)->total_time > (*sb)->total_time ? -1 : 1; 122 } 123 124 static int nodecmp(const void *a, const void *b) 125 { 126 const struct syscall_node *na = a; 127 const struct syscall_node *nb = b; 128 129 return na->stats.total_time > nb->stats.total_time ? -1 : 1; 130 } 131 132 static size_t sc_node_hash(long key, void *ctx __maybe_unused) 133 { 134 return key; 135 } 136 137 static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused) 138 { 139 return key1 == key2; 140 } 141 142 static int print_common_stats(struct syscall_data *data, int max_summary, FILE *fp) 143 { 144 int printed = 0; 145 146 if (max_summary == 0 || max_summary > data->nr_nodes) 147 max_summary = data->nr_nodes; 148 149 for (int i = 0; i < max_summary; i++) { 150 struct syscall_node *node = &data->nodes[i]; 151 struct syscall_stats *stat = &node->stats; 152 double total = (double)(stat->total_time) / NSEC_PER_MSEC; 153 double min = (double)(stat->min_time) / NSEC_PER_MSEC; 154 double max = (double)(stat->max_time) / NSEC_PER_MSEC; 155 double avg = total / stat->count; 156 const char *name; 157 158 /* TODO: support other ABIs */ 159 name = syscalltbl__name(EM_HOST, node->syscall_nr); 160 if (name) 161 printed += fprintf(fp, " %-15s", name); 162 else 163 printed += fprintf(fp, " syscall:%-7d", node->syscall_nr); 164 165 printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n", 166 stat->count, stat->error, total, min, avg, max, 167 rel_stddev(stat)); 168 } 169 return printed; 170 } 171 172 static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key, 173 struct syscall_stats *map_data) 174 { 175 struct syscall_data *data; 176 struct syscall_node *nodes; 177 178 if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) { 179 data = zalloc(sizeof(*data)); 180 if (data == NULL) 181 return -ENOMEM; 182 183 data->key = map_key->cpu_or_tid; 184 if (hashmap__add(hash, data->key, data) < 0) { 185 free(data); 186 return -ENOMEM; 187 } 188 } 189 190 /* update thread total stats */ 191 data->nr_events += map_data->count; 192 data->total_time += map_data->total_time; 193 194 nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes)); 195 if (nodes == NULL) 196 return -ENOMEM; 197 198 data->nodes = nodes; 199 nodes = &data->nodes[data->nr_nodes++]; 200 nodes->syscall_nr = map_key->nr; 201 202 /* each thread has an entry for each syscall, just use the stat */ 203 memcpy(&nodes->stats, map_data, sizeof(*map_data)); 204 return 0; 205 } 206 207 static int print_thread_stat(struct syscall_data *data, int max_summary, FILE *fp) 208 { 209 int printed = 0; 210 211 qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp); 212 213 printed += fprintf(fp, " thread (%d), ", (int)data->key); 214 printed += fprintf(fp, "%d events\n\n", data->nr_events); 215 216 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 217 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 218 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 219 220 printed += print_common_stats(data, max_summary, fp); 221 printed += fprintf(fp, "\n\n"); 222 223 return printed; 224 } 225 226 static int print_thread_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) 227 { 228 int printed = 0; 229 230 for (int i = 0; i < nr_data; i++) 231 printed += print_thread_stat(data[i], max_summary, fp); 232 233 return printed; 234 } 235 236 static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key, 237 struct syscall_stats *map_data) 238 { 239 struct syscall_data *data; 240 struct syscall_stats *stat; 241 242 if (!hashmap__find(hash, map_key->nr, &data)) { 243 data = zalloc(sizeof(*data)); 244 if (data == NULL) 245 return -ENOMEM; 246 247 data->nodes = zalloc(sizeof(*data->nodes)); 248 if (data->nodes == NULL) { 249 free(data); 250 return -ENOMEM; 251 } 252 253 data->nr_nodes = 1; 254 data->key = map_key->nr; 255 data->nodes->syscall_nr = data->key; 256 257 if (hashmap__add(hash, data->key, data) < 0) { 258 free(data->nodes); 259 free(data); 260 return -ENOMEM; 261 } 262 } 263 264 /* update total stats for this syscall */ 265 data->nr_events += map_data->count; 266 data->total_time += map_data->total_time; 267 268 /* This is sum of the same syscall from different CPUs */ 269 stat = &data->nodes->stats; 270 271 stat->total_time += map_data->total_time; 272 stat->squared_sum += map_data->squared_sum; 273 stat->count += map_data->count; 274 stat->error += map_data->error; 275 276 if (stat->max_time < map_data->max_time) 277 stat->max_time = map_data->max_time; 278 if (stat->min_time > map_data->min_time || stat->min_time == 0) 279 stat->min_time = map_data->min_time; 280 281 return 0; 282 } 283 284 static int print_total_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) 285 { 286 int printed = 0; 287 int nr_events = 0; 288 289 for (int i = 0; i < nr_data; i++) 290 nr_events += data[i]->nr_events; 291 292 printed += fprintf(fp, " total, %d events\n\n", nr_events); 293 294 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 295 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 296 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 297 298 if (max_summary == 0 || max_summary > nr_data) 299 max_summary = nr_data; 300 301 for (int i = 0; i < max_summary; i++) 302 printed += print_common_stats(data[i], max_summary, fp); 303 304 printed += fprintf(fp, "\n\n"); 305 return printed; 306 } 307 308 static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key, 309 struct syscall_stats *map_data) 310 { 311 struct syscall_data *data; 312 struct syscall_node *nodes; 313 314 if (!hashmap__find(hash, map_key->cgroup, &data)) { 315 data = zalloc(sizeof(*data)); 316 if (data == NULL) 317 return -ENOMEM; 318 319 data->key = map_key->cgroup; 320 if (hashmap__add(hash, data->key, data) < 0) { 321 free(data); 322 return -ENOMEM; 323 } 324 } 325 326 /* update thread total stats */ 327 data->nr_events += map_data->count; 328 data->total_time += map_data->total_time; 329 330 nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes)); 331 if (nodes == NULL) 332 return -ENOMEM; 333 334 data->nodes = nodes; 335 nodes = &data->nodes[data->nr_nodes++]; 336 nodes->syscall_nr = map_key->nr; 337 338 /* each thread has an entry for each syscall, just use the stat */ 339 memcpy(&nodes->stats, map_data, sizeof(*map_data)); 340 return 0; 341 } 342 343 static int print_cgroup_stat(struct syscall_data *data, int max_summary, FILE *fp) 344 { 345 int printed = 0; 346 struct cgroup *cgrp = __cgroup__find(&cgroups, data->key); 347 348 qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp); 349 350 if (cgrp) 351 printed += fprintf(fp, " cgroup %s,", cgrp->name); 352 else 353 printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key); 354 355 printed += fprintf(fp, " %d events\n\n", data->nr_events); 356 357 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 358 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 359 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 360 361 printed += print_common_stats(data, max_summary, fp); 362 printed += fprintf(fp, "\n\n"); 363 364 return printed; 365 } 366 367 static int print_cgroup_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) 368 { 369 int printed = 0; 370 371 for (int i = 0; i < nr_data; i++) 372 printed += print_cgroup_stat(data[i], max_summary, fp); 373 374 return printed; 375 } 376 377 int trace_print_bpf_summary(FILE *fp, int max_summary) 378 { 379 struct bpf_map *map = skel->maps.syscall_stats_map; 380 struct syscall_key *prev_key, key; 381 struct syscall_data **data = NULL; 382 struct hashmap schash; 383 struct hashmap_entry *entry; 384 int nr_data = 0; 385 int printed = 0; 386 int i; 387 size_t bkt; 388 389 hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL); 390 391 printed = fprintf(fp, "\n Summary of events:\n\n"); 392 393 /* get stats from the bpf map */ 394 prev_key = NULL; 395 while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) { 396 struct syscall_stats stat; 397 398 if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) { 399 switch (skel->rodata->aggr_mode) { 400 case SYSCALL_AGGR_THREAD: 401 update_thread_stats(&schash, &key, &stat); 402 break; 403 case SYSCALL_AGGR_CPU: 404 update_total_stats(&schash, &key, &stat); 405 break; 406 case SYSCALL_AGGR_CGROUP: 407 update_cgroup_stats(&schash, &key, &stat); 408 break; 409 default: 410 break; 411 } 412 } 413 414 prev_key = &key; 415 } 416 417 nr_data = hashmap__size(&schash); 418 data = calloc(nr_data, sizeof(*data)); 419 if (data == NULL) 420 goto out; 421 422 i = 0; 423 hashmap__for_each_entry(&schash, entry, bkt) 424 data[i++] = entry->pvalue; 425 426 qsort(data, nr_data, sizeof(*data), datacmp); 427 428 switch (skel->rodata->aggr_mode) { 429 case SYSCALL_AGGR_THREAD: 430 printed += print_thread_stats(data, nr_data, max_summary, fp); 431 break; 432 case SYSCALL_AGGR_CPU: 433 printed += print_total_stats(data, nr_data, max_summary, fp); 434 break; 435 case SYSCALL_AGGR_CGROUP: 436 printed += print_cgroup_stats(data, nr_data, max_summary, fp); 437 break; 438 default: 439 break; 440 } 441 442 for (i = 0; i < nr_data && data; i++) { 443 free(data[i]->nodes); 444 free(data[i]); 445 } 446 free(data); 447 448 out: 449 hashmap__clear(&schash); 450 return printed; 451 } 452 453 void trace_cleanup_bpf_summary(void) 454 { 455 if (!RB_EMPTY_ROOT(&cgroups)) { 456 struct cgroup *cgrp, *tmp; 457 458 rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node) 459 cgroup__put(cgrp); 460 461 cgroups = RB_ROOT; 462 } 463 464 syscall_summary_bpf__destroy(skel); 465 } 466