1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #include <inttypes.h> 3 #include <math.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 7 #include "dwarf-regs.h" /* for EM_HOST */ 8 #include "syscalltbl.h" 9 #include "util/cgroup.h" 10 #include "util/hashmap.h" 11 #include "util/trace.h" 12 #include "util/util.h" 13 #include <bpf/bpf.h> 14 #include <linux/rbtree.h> 15 #include <linux/time64.h> 16 #include <tools/libc_compat.h> /* reallocarray */ 17 18 #include "bpf_skel/syscall_summary.h" 19 #include "bpf_skel/syscall_summary.skel.h" 20 21 22 static struct syscall_summary_bpf *skel; 23 static struct rb_root cgroups = RB_ROOT; 24 25 int trace_prepare_bpf_summary(enum trace_summary_mode mode) 26 { 27 skel = syscall_summary_bpf__open(); 28 if (skel == NULL) { 29 fprintf(stderr, "failed to open syscall summary bpf skeleton\n"); 30 return -1; 31 } 32 33 if (mode == SUMMARY__BY_THREAD) 34 skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD; 35 else if (mode == SUMMARY__BY_CGROUP) 36 skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP; 37 else 38 skel->rodata->aggr_mode = SYSCALL_AGGR_CPU; 39 40 if (cgroup_is_v2("perf_event") > 0) 41 skel->rodata->use_cgroup_v2 = 1; 42 43 if (syscall_summary_bpf__load(skel) < 0) { 44 fprintf(stderr, "failed to load syscall summary bpf skeleton\n"); 45 return -1; 46 } 47 48 if (syscall_summary_bpf__attach(skel) < 0) { 49 fprintf(stderr, "failed to attach syscall summary bpf skeleton\n"); 50 return -1; 51 } 52 53 if (mode == SUMMARY__BY_CGROUP) 54 read_all_cgroups(&cgroups); 55 56 return 0; 57 } 58 59 void trace_start_bpf_summary(void) 60 { 61 skel->bss->enabled = 1; 62 } 63 64 void trace_end_bpf_summary(void) 65 { 66 skel->bss->enabled = 0; 67 } 68 69 struct syscall_node { 70 int syscall_nr; 71 struct syscall_stats stats; 72 }; 73 74 static double rel_stddev(struct syscall_stats *stat) 75 { 76 double variance, average; 77 78 if (stat->count < 2) 79 return 0; 80 81 average = (double)stat->total_time / stat->count; 82 83 variance = stat->squared_sum; 84 variance -= (stat->total_time * stat->total_time) / stat->count; 85 variance /= stat->count - 1; 86 87 return 100 * sqrt(variance / stat->count) / average; 88 } 89 90 /* 91 * The syscall_data is to maintain syscall stats ordered by total time. 92 * It supports different summary modes like per-thread or global. 93 * 94 * For per-thread stats, it uses two-level data strurcture - 95 * syscall_data is keyed by TID and has an array of nodes which 96 * represents each syscall for the thread. 97 * 98 * For global stats, it's still two-level technically but we don't need 99 * per-cpu analysis so it's keyed by the syscall number to combine stats 100 * from different CPUs. And syscall_data always has a syscall_node so 101 * it can effectively work as flat hierarchy. 102 * 103 * For per-cgroup stats, it uses two-level data structure like thread 104 * syscall_data is keyed by CGROUP and has an array of node which 105 * represents each syscall for the cgroup. 106 */ 107 struct syscall_data { 108 u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */ 109 int nr_events; 110 int nr_nodes; 111 u64 total_time; 112 struct syscall_node *nodes; 113 }; 114 115 static int datacmp(const void *a, const void *b) 116 { 117 const struct syscall_data * const *sa = a; 118 const struct syscall_data * const *sb = b; 119 120 return (*sa)->total_time > (*sb)->total_time ? -1 : 1; 121 } 122 123 static int nodecmp(const void *a, const void *b) 124 { 125 const struct syscall_node *na = a; 126 const struct syscall_node *nb = b; 127 128 return na->stats.total_time > nb->stats.total_time ? -1 : 1; 129 } 130 131 static size_t sc_node_hash(long key, void *ctx __maybe_unused) 132 { 133 return key; 134 } 135 136 static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused) 137 { 138 return key1 == key2; 139 } 140 141 static int print_common_stats(struct syscall_data *data, int max_summary, FILE *fp) 142 { 143 int printed = 0; 144 145 if (max_summary == 0 || max_summary > data->nr_nodes) 146 max_summary = data->nr_nodes; 147 148 for (int i = 0; i < max_summary; i++) { 149 struct syscall_node *node = &data->nodes[i]; 150 struct syscall_stats *stat = &node->stats; 151 double total = (double)(stat->total_time) / NSEC_PER_MSEC; 152 double min = (double)(stat->min_time) / NSEC_PER_MSEC; 153 double max = (double)(stat->max_time) / NSEC_PER_MSEC; 154 double avg = total / stat->count; 155 const char *name; 156 157 /* TODO: support other ABIs */ 158 name = syscalltbl__name(EM_HOST, node->syscall_nr); 159 if (name) 160 printed += fprintf(fp, " %-15s", name); 161 else 162 printed += fprintf(fp, " syscall:%-7d", node->syscall_nr); 163 164 printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n", 165 stat->count, stat->error, total, min, avg, max, 166 rel_stddev(stat)); 167 } 168 return printed; 169 } 170 171 static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key, 172 struct syscall_stats *map_data) 173 { 174 struct syscall_data *data; 175 struct syscall_node *nodes; 176 177 if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) { 178 data = zalloc(sizeof(*data)); 179 if (data == NULL) 180 return -ENOMEM; 181 182 data->key = map_key->cpu_or_tid; 183 if (hashmap__add(hash, data->key, data) < 0) { 184 free(data); 185 return -ENOMEM; 186 } 187 } 188 189 /* update thread total stats */ 190 data->nr_events += map_data->count; 191 data->total_time += map_data->total_time; 192 193 nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes)); 194 if (nodes == NULL) 195 return -ENOMEM; 196 197 data->nodes = nodes; 198 nodes = &data->nodes[data->nr_nodes++]; 199 nodes->syscall_nr = map_key->nr; 200 201 /* each thread has an entry for each syscall, just use the stat */ 202 memcpy(&nodes->stats, map_data, sizeof(*map_data)); 203 return 0; 204 } 205 206 static int print_thread_stat(struct syscall_data *data, int max_summary, FILE *fp) 207 { 208 int printed = 0; 209 210 qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp); 211 212 printed += fprintf(fp, " thread (%d), ", (int)data->key); 213 printed += fprintf(fp, "%d events\n\n", data->nr_events); 214 215 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 216 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 217 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 218 219 printed += print_common_stats(data, max_summary, fp); 220 printed += fprintf(fp, "\n\n"); 221 222 return printed; 223 } 224 225 static int print_thread_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) 226 { 227 int printed = 0; 228 229 for (int i = 0; i < nr_data; i++) 230 printed += print_thread_stat(data[i], max_summary, fp); 231 232 return printed; 233 } 234 235 static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key, 236 struct syscall_stats *map_data) 237 { 238 struct syscall_data *data; 239 struct syscall_stats *stat; 240 241 if (!hashmap__find(hash, map_key->nr, &data)) { 242 data = zalloc(sizeof(*data)); 243 if (data == NULL) 244 return -ENOMEM; 245 246 data->nodes = zalloc(sizeof(*data->nodes)); 247 if (data->nodes == NULL) { 248 free(data); 249 return -ENOMEM; 250 } 251 252 data->nr_nodes = 1; 253 data->key = map_key->nr; 254 data->nodes->syscall_nr = data->key; 255 256 if (hashmap__add(hash, data->key, data) < 0) { 257 free(data->nodes); 258 free(data); 259 return -ENOMEM; 260 } 261 } 262 263 /* update total stats for this syscall */ 264 data->nr_events += map_data->count; 265 data->total_time += map_data->total_time; 266 267 /* This is sum of the same syscall from different CPUs */ 268 stat = &data->nodes->stats; 269 270 stat->total_time += map_data->total_time; 271 stat->squared_sum += map_data->squared_sum; 272 stat->count += map_data->count; 273 stat->error += map_data->error; 274 275 if (stat->max_time < map_data->max_time) 276 stat->max_time = map_data->max_time; 277 if (stat->min_time > map_data->min_time || stat->min_time == 0) 278 stat->min_time = map_data->min_time; 279 280 return 0; 281 } 282 283 static int print_total_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) 284 { 285 int printed = 0; 286 int nr_events = 0; 287 288 for (int i = 0; i < nr_data; i++) 289 nr_events += data[i]->nr_events; 290 291 printed += fprintf(fp, " total, %d events\n\n", nr_events); 292 293 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 294 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 295 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 296 297 if (max_summary == 0 || max_summary > nr_data) 298 max_summary = nr_data; 299 300 for (int i = 0; i < max_summary; i++) 301 printed += print_common_stats(data[i], max_summary, fp); 302 303 printed += fprintf(fp, "\n\n"); 304 return printed; 305 } 306 307 static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key, 308 struct syscall_stats *map_data) 309 { 310 struct syscall_data *data; 311 struct syscall_node *nodes; 312 313 if (!hashmap__find(hash, map_key->cgroup, &data)) { 314 data = zalloc(sizeof(*data)); 315 if (data == NULL) 316 return -ENOMEM; 317 318 data->key = map_key->cgroup; 319 if (hashmap__add(hash, data->key, data) < 0) { 320 free(data); 321 return -ENOMEM; 322 } 323 } 324 325 /* update thread total stats */ 326 data->nr_events += map_data->count; 327 data->total_time += map_data->total_time; 328 329 nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes)); 330 if (nodes == NULL) 331 return -ENOMEM; 332 333 data->nodes = nodes; 334 nodes = &data->nodes[data->nr_nodes++]; 335 nodes->syscall_nr = map_key->nr; 336 337 /* each thread has an entry for each syscall, just use the stat */ 338 memcpy(&nodes->stats, map_data, sizeof(*map_data)); 339 return 0; 340 } 341 342 static int print_cgroup_stat(struct syscall_data *data, int max_summary, FILE *fp) 343 { 344 int printed = 0; 345 struct cgroup *cgrp = __cgroup__find(&cgroups, data->key); 346 347 qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp); 348 349 if (cgrp) 350 printed += fprintf(fp, " cgroup %s,", cgrp->name); 351 else 352 printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key); 353 354 printed += fprintf(fp, " %d events\n\n", data->nr_events); 355 356 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 357 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 358 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 359 360 printed += print_common_stats(data, max_summary, fp); 361 printed += fprintf(fp, "\n\n"); 362 363 return printed; 364 } 365 366 static int print_cgroup_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) 367 { 368 int printed = 0; 369 370 for (int i = 0; i < nr_data; i++) 371 printed += print_cgroup_stat(data[i], max_summary, fp); 372 373 return printed; 374 } 375 376 int trace_print_bpf_summary(FILE *fp, int max_summary) 377 { 378 struct bpf_map *map = skel->maps.syscall_stats_map; 379 struct syscall_key *prev_key, key; 380 struct syscall_data **data = NULL; 381 struct hashmap schash; 382 struct hashmap_entry *entry; 383 int nr_data = 0; 384 int printed = 0; 385 int i; 386 size_t bkt; 387 388 hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL); 389 390 printed = fprintf(fp, "\n Summary of events:\n\n"); 391 392 /* get stats from the bpf map */ 393 prev_key = NULL; 394 while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) { 395 struct syscall_stats stat; 396 397 if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) { 398 switch (skel->rodata->aggr_mode) { 399 case SYSCALL_AGGR_THREAD: 400 update_thread_stats(&schash, &key, &stat); 401 break; 402 case SYSCALL_AGGR_CPU: 403 update_total_stats(&schash, &key, &stat); 404 break; 405 case SYSCALL_AGGR_CGROUP: 406 update_cgroup_stats(&schash, &key, &stat); 407 break; 408 default: 409 break; 410 } 411 } 412 413 prev_key = &key; 414 } 415 416 nr_data = hashmap__size(&schash); 417 data = calloc(nr_data, sizeof(*data)); 418 if (data == NULL) 419 goto out; 420 421 i = 0; 422 hashmap__for_each_entry(&schash, entry, bkt) 423 data[i++] = entry->pvalue; 424 425 qsort(data, nr_data, sizeof(*data), datacmp); 426 427 switch (skel->rodata->aggr_mode) { 428 case SYSCALL_AGGR_THREAD: 429 printed += print_thread_stats(data, nr_data, max_summary, fp); 430 break; 431 case SYSCALL_AGGR_CPU: 432 printed += print_total_stats(data, nr_data, max_summary, fp); 433 break; 434 case SYSCALL_AGGR_CGROUP: 435 printed += print_cgroup_stats(data, nr_data, max_summary, fp); 436 break; 437 default: 438 break; 439 } 440 441 for (i = 0; i < nr_data && data; i++) { 442 free(data[i]->nodes); 443 free(data[i]); 444 } 445 free(data); 446 447 out: 448 hashmap__clear(&schash); 449 return printed; 450 } 451 452 void trace_cleanup_bpf_summary(void) 453 { 454 if (!RB_EMPTY_ROOT(&cgroups)) { 455 struct cgroup *cgrp, *tmp; 456 457 rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node) 458 cgroup__put(cgrp); 459 460 cgroups = RB_ROOT; 461 } 462 463 syscall_summary_bpf__destroy(skel); 464 } 465