1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #include <inttypes.h> 3 #include <math.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 7 #include "dwarf-regs.h" /* for EM_HOST */ 8 #include "syscalltbl.h" 9 #include "util/cgroup.h" 10 #include "util/hashmap.h" 11 #include "util/trace.h" 12 #include "util/util.h" 13 #include <bpf/bpf.h> 14 #include <linux/rbtree.h> 15 #include <linux/time64.h> 16 #include <tools/libc_compat.h> /* reallocarray */ 17 18 #include "bpf_skel/syscall_summary.h" 19 #include "bpf_skel/syscall_summary.skel.h" 20 21 22 static struct syscall_summary_bpf *skel; 23 static struct rb_root cgroups = RB_ROOT; 24 25 int trace_prepare_bpf_summary(enum trace_summary_mode mode) 26 { 27 skel = syscall_summary_bpf__open(); 28 if (skel == NULL) { 29 fprintf(stderr, "failed to open syscall summary bpf skeleton\n"); 30 return -1; 31 } 32 33 if (mode == SUMMARY__BY_THREAD) 34 skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD; 35 else if (mode == SUMMARY__BY_CGROUP) 36 skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP; 37 else 38 skel->rodata->aggr_mode = SYSCALL_AGGR_CPU; 39 40 if (cgroup_is_v2("perf_event") > 0) 41 skel->rodata->use_cgroup_v2 = 1; 42 43 if (syscall_summary_bpf__load(skel) < 0) { 44 fprintf(stderr, "failed to load syscall summary bpf skeleton\n"); 45 return -1; 46 } 47 48 if (syscall_summary_bpf__attach(skel) < 0) { 49 fprintf(stderr, "failed to attach syscall summary bpf skeleton\n"); 50 return -1; 51 } 52 53 if (mode == SUMMARY__BY_CGROUP) 54 read_all_cgroups(&cgroups); 55 56 return 0; 57 } 58 59 void trace_start_bpf_summary(void) 60 { 61 skel->bss->enabled = 1; 62 } 63 64 void trace_end_bpf_summary(void) 65 { 66 skel->bss->enabled = 0; 67 } 68 69 struct syscall_node { 70 int syscall_nr; 71 struct syscall_stats stats; 72 }; 73 74 static double rel_stddev(struct syscall_stats *stat) 75 { 76 double variance, average; 77 78 if (stat->count < 2) 79 return 0; 80 81 average = (double)stat->total_time / stat->count; 82 83 variance = stat->squared_sum; 84 variance -= (stat->total_time * stat->total_time) / stat->count; 85 variance /= stat->count - 1; 86 87 return 100 * sqrt(variance / stat->count) / average; 88 } 89 90 /* 91 * The syscall_data is to maintain syscall stats ordered by total time. 92 * It supports different summary modes like per-thread or global. 93 * 94 * For per-thread stats, it uses two-level data strurcture - 95 * syscall_data is keyed by TID and has an array of nodes which 96 * represents each syscall for the thread. 97 * 98 * For global stats, it's still two-level technically but we don't need 99 * per-cpu analysis so it's keyed by the syscall number to combine stats 100 * from different CPUs. And syscall_data always has a syscall_node so 101 * it can effectively work as flat hierarchy. 102 * 103 * For per-cgroup stats, it uses two-level data structure like thread 104 * syscall_data is keyed by CGROUP and has an array of node which 105 * represents each syscall for the cgroup. 106 */ 107 struct syscall_data { 108 u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */ 109 int nr_events; 110 int nr_nodes; 111 u64 total_time; 112 struct syscall_node *nodes; 113 }; 114 115 static int datacmp(const void *a, const void *b) 116 { 117 const struct syscall_data * const *sa = a; 118 const struct syscall_data * const *sb = b; 119 120 return (*sa)->total_time > (*sb)->total_time ? -1 : 1; 121 } 122 123 static int nodecmp(const void *a, const void *b) 124 { 125 const struct syscall_node *na = a; 126 const struct syscall_node *nb = b; 127 128 return na->stats.total_time > nb->stats.total_time ? -1 : 1; 129 } 130 131 static size_t sc_node_hash(long key, void *ctx __maybe_unused) 132 { 133 return key; 134 } 135 136 static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused) 137 { 138 return key1 == key2; 139 } 140 141 static int print_common_stats(struct syscall_data *data, FILE *fp) 142 { 143 int printed = 0; 144 145 for (int i = 0; i < data->nr_nodes; i++) { 146 struct syscall_node *node = &data->nodes[i]; 147 struct syscall_stats *stat = &node->stats; 148 double total = (double)(stat->total_time) / NSEC_PER_MSEC; 149 double min = (double)(stat->min_time) / NSEC_PER_MSEC; 150 double max = (double)(stat->max_time) / NSEC_PER_MSEC; 151 double avg = total / stat->count; 152 const char *name; 153 154 /* TODO: support other ABIs */ 155 name = syscalltbl__name(EM_HOST, node->syscall_nr); 156 if (name) 157 printed += fprintf(fp, " %-15s", name); 158 else 159 printed += fprintf(fp, " syscall:%-7d", node->syscall_nr); 160 161 printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n", 162 stat->count, stat->error, total, min, avg, max, 163 rel_stddev(stat)); 164 } 165 return printed; 166 } 167 168 static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key, 169 struct syscall_stats *map_data) 170 { 171 struct syscall_data *data; 172 struct syscall_node *nodes; 173 174 if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) { 175 data = zalloc(sizeof(*data)); 176 if (data == NULL) 177 return -ENOMEM; 178 179 data->key = map_key->cpu_or_tid; 180 if (hashmap__add(hash, data->key, data) < 0) { 181 free(data); 182 return -ENOMEM; 183 } 184 } 185 186 /* update thread total stats */ 187 data->nr_events += map_data->count; 188 data->total_time += map_data->total_time; 189 190 nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes)); 191 if (nodes == NULL) 192 return -ENOMEM; 193 194 data->nodes = nodes; 195 nodes = &data->nodes[data->nr_nodes++]; 196 nodes->syscall_nr = map_key->nr; 197 198 /* each thread has an entry for each syscall, just use the stat */ 199 memcpy(&nodes->stats, map_data, sizeof(*map_data)); 200 return 0; 201 } 202 203 static int print_thread_stat(struct syscall_data *data, FILE *fp) 204 { 205 int printed = 0; 206 207 qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp); 208 209 printed += fprintf(fp, " thread (%d), ", (int)data->key); 210 printed += fprintf(fp, "%d events\n\n", data->nr_events); 211 212 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 213 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 214 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 215 216 printed += print_common_stats(data, fp); 217 printed += fprintf(fp, "\n\n"); 218 219 return printed; 220 } 221 222 static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp) 223 { 224 int printed = 0; 225 226 for (int i = 0; i < nr_data; i++) 227 printed += print_thread_stat(data[i], fp); 228 229 return printed; 230 } 231 232 static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key, 233 struct syscall_stats *map_data) 234 { 235 struct syscall_data *data; 236 struct syscall_stats *stat; 237 238 if (!hashmap__find(hash, map_key->nr, &data)) { 239 data = zalloc(sizeof(*data)); 240 if (data == NULL) 241 return -ENOMEM; 242 243 data->nodes = zalloc(sizeof(*data->nodes)); 244 if (data->nodes == NULL) { 245 free(data); 246 return -ENOMEM; 247 } 248 249 data->nr_nodes = 1; 250 data->key = map_key->nr; 251 data->nodes->syscall_nr = data->key; 252 253 if (hashmap__add(hash, data->key, data) < 0) { 254 free(data->nodes); 255 free(data); 256 return -ENOMEM; 257 } 258 } 259 260 /* update total stats for this syscall */ 261 data->nr_events += map_data->count; 262 data->total_time += map_data->total_time; 263 264 /* This is sum of the same syscall from different CPUs */ 265 stat = &data->nodes->stats; 266 267 stat->total_time += map_data->total_time; 268 stat->squared_sum += map_data->squared_sum; 269 stat->count += map_data->count; 270 stat->error += map_data->error; 271 272 if (stat->max_time < map_data->max_time) 273 stat->max_time = map_data->max_time; 274 if (stat->min_time > map_data->min_time || stat->min_time == 0) 275 stat->min_time = map_data->min_time; 276 277 return 0; 278 } 279 280 static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp) 281 { 282 int printed = 0; 283 int nr_events = 0; 284 285 for (int i = 0; i < nr_data; i++) 286 nr_events += data[i]->nr_events; 287 288 printed += fprintf(fp, " total, %d events\n\n", nr_events); 289 290 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 291 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 292 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 293 294 for (int i = 0; i < nr_data; i++) 295 printed += print_common_stats(data[i], fp); 296 297 printed += fprintf(fp, "\n\n"); 298 return printed; 299 } 300 301 static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key, 302 struct syscall_stats *map_data) 303 { 304 struct syscall_data *data; 305 struct syscall_node *nodes; 306 307 if (!hashmap__find(hash, map_key->cgroup, &data)) { 308 data = zalloc(sizeof(*data)); 309 if (data == NULL) 310 return -ENOMEM; 311 312 data->key = map_key->cgroup; 313 if (hashmap__add(hash, data->key, data) < 0) { 314 free(data); 315 return -ENOMEM; 316 } 317 } 318 319 /* update thread total stats */ 320 data->nr_events += map_data->count; 321 data->total_time += map_data->total_time; 322 323 nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes)); 324 if (nodes == NULL) 325 return -ENOMEM; 326 327 data->nodes = nodes; 328 nodes = &data->nodes[data->nr_nodes++]; 329 nodes->syscall_nr = map_key->nr; 330 331 /* each thread has an entry for each syscall, just use the stat */ 332 memcpy(&nodes->stats, map_data, sizeof(*map_data)); 333 return 0; 334 } 335 336 static int print_cgroup_stat(struct syscall_data *data, FILE *fp) 337 { 338 int printed = 0; 339 struct cgroup *cgrp = __cgroup__find(&cgroups, data->key); 340 341 qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp); 342 343 if (cgrp) 344 printed += fprintf(fp, " cgroup %s,", cgrp->name); 345 else 346 printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key); 347 348 printed += fprintf(fp, " %d events\n\n", data->nr_events); 349 350 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n"); 351 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 352 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); 353 354 printed += print_common_stats(data, fp); 355 printed += fprintf(fp, "\n\n"); 356 357 return printed; 358 } 359 360 static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp) 361 { 362 int printed = 0; 363 364 for (int i = 0; i < nr_data; i++) 365 printed += print_cgroup_stat(data[i], fp); 366 367 return printed; 368 } 369 370 int trace_print_bpf_summary(FILE *fp) 371 { 372 struct bpf_map *map = skel->maps.syscall_stats_map; 373 struct syscall_key *prev_key, key; 374 struct syscall_data **data = NULL; 375 struct hashmap schash; 376 struct hashmap_entry *entry; 377 int nr_data = 0; 378 int printed = 0; 379 int i; 380 size_t bkt; 381 382 hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL); 383 384 printed = fprintf(fp, "\n Summary of events:\n\n"); 385 386 /* get stats from the bpf map */ 387 prev_key = NULL; 388 while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) { 389 struct syscall_stats stat; 390 391 if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) { 392 switch (skel->rodata->aggr_mode) { 393 case SYSCALL_AGGR_THREAD: 394 update_thread_stats(&schash, &key, &stat); 395 break; 396 case SYSCALL_AGGR_CPU: 397 update_total_stats(&schash, &key, &stat); 398 break; 399 case SYSCALL_AGGR_CGROUP: 400 update_cgroup_stats(&schash, &key, &stat); 401 break; 402 default: 403 break; 404 } 405 } 406 407 prev_key = &key; 408 } 409 410 nr_data = hashmap__size(&schash); 411 data = calloc(nr_data, sizeof(*data)); 412 if (data == NULL) 413 goto out; 414 415 i = 0; 416 hashmap__for_each_entry(&schash, entry, bkt) 417 data[i++] = entry->pvalue; 418 419 qsort(data, nr_data, sizeof(*data), datacmp); 420 421 switch (skel->rodata->aggr_mode) { 422 case SYSCALL_AGGR_THREAD: 423 printed += print_thread_stats(data, nr_data, fp); 424 break; 425 case SYSCALL_AGGR_CPU: 426 printed += print_total_stats(data, nr_data, fp); 427 break; 428 case SYSCALL_AGGR_CGROUP: 429 printed += print_cgroup_stats(data, nr_data, fp); 430 break; 431 default: 432 break; 433 } 434 435 for (i = 0; i < nr_data && data; i++) { 436 free(data[i]->nodes); 437 free(data[i]); 438 } 439 free(data); 440 441 out: 442 hashmap__clear(&schash); 443 return printed; 444 } 445 446 void trace_cleanup_bpf_summary(void) 447 { 448 if (!RB_EMPTY_ROOT(&cgroups)) { 449 struct cgroup *cgrp, *tmp; 450 451 rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node) 452 cgroup__put(cgrp); 453 454 cgroups = RB_ROOT; 455 } 456 457 syscall_summary_bpf__destroy(skel); 458 } 459