/*
 * builtin-stat.c
 *
 * Builtin stat command: Give a precise performance counters summary
 * overview about any workload, CPU or specific PID.
 *
 * Sample output:

   $ perf stat ./hackbench 10

  Time: 0.118

  Performance counter stats for './hackbench 10':

       1708.761321 task-clock                #   11.037 CPUs utilized
            41,190 context-switches          #    0.024 M/sec
             6,735 CPU-migrations            #    0.004 M/sec
            17,318 page-faults               #    0.010 M/sec
     5,205,202,243 cycles                    #    3.046 GHz
     3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
     1,600,790,871 stalled-cycles-backend    #   30.75% backend cycles idle
     2,603,501,247 instructions              #    0.50  insns per cycle
                                             #    1.48  stalled cycles per insn
       484,357,498 branches                  #  283.455 M/sec
         6,388,934 branch-misses             #    1.32% of all branches

        0.154822978 seconds time elapsed

 *
 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *   Jaswinder Singh Rajput <jaswinder@kernel.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include "perf.h"
#include "builtin.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/color.h"
#include "util/header.h"
#include "util/cpumap.h"
#include "util/thread.h"
#include "util/thread_map.h"

#include <sys/prctl.h>
#include <math.h>
#include <locale.h>

#define DEFAULT_SEPARATOR	" "

static struct perf_event_attr default_attrs[] = {

  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },

  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },

};
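
/*
 * PERF_TYPE_HW_CACHE events encode three fields into attr.config,
 * per the perf_hw_cache_* enums of the perf_event ABI:
 *
 *	config = (cache id) | (op id << 8) | (result id << 16)
 *
 * e.g. L1D read misses are:
 *
 *	 PERF_COUNT_HW_CACHE_L1D	<<  0  |
 *	(PERF_COUNT_HW_CACHE_OP_READ	<<  8) |
 *	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
 *
 * All of the -d/-d -d/-d -d -d tables below are built this way.
 */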

/*
 * Detailed stats (-d), covering the L1 and last level data caches:
 */
static struct perf_event_attr detailed_attrs[] = {

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_LL << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_LL << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
};

/*
 * Very detailed stats (-d -d), covering the instruction cache and the
 * TLB caches:
 */
static struct perf_event_attr very_detailed_attrs[] = {

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1I << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1I << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_DTLB << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_DTLB << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_ITLB << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_ITLB << 0 |
	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

};

/*
 * Very, very detailed stats (-d -d -d), adding prefetch events:
 */
static struct perf_event_attr very_very_detailed_attrs[] = {

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D << 0 |
	(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D << 0 |
	(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
};

struct perf_evlist *evsel_list;

static bool system_wide = false;
static int run_idx = 0;

static int run_count = 1;
static bool no_inherit = false;
static bool scale = true;
static bool no_aggr = false;
static pid_t target_pid = -1;
static pid_t target_tid = -1;
static pid_t child_pid = -1;
static bool null_run = false;
static int detailed_run = 0;
static bool sync_run = false;
static bool big_num = true;
static int big_num_opt = -1;
static const char *cpu_list;
static const char *csv_sep = NULL;
static bool csv_output = false;

static volatile int done = 0;

struct stats {
	double n, mean, M2;
};

struct perf_stat {
	struct stats res_stats[3];
};

static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
{
	evsel->priv = zalloc(sizeof(struct perf_stat));
	return evsel->priv == NULL ? -ENOMEM : 0;
}

static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
{
	free(evsel->priv);
	evsel->priv = NULL;
}
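
/*
 * Online mean/variance accumulation in the style of Welford's
 * algorithm: each call folds one new sample into the running mean and
 * into M2, the running sum of squared differences from the mean, in a
 * single pass and without storing the samples.  stddev_stats() below
 * turns M2 into the standard deviation of the mean across repeated
 * runs (-r).
 */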

static void update_stats(struct stats *stats, u64 val)
{
	double delta;

	stats->n++;
	delta = val - stats->mean;
	stats->mean += delta / stats->n;
	stats->M2 += delta*(val - stats->mean);
}

static double avg_stats(struct stats *stats)
{
	return stats->mean;
}

/*
 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 *
 *       (\Sum n_i^2) - ((\Sum n_i)^2)/n
 * s^2 = -------------------------------
 *                  n - 1
 *
 * http://en.wikipedia.org/wiki/Stddev
 *
 * The std dev of the mean is related to the std dev by:
 *
 *             s
 * s_mean = -------
 *          sqrt(n)
 *
 */
static double stddev_stats(struct stats *stats)
{
	double variance = stats->M2 / (stats->n - 1);
	double variance_mean = variance / stats->n;

	return sqrt(variance_mean);
}

struct stats runtime_nsecs_stats[MAX_NR_CPUS];
struct stats runtime_cycles_stats[MAX_NR_CPUS];
struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
struct stats runtime_branches_stats[MAX_NR_CPUS];
struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
struct stats walltime_nsecs_stats;
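
/*
 * When a workload is forked (perf stat <cmd>), the counters are created
 * disabled with enable_on_exec set, so counting starts exactly when the
 * child execs and perf's own setup does not get measured.  With
 * -p/-t/-a we attach to already-running contexts instead, so the
 * counters start counting as soon as they are opened.
 */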

static int create_perf_stat_counter(struct perf_evsel *evsel)
{
	struct perf_event_attr *attr = &evsel->attr;

	if (scale)
		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				    PERF_FORMAT_TOTAL_TIME_RUNNING;

	attr->inherit = !no_inherit;

	if (system_wide)
		return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false);

	if (target_pid == -1 && target_tid == -1) {
		attr->disabled = 1;
		attr->enable_on_exec = 1;
	}

	return perf_evsel__open_per_thread(evsel, evsel_list->threads, false);
}

/*
 * Does the counter have nsecs as a unit?
 */
static inline int nsec_counter(struct perf_evsel *evsel)
{
	if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
	    perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		return 1;

	return 0;
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
{
	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
		update_stats(&runtime_nsecs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[0], count[0]);
}

/*
 * Read out the results of a single counter:
 * aggregate counts across CPUs in system-wide mode
 */
static int read_counter_aggr(struct perf_evsel *counter)
{
	struct perf_stat *ps = counter->priv;
	u64 *count = counter->counts->aggr.values;
	int i;

	if (__perf_evsel__read(counter, evsel_list->cpus->nr,
			       evsel_list->threads->nr, scale) < 0)
		return -1;

	for (i = 0; i < 3; i++)
		update_stats(&ps->res_stats[i], count[i]);

	if (verbose) {
		fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
			event_name(counter), count[0], count[1], count[2]);
	}

	/*
	 * Save the full runtime - to allow normalization during printout:
	 */
	update_shadow_stats(counter, count);

	return 0;
}

/*
 * Read out the results of a single counter:
 * do not aggregate counts across CPUs in system-wide mode
 */
static int read_counter(struct perf_evsel *counter)
{
	u64 *count;
	int cpu;

	for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
		if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
			return -1;

		count = counter->counts->cpu[cpu].values;

		update_shadow_stats(counter, count);
	}

	return 0;
}
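
/*
 * Forked-workload synchronization: the child signals readiness by
 * closing its end of child_ready_pipe (the parent's read() returns),
 * then blocks on go_pipe until the parent has opened all the counters
 * and closes the write end, which releases the child into the real
 * execvp().
 */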

static int run_perf_stat(int argc __used, const char **argv)
{
	unsigned long long t0, t1;
	struct perf_evsel *counter;
	int status = 0;
	int child_ready_pipe[2], go_pipe[2];
	const bool forks = (argc > 0);
	char buf;

	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
		perror("failed to create pipes");
		exit(1);
	}

	if (forks) {
		if ((child_pid = fork()) < 0)
			perror("failed to fork");

		if (!child_pid) {
			close(child_ready_pipe[0]);
			close(go_pipe[1]);
			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);

			/*
			 * Do a dummy execvp to get the PLT entry resolved,
			 * so we avoid the resolver overhead on the real
			 * execvp call.
			 */
			execvp("", (char **)argv);

			/*
			 * Tell the parent we're ready to go
			 */
			close(child_ready_pipe[1]);

			/*
			 * Wait until the parent tells us to go.
			 */
			if (read(go_pipe[0], &buf, 1) == -1)
				perror("unable to read pipe");

			execvp(argv[0], (char **)argv);

			perror(argv[0]);
			exit(-1);
		}

		if (target_tid == -1 && target_pid == -1 && !system_wide)
			evsel_list->threads->map[0] = child_pid;

		/*
		 * Wait for the child to be ready to exec.
		 */
		close(child_ready_pipe[1]);
		close(go_pipe[0]);
		if (read(child_ready_pipe[0], &buf, 1) == -1)
			perror("unable to read pipe");
		close(child_ready_pipe[0]);
	}

	list_for_each_entry(counter, &evsel_list->entries, node) {
		if (create_perf_stat_counter(counter) < 0) {
			if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) {
				if (verbose)
					ui__warning("%s event is not supported by the kernel.\n",
						    event_name(counter));
				continue;
			}

			if (errno == EPERM || errno == EACCES) {
				error("You may not have permission to collect %sstats.\n"
				      "\t Consider tweaking"
				      " /proc/sys/kernel/perf_event_paranoid or running as root.",
				      system_wide ? "system-wide " : "");
			} else {
				error("open_counter returned with %d (%s). "
				      "/bin/dmesg may provide additional information.\n",
				      errno, strerror(errno));
			}
			if (child_pid != -1)
				kill(child_pid, SIGTERM);
			die("Not all events could be opened.\n");
			return -1;
		}
	}

	if (perf_evlist__set_filters(evsel_list)) {
		error("failed to set filter with %d (%s)\n", errno,
		      strerror(errno));
		return -1;
	}

	/*
	 * Enable counters and exec the command:
	 */
	t0 = rdclock();

	if (forks) {
		close(go_pipe[1]);
		wait(&status);
	} else {
		while (!done)
			sleep(1);
	}

	t1 = rdclock();

	update_stats(&walltime_nsecs_stats, t1 - t0);

	if (no_aggr) {
		list_for_each_entry(counter, &evsel_list->entries, node) {
			read_counter(counter);
			perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
		}
	} else {
		list_for_each_entry(counter, &evsel_list->entries, node) {
			read_counter_aggr(counter);
			perf_evsel__close_fd(counter, evsel_list->cpus->nr,
					     evsel_list->threads->nr);
		}
	}

	return WEXITSTATUS(status);
}
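
/*
 * "Noise" is the run-to-run variation when -r/--repeat is used: the
 * standard deviation of the mean, printed as a percentage of the
 * average, e.g. "( +-  0.13% )".  A single run has no noise to report.
 */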

static void print_noise_pct(double total, double avg)
{
	double pct = 0.0;

	if (avg)
		pct = 100.0*total/avg;

	fprintf(stderr, " ( +-%6.2f%% )", pct);
}

static void print_noise(struct perf_evsel *evsel, double avg)
{
	struct perf_stat *ps;

	if (run_count == 1)
		return;

	ps = evsel->priv;
	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
}

static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
{
	double msecs = avg / 1e6;
	char cpustr[16] = { '\0', };
	const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";

	if (no_aggr)
		sprintf(cpustr, "CPU%*d%s",
			csv_output ? 0 : -4,
			evsel_list->cpus->map[cpu], csv_sep);

	fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));

	if (evsel->cgrp)
		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);

	if (csv_output)
		return;

	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		fprintf(stderr, " # %8.3f CPUs utilized ",
			avg / avg_stats(&walltime_nsecs_stats));
}
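
/*
 * The print_stalled_cycles_*() and print_*_misses() helpers below all
 * follow the same pattern: express the count as a percentage of the
 * matching "shadow" total gathered in update_shadow_stats(), and
 * color-code the percentage (yellow, magenta, red) as it crosses
 * increasingly bad thresholds.
 */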

static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 50.0)
		color = PERF_COLOR_RED;
	else if (ratio > 30.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 10.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " frontend cycles idle ");
}

static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 75.0)
		color = PERF_COLOR_RED;
	else if (ratio > 50.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 20.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " backend cycles idle ");
}

static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_branches_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 20.0)
		color = PERF_COLOR_RED;
	else if (ratio > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 5.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " of all branches ");
}

static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_dcache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 20.0)
		color = PERF_COLOR_RED;
	else if (ratio > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 5.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " of all L1-dcache hits ");
}

static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_icache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 20.0)
		color = PERF_COLOR_RED;
	else if (ratio > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 5.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " of all L1-icache hits ");
}

static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 20.0)
		color = PERF_COLOR_RED;
	else if (ratio > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 5.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " of all dTLB cache hits ");
}

static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_itlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 20.0)
		color = PERF_COLOR_RED;
	else if (ratio > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 5.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " of all iTLB cache hits ");
}

static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_ll_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = PERF_COLOR_NORMAL;
	if (ratio > 20.0)
		color = PERF_COLOR_RED;
	else if (ratio > 10.0)
		color = PERF_COLOR_MAGENTA;
	else if (ratio > 5.0)
		color = PERF_COLOR_YELLOW;

	fprintf(stderr, " # ");
	color_fprintf(stderr, color, "%6.2f%%", ratio);
	fprintf(stderr, " of all LL-cache hits ");
}
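
/*
 * abs_printout() prints the raw count, then picks one derived metric
 * to print after the '#' separator based on the event type: insns per
 * cycle (plus stalled cycles per insn), the miss ratios above, GHz for
 * cycles, or a generic M/sec rate when a task-clock baseline is
 * available.
 */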

static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
{
	double total, ratio = 0.0;
	char cpustr[16] = { '\0', };
	const char *fmt;

	if (csv_output)
		fmt = "%s%.0f%s%s";
	else if (big_num)
		fmt = "%s%'18.0f%s%-25s";
	else
		fmt = "%s%18.0f%s%-25s";

	if (no_aggr)
		sprintf(cpustr, "CPU%*d%s",
			csv_output ? 0 : -4,
			evsel_list->cpus->map[cpu], csv_sep);
	else
		cpu = 0;

	fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));

	if (evsel->cgrp)
		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);

	if (csv_output)
		return;

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[cpu]);

		if (total)
			ratio = avg / total;

		fprintf(stderr, " # %5.2f insns per cycle ", ratio);

		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));

		if (total && avg) {
			ratio = total / avg;
			fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio);
		}

	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
			runtime_branches_stats[cpu].n != 0) {
		print_branch_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_l1_dcache_stats[cpu].n != 0) {
		print_l1_dcache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_l1_icache_stats[cpu].n != 0) {
		print_l1_icache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_dtlb_cache_stats[cpu].n != 0) {
		print_dtlb_cache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_itlb_cache_stats[cpu].n != 0) {
		print_itlb_cache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_ll_cache_stats[cpu].n != 0) {
		print_ll_cache_misses(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
			runtime_cacherefs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cacherefs_stats[cpu]);

		if (total)
			ratio = avg * 100 / total;

		fprintf(stderr, " # %8.3f %% of all cache refs ", ratio);

	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1.0 * avg / total;

		fprintf(stderr, " # %8.3f GHz ", ratio);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;

		fprintf(stderr, " # %8.3f M/sec ", ratio);
	} else {
		fprintf(stderr, " ");
	}
}
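
/*
 * Counter scaling and multiplexing: with -c/--scale (on by default),
 * the __perf_evsel__read*() helpers normalize counts by the counter's
 * time_enabled/time_running ratio.  The print routines below make this
 * visible: a trailing "[xx.xx%]" (or "(xx.xx%)" per CPU) shows how much
 * of the interval the counter was actually on the PMU, and
 * "<not counted>" is printed when it never ran at all.
 */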

/*
 * Print out the results of a single counter:
 * aggregated counts in system-wide mode
 */
static void print_counter_aggr(struct perf_evsel *counter)
{
	struct perf_stat *ps = counter->priv;
	double avg = avg_stats(&ps->res_stats[0]);
	int scaled = counter->counts->scaled;

	if (scaled == -1) {
		fprintf(stderr, "%*s%s%*s",
			csv_output ? 0 : 18,
			"<not counted>",
			csv_sep,
			csv_output ? 0 : -24,
			event_name(counter));

		if (counter->cgrp)
			fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);

		fputc('\n', stderr);
		return;
	}

	if (nsec_counter(counter))
		nsec_printout(-1, counter, avg);
	else
		abs_printout(-1, counter, avg);

	if (csv_output) {
		fputc('\n', stderr);
		return;
	}

	print_noise(counter, avg);

	if (scaled) {
		double avg_enabled, avg_running;

		avg_enabled = avg_stats(&ps->res_stats[1]);
		avg_running = avg_stats(&ps->res_stats[2]);

		fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
	}
	fprintf(stderr, "\n");
}

/*
 * Print out the results of a single counter:
 * does not use aggregated count in system-wide mode
 */
static void print_counter(struct perf_evsel *counter)
{
	u64 ena, run, val;
	int cpu;

	for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
		val = counter->counts->cpu[cpu].val;
		ena = counter->counts->cpu[cpu].ena;
		run = counter->counts->cpu[cpu].run;
		if (run == 0 || ena == 0) {
			fprintf(stderr, "CPU%*d%s%*s%s%*s",
				csv_output ? 0 : -4,
				evsel_list->cpus->map[cpu], csv_sep,
				csv_output ? 0 : 18,
				"<not counted>", csv_sep,
				csv_output ? 0 : -24,
				event_name(counter));

			if (counter->cgrp)
				fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);

			fputc('\n', stderr);
			continue;
		}

		if (nsec_counter(counter))
			nsec_printout(cpu, counter, val);
		else
			abs_printout(cpu, counter, val);

		if (!csv_output) {
			print_noise(counter, 1.0);

			if (run != ena)
				fprintf(stderr, " (%.2f%%)", 100.0 * run / ena);
		}
		fputc('\n', stderr);
	}
}

static void print_stat(int argc, const char **argv)
{
	struct perf_evsel *counter;
	int i;

	fflush(stdout);

	if (!csv_output) {
		fprintf(stderr, "\n");
		fprintf(stderr, " Performance counter stats for ");
		if (target_pid == -1 && target_tid == -1) {
			fprintf(stderr, "'%s", argv[0]);
			for (i = 1; i < argc; i++)
				fprintf(stderr, " %s", argv[i]);
		} else if (target_pid != -1)
			fprintf(stderr, "process id '%d", target_pid);
		else
			fprintf(stderr, "thread id '%d", target_tid);

		fprintf(stderr, "'");
		if (run_count > 1)
			fprintf(stderr, " (%d runs)", run_count);
		fprintf(stderr, ":\n\n");
	}

	if (no_aggr) {
		list_for_each_entry(counter, &evsel_list->entries, node)
			print_counter(counter);
	} else {
		list_for_each_entry(counter, &evsel_list->entries, node)
			print_counter_aggr(counter);
	}

	if (!csv_output) {
		if (!null_run)
			fprintf(stderr, "\n");
		fprintf(stderr, " %17.9f seconds time elapsed",
			avg_stats(&walltime_nsecs_stats)/1e9);
		if (run_count > 1) {
			fprintf(stderr, " ");
			print_noise_pct(stddev_stats(&walltime_nsecs_stats),
					avg_stats(&walltime_nsecs_stats));
		}
		fprintf(stderr, "\n\n");
	}
}
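
/*
 * Signal handling: perf stat itself mostly ignores SIGINT and friends
 * so that Ctrl-C reaches the forked workload; in attach mode (no
 * child) the handler just ends the counting loop.  The atexit hook
 * kills a still-running child and then re-raises the original signal
 * with the default action, so the shell sees the right exit status.
 */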

static volatile int signr = -1;

static void skip_signal(int signo)
{
	if (child_pid == -1)
		done = 1;

	signr = signo;
}

static void sig_atexit(void)
{
	if (child_pid != -1)
		kill(child_pid, SIGTERM);

	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static const char * const stat_usage[] = {
	"perf stat [<options>] [<command>]",
	NULL
};

static int stat__set_big_num(const struct option *opt __used,
			     const char *s __used, int unset)
{
	big_num_opt = unset ? 0 : 1;
	return 0;
}

static const struct option options[] = {
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_INTEGER('p', "pid", &target_pid,
		    "stat events on existing process id"),
	OPT_INTEGER('t', "tid", &target_tid,
		    "stat events on existing thread id"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('c', "scale", &scale,
		    "scale/normalize counters"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_INTEGER('r', "repeat", &run_count,
		    "repeat command and print average + stddev (max: 100)"),
	OPT_BOOLEAN('n', "null", &null_run,
		    "null run - don't start any counters"),
	OPT_INCR('d', "detailed", &detailed_run,
		 "detailed run - start a lot of events"),
	OPT_BOOLEAN('S', "sync", &sync_run,
		    "call sync() before starting a run"),
	OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
			   "print large numbers with thousands' separators",
			   stat__set_big_num),
	OPT_STRING('C', "cpu", &cpu_list, "cpu",
		   "list of cpus to monitor in system-wide mode"),
	OPT_BOOLEAN('A', "no-aggr", &no_aggr,
		    "disable CPU count aggregation"),
	OPT_STRING('x', "field-separator", &csv_sep, "separator",
		   "print counts with custom separator"),
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_END()
};
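
/*
 * Event-list growth by detail level: the 10 default software/hardware
 * events, plus 4 data-cache events for -d, plus 6 instruction-cache
 * and TLB events for -d -d, plus 2 prefetch events for -d -d -d (see
 * the attr tables at the top of this file).
 */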

/*
 * Add default attributes, if there were no attributes specified or
 * if -d/--detailed, -d -d or -d -d -d is used:
 */
static int add_default_attributes(void)
{
	struct perf_evsel *pos;
	size_t attr_nr = 0;
	size_t c;

	/* Set attrs if no event is selected and !null_run: */
	if (null_run)
		return 0;

	if (!evsel_list->nr_entries) {
		for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
			pos = perf_evsel__new(default_attrs + c, c + attr_nr);
			if (pos == NULL)
				return -1;
			perf_evlist__add(evsel_list, pos);
		}
		attr_nr += c;
	}

	/* Detailed events get appended to the event list: */

	if (detailed_run < 1)
		return 0;

	/* Append detailed run extra attributes: */
	for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
		pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
		if (pos == NULL)
			return -1;
		perf_evlist__add(evsel_list, pos);
	}
	attr_nr += c;

	if (detailed_run < 2)
		return 0;

	/* Append very detailed run extra attributes: */
	for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
		pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
		if (pos == NULL)
			return -1;
		perf_evlist__add(evsel_list, pos);
	}

	if (detailed_run < 3)
		return 0;

	/* Append very, very detailed run extra attributes: */
	for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
		pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
		if (pos == NULL)
			return -1;
		perf_evlist__add(evsel_list, pos);
	}

	return 0;
}

int cmd_stat(int argc, const char **argv, const char *prefix __used)
{
	struct perf_evsel *pos;
	int status = -ENOMEM;

	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, options, stat_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);

	if (csv_sep)
		csv_output = true;
	else
		csv_sep = DEFAULT_SEPARATOR;

	/*
	 * let the spreadsheet do the pretty-printing
	 */
	if (csv_output) {
		/* User explicitly passed -B? */
		if (big_num_opt == 1) {
			fprintf(stderr, "-B option not supported with -x\n");
			usage_with_options(stat_usage, options);
		} else /* Nope, so disable big number formatting */
			big_num = false;
	} else if (big_num_opt == 0) /* User passed --no-big-num */
		big_num = false;

	if (!argc && target_pid == -1 && target_tid == -1)
		usage_with_options(stat_usage, options);
	if (run_count <= 0)
		usage_with_options(stat_usage, options);

	/* no_aggr, cgroup are for system-wide only */
	if ((no_aggr || nr_cgroups) && !system_wide) {
		fprintf(stderr, "both cgroup and no-aggregation "
			"modes are only available in system-wide mode\n");

		usage_with_options(stat_usage, options);
	}

	if (add_default_attributes())
		goto out;

	if (target_pid != -1)
		target_tid = target_pid;

	evsel_list->threads = thread_map__new(target_pid, target_tid);
	if (evsel_list->threads == NULL) {
		pr_err("Problem finding threads to monitor\n");
		usage_with_options(stat_usage, options);
	}

	if (system_wide)
		evsel_list->cpus = cpu_map__new(cpu_list);
	else
		evsel_list->cpus = cpu_map__dummy_new();

	if (evsel_list->cpus == NULL) {
		perror("failed to parse CPUs map");
		usage_with_options(stat_usage, options);
		return -1;
	}

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
		    perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
			goto out_free_fd;
	}

	/*
	 * We don't want to block the signals - that would cause
	 * child tasks to inherit that and Ctrl-C would not work.
	 * What we want is for Ctrl-C to work in the exec()-ed
	 * task, but being ignored by perf stat itself:
	 */
	atexit(sig_atexit);
	signal(SIGINT,  skip_signal);
	signal(SIGALRM, skip_signal);
	signal(SIGABRT, skip_signal);

	status = 0;
	for (run_idx = 0; run_idx < run_count; run_idx++) {
		if (run_count != 1 && verbose)
			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n",
				run_idx + 1);

		if (sync_run)
			sync();

		status = run_perf_stat(argc, argv);
	}

	if (status != -1)
		print_stat(argc, argv);
out_free_fd:
	list_for_each_entry(pos, &evsel_list->entries, node)
		perf_evsel__free_stat_priv(pos);
	perf_evlist__delete_maps(evsel_list);
out:
	perf_evlist__delete(evsel_list);
	return status;
}