/*
 * builtin-stat.c
 *
 * Builtin stat command: Give a precise performance counters summary
 * overview about any workload, CPU or specific PID.
 *
 * Sample output:

   $ perf stat ./hackbench 10

  Time: 0.118

  Performance counter stats for './hackbench 10':

       1708.761321 task-clock                #   11.037 CPUs utilized
            41,190 context-switches          #    0.024 M/sec
             6,735 CPU-migrations            #    0.004 M/sec
            17,318 page-faults               #    0.010 M/sec
     5,205,202,243 cycles                    #    3.046 GHz
     3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
     1,600,790,871 stalled-cycles-backend    #   30.75% backend  cycles idle
     2,603,501,247 instructions              #    0.50  insns per cycle
                                             #    1.48  stalled cycles per insn
       484,357,498 branches                  #  283.455 M/sec
         6,388,934 branch-misses             #    1.32% of all branches

        0.154822978  seconds time elapsed

 *
 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *   Jaswinder Singh Rajput <jaswinder@kernel.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include "perf.h"
#include "builtin.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/pmu.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/header.h"
#include "util/cpumap.h"
#include "util/thread.h"
#include "util/thread_map.h"

#include <stdlib.h>
#include <sys/prctl.h>
#include <locale.h>

#define DEFAULT_SEPARATOR	" "
#define CNTR_NOT_SUPPORTED	"<not supported>"
#define CNTR_NOT_COUNTED	"<not counted>"

static void print_stat(int argc, const char **argv);
static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
static void print_counter(struct perf_evsel *counter, char *prefix);
static void print_aggr(char *prefix);

/* Default events used for perf stat -T */
static const char * const transaction_attrs[] = {
	"task-clock",
	"{"
	"instructions,"
	"cycles,"
	"cpu/cycles-t/,"
	"cpu/tx-start/,"
	"cpu/el-start/,"
	"cpu/cycles-ct/"
	"}"
};
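
/*
 * For illustration (not from the original source): the strings above are
 * fed to parse_events(), so "perf stat -T" behaves roughly as if the user
 * had typed the events by hand, e.g.:
 *
 *   perf stat -e task-clock \
 *     -e '{instructions,cycles,cpu/cycles-t/,cpu/tx-start/,cpu/el-start/,cpu/cycles-ct/}' \
 *     ./workload
 *
 * The braces make the counters one scheduled group.
 */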

/* More limited version when the CPU does not have all events. */
static const char * const transaction_limited_attrs[] = {
	"task-clock",
	"{"
	"instructions,"
	"cycles,"
	"cpu/cycles-t/,"
	"cpu/tx-start/"
	"}"
};

/* must match transaction_attrs and the beginning limited_attrs */
enum {
	T_TASK_CLOCK,
	T_INSTRUCTIONS,
	T_CYCLES,
	T_CYCLES_IN_TX,
	T_TRANSACTION_START,
	T_ELISION_START,
	T_CYCLES_IN_TX_CP,
};

static struct perf_evlist	*evsel_list;

static struct target target = {
	.uid	= UINT_MAX,
};

enum aggr_mode {
	AGGR_NONE,
	AGGR_GLOBAL,
	AGGR_SOCKET,
	AGGR_CORE,
};

static int			run_count	= 1;
static bool			no_inherit	= false;
static bool			scale		= true;
static enum aggr_mode		aggr_mode	= AGGR_GLOBAL;
static volatile pid_t		child_pid	= -1;
static bool			null_run	= false;
static int			detailed_run	= 0;
static bool			transaction_run;
static bool			big_num		= true;
static int			big_num_opt	= -1;
static const char		*csv_sep	= NULL;
static bool			csv_output	= false;
static bool			group		= false;
static FILE			*output		= NULL;
static const char		*pre_cmd	= NULL;
static const char		*post_cmd	= NULL;
static bool			sync_run	= false;
static unsigned int		interval	= 0;
static unsigned int		initial_delay	= 0;
static unsigned int		unit_width	= 4; /* strlen("unit") */
static bool			forever		= false;
static struct timespec		ref_time;
static struct cpu_map		*aggr_map;
static int			(*aggr_get_id)(struct cpu_map *m, int cpu);

static volatile int done = 0;

struct perf_stat {
	struct stats	res_stats[3];
};

static inline void diff_timespec(struct timespec *r, struct timespec *a,
				 struct timespec *b)
{
	r->tv_sec = a->tv_sec - b->tv_sec;
	if (a->tv_nsec < b->tv_nsec) {
		r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec;
		r->tv_sec--;
	} else {
		r->tv_nsec = a->tv_nsec - b->tv_nsec;
	}
}
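
/*
 * Worked example (added for clarity, not from the original source): for
 * a = {2, 100000000} and b = {1, 900000000}, a->tv_nsec < b->tv_nsec
 * triggers the borrow, giving r = {0, 200000000}, i.e. 0.2s -- the
 * nanosecond field stays in [0, 1e9) instead of going negative.
 */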

static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel)
{
	return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus;
}

static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel)
{
	return perf_evsel__cpus(evsel)->nr;
}

static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
{
	int i;
	struct perf_stat *ps = evsel->priv;

	for (i = 0; i < 3; i++)
		init_stats(&ps->res_stats[i]);
}

static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
{
	evsel->priv = zalloc(sizeof(struct perf_stat));
	if (evsel->priv == NULL)	/* check the allocation, not evsel itself */
		return -ENOMEM;
	perf_evsel__reset_stat_priv(evsel);
	return 0;
}

static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
}

static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel)
{
	void *addr;
	size_t sz;

	sz = sizeof(*evsel->counts) +
	     (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values));

	addr = zalloc(sz);
	if (!addr)
		return -ENOMEM;

	evsel->prev_raw_counts = addr;

	return 0;
}

static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
{
	zfree(&evsel->prev_raw_counts);
}

static void perf_evlist__free_stats(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each(evlist, evsel) {
		perf_evsel__free_stat_priv(evsel);
		perf_evsel__free_counts(evsel);
		perf_evsel__free_prev_raw_counts(evsel);
	}
}

static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw)
{
	struct perf_evsel *evsel;

	evlist__for_each(evlist, evsel) {
		if (perf_evsel__alloc_stat_priv(evsel) < 0 ||
		    perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 ||
		    (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0))
			goto out_free;
	}

	return 0;

out_free:
	perf_evlist__free_stats(evlist);
	return -1;
}
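
/*
 * Note (added for clarity): the runtime_* shadow stats below feed the
 * derived metrics (GHz, insns per cycle, miss ratios, ...).  Although
 * they are sized per CPU, update_shadow_stats() only ever fills slot 0,
 * and abs_printout() forces cpu = 0 under AGGR_GLOBAL, so the derived
 * ratios are really only meaningful for globally aggregated counts.
 */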

static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_stats[MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
static struct stats runtime_branches_stats[MAX_NR_CPUS];
static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
static struct stats walltime_nsecs_stats;
static struct stats runtime_transaction_stats[MAX_NR_CPUS];
static struct stats runtime_elision_stats[MAX_NR_CPUS];

static void perf_stat__reset_stats(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each(evlist, evsel) {
		perf_evsel__reset_stat_priv(evsel);
		perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel));
	}

	memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
	memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
	memset(runtime_stalled_cycles_front_stats, 0,
	       sizeof(runtime_stalled_cycles_front_stats));
	memset(runtime_stalled_cycles_back_stats, 0,
	       sizeof(runtime_stalled_cycles_back_stats));
	memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
	memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
	memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
	memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
	memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
	memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
	memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
	memset(runtime_cycles_in_tx_stats, 0,
	       sizeof(runtime_cycles_in_tx_stats));
	memset(runtime_transaction_stats, 0,
	       sizeof(runtime_transaction_stats));
	memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}

static int create_perf_stat_counter(struct perf_evsel *evsel)
{
	struct perf_event_attr *attr = &evsel->attr;

	if (scale)
		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				    PERF_FORMAT_TOTAL_TIME_RUNNING;

	attr->inherit = !no_inherit;

	if (target__has_cpu(&target))
		return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));

	if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) {
		attr->disabled = 1;
		if (!initial_delay)
			attr->enable_on_exec = 1;
	}

	return perf_evsel__open_per_thread(evsel, evsel_list->threads);
}

/*
 * Does the counter have nsecs as a unit?
 */
static inline int nsec_counter(struct perf_evsel *evsel)
{
	if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
	    perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		return 1;

	return 0;
}
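
/*
 * Note (added for clarity): cpu-clock and task-clock count time in
 * nanoseconds, so nsec_printout() below divides the averaged value by
 * 1e6 to display milliseconds -- e.g. 1708761321 nsec shows up as
 * "1708.761321 ... task-clock (msec)", matching the sample output in
 * the file header.
 */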

static struct perf_evsel *nth_evsel(int n)
{
	static struct perf_evsel **array;
	static int array_len;
	struct perf_evsel *ev;
	int j;

	/* Assumes this only called when evsel_list does not change anymore. */
	if (!array) {
		evlist__for_each(evsel_list, ev)
			array_len++;
		array = malloc(array_len * sizeof(void *));
		if (!array)
			exit(ENOMEM);
		j = 0;
		evlist__for_each(evsel_list, ev)
			array[j++] = ev;
	}
	if (n < array_len)
		return array[n];
	return NULL;
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
{
	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
		update_stats(&runtime_nsecs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[0], count[0]);
	else if (transaction_run &&
		 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
		update_stats(&runtime_cycles_in_tx_stats[0], count[0]);
	else if (transaction_run &&
		 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
		update_stats(&runtime_transaction_stats[0], count[0]);
	else if (transaction_run &&
		 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
		update_stats(&runtime_elision_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[0], count[0]);
}

/*
 * Read out the results of a single counter:
 * aggregate counts across CPUs in system-wide mode
 */
static int read_counter_aggr(struct perf_evsel *counter)
{
	struct perf_stat *ps = counter->priv;
	u64 *count = counter->counts->aggr.values;
	int i;

	if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
			       thread_map__nr(evsel_list->threads), scale) < 0)
		return -1;

	for (i = 0; i < 3; i++)
		update_stats(&ps->res_stats[i], count[i]);

	if (verbose) {
		fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
			perf_evsel__name(counter), count[0], count[1], count[2]);
	}

	/*
	 * Save the full runtime - to allow normalization during printout:
	 */
	update_shadow_stats(counter, count);

	return 0;
}
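
/*
 * Note (added for clarity): with PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING
 * set in create_perf_stat_counter(), each read yields a triple --
 * count[0] = (scaled) counter value, count[1] = time enabled,
 * count[2] = time running -- which is what the three res_stats slots
 * track.  When the PMU multiplexes events, running < enabled and the
 * value is extrapolated accordingly.
 */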

/*
 * Read out the results of a single counter:
 * do not aggregate counts across CPUs in system-wide mode
 */
static int read_counter(struct perf_evsel *counter)
{
	u64 *count;
	int cpu;

	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
		if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
			return -1;

		count = counter->counts->cpu[cpu].values;

		update_shadow_stats(counter, count);
	}

	return 0;
}

static void print_interval(void)
{
	static int num_print_interval;
	struct perf_evsel *counter;
	struct perf_stat *ps;
	struct timespec ts, rs;
	char prefix[64];

	if (aggr_mode == AGGR_GLOBAL) {
		evlist__for_each(evsel_list, counter) {
			ps = counter->priv;
			memset(ps->res_stats, 0, sizeof(ps->res_stats));
			read_counter_aggr(counter);
		}
	} else {
		evlist__for_each(evsel_list, counter) {
			ps = counter->priv;
			memset(ps->res_stats, 0, sizeof(ps->res_stats));
			read_counter(counter);
		}
	}

	clock_gettime(CLOCK_MONOTONIC, &ts);
	diff_timespec(&rs, &ts, &ref_time);
	sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep);

	if (num_print_interval == 0 && !csv_output) {
		switch (aggr_mode) {
		case AGGR_SOCKET:
			fprintf(output, "#           time socket cpus             counts %*s events\n", unit_width, "unit");
			break;
		case AGGR_CORE:
			fprintf(output, "#           time core         cpus             counts %*s events\n", unit_width, "unit");
			break;
		case AGGR_NONE:
			fprintf(output, "#           time CPU                counts %*s events\n", unit_width, "unit");
			break;
		case AGGR_GLOBAL:
		default:
			fprintf(output, "#           time             counts %*s events\n", unit_width, "unit");
		}
	}

	if (++num_print_interval == 25)
		num_print_interval = 0;

	switch (aggr_mode) {
	case AGGR_CORE:
	case AGGR_SOCKET:
		print_aggr(prefix);
		break;
	case AGGR_NONE:
		evlist__for_each(evsel_list, counter)
			print_counter(counter, prefix);
		break;
	case AGGR_GLOBAL:
	default:
		evlist__for_each(evsel_list, counter)
			print_counter_aggr(counter, prefix);
	}

	fflush(output);
}
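
/*
 * Illustration (added, not from the original source): with
 * "perf stat -I 1000", each interval line is prefixed with the seconds
 * elapsed since the counters were enabled, e.g.:
 *
 *   #           time             counts unit events
 *        1.000073823      5,205,202,243      cycles
 *
 * The header is re-emitted every 25 intervals so it stays visible on a
 * scrolling terminal.
 */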

static void handle_initial_delay(void)
{
	struct perf_evsel *counter;

	if (initial_delay) {
		const int ncpus = cpu_map__nr(evsel_list->cpus),
			nthreads = thread_map__nr(evsel_list->threads);

		usleep(initial_delay * 1000);
		evlist__for_each(evsel_list, counter)
			perf_evsel__enable(counter, ncpus, nthreads);
	}
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
}

static int __run_perf_stat(int argc, const char **argv)
{
	char msg[512];
	unsigned long long t0, t1;
	struct perf_evsel *counter;
	struct timespec ts;
	size_t l;
	int status = 0;
	const bool forks = (argc > 0);

	if (interval) {
		ts.tv_sec  = interval / 1000;
		ts.tv_nsec = (interval % 1000) * 1000000;
	} else {
		ts.tv_sec  = 1;
		ts.tv_nsec = 0;
	}

	if (forks) {
		if (perf_evlist__prepare_workload(evsel_list, &target, argv, false,
						  workload_exec_failed_signal) < 0) {
			perror("failed to prepare workload");
			return -1;
		}
		child_pid = evsel_list->workload.pid;
	}

	if (group)
		perf_evlist__set_leader(evsel_list);

	evlist__for_each(evsel_list, counter) {
		if (create_perf_stat_counter(counter) < 0) {
			/*
			 * PPC returns ENXIO for HW counters until 2.6.37
			 * (behavior changed with commit b0a873e).
			 */
			if (errno == EINVAL || errno == ENOSYS ||
			    errno == ENOENT || errno == EOPNOTSUPP ||
			    errno == ENXIO) {
				if (verbose)
					ui__warning("%s event is not supported by the kernel.\n",
						    perf_evsel__name(counter));
				counter->supported = false;
				continue;
			}

			perf_evsel__open_strerror(counter, &target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);

			if (child_pid != -1)
				kill(child_pid, SIGTERM);

			return -1;
		}
		counter->supported = true;

		l = strlen(counter->unit);
		if (l > unit_width)
			unit_width = l;
	}

	if (perf_evlist__apply_filters(evsel_list)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		return -1;
	}

	/*
	 * Enable counters and exec the command:
	 */
	t0 = rdclock();
	clock_gettime(CLOCK_MONOTONIC, &ref_time);

	if (forks) {
		perf_evlist__start_workload(evsel_list);
		handle_initial_delay();

		if (interval) {
			while (!waitpid(child_pid, &status, WNOHANG)) {
				nanosleep(&ts, NULL);
				print_interval();
			}
		}
		wait(&status);

		if (workload_exec_errno) {
			const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
			pr_err("Workload failed: %s\n", emsg);
			return -1;
		}

		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), argv[0]);
	} else {
		handle_initial_delay();
		while (!done) {
			nanosleep(&ts, NULL);
			if (interval)
				print_interval();
		}
	}

	t1 = rdclock();

	update_stats(&walltime_nsecs_stats, t1 - t0);

	if (aggr_mode == AGGR_GLOBAL) {
		evlist__for_each(evsel_list, counter) {
			read_counter_aggr(counter);
			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
					     thread_map__nr(evsel_list->threads));
		}
	} else {
		evlist__for_each(evsel_list, counter) {
			read_counter(counter);
			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
		}
	}

	return WEXITSTATUS(status);
}

static int run_perf_stat(int argc, const char **argv)
{
	int ret;

	if (pre_cmd) {
		ret = system(pre_cmd);
		if (ret)
			return ret;
	}

	if (sync_run)
		sync();

	ret = __run_perf_stat(argc, argv);
	if (ret)
		return ret;

	if (post_cmd) {
		ret = system(post_cmd);
		if (ret)
			return ret;
	}

	return ret;
}

static void print_noise_pct(double total, double avg)
{
	double pct = rel_stddev_stats(total, avg);

	if (csv_output)
		fprintf(output, "%s%.2f%%", csv_sep, pct);
	else if (pct)
		fprintf(output, "  ( +-%6.2f%% )", pct);
}

static void print_noise(struct perf_evsel *evsel, double avg)
{
	struct perf_stat *ps;

	if (run_count == 1)
		return;

	ps = evsel->priv;
	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
}

static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
{
	switch (aggr_mode) {
	case AGGR_CORE:
		fprintf(output, "S%d-C%*d%s%*d%s",
			cpu_map__id_to_socket(id),
			csv_output ? 0 : -8,
			cpu_map__id_to_cpu(id),
			csv_sep,
			csv_output ? 0 : 4,
			nr,
			csv_sep);
		break;
	case AGGR_SOCKET:
		fprintf(output, "S%*d%s%*d%s",
			csv_output ? 0 : -5,
			id,
			csv_sep,
			csv_output ? 0 : 4,
			nr,
			csv_sep);
		break;
	case AGGR_NONE:
		fprintf(output, "CPU%*d%s",
			csv_output ? 0 : -4,
			perf_evsel__cpus(evsel)->map[id], csv_sep);
		break;
	case AGGR_GLOBAL:
	default:
		break;
	}
}
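
/*
 * Illustration (added, not from the original source): the non-CSV
 * prefixes produced above look like "S0-C0           4 " for --per-core
 * (socket 0, core 0, 4 CPUs aggregated), "S0        8 " for
 * --per-socket, and "CPU0 " for --no-aggr; in CSV mode all the padding
 * widths drop to 0.
 */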

static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
	double msecs = avg / 1e6;
	const char *fmt_v, *fmt_n;
	char name[25];

	fmt_v = csv_output ? "%.6f%s" : "%18.6f%s";
	fmt_n = csv_output ? "%s" : "%-25s";

	aggr_printout(evsel, cpu, nr);

	scnprintf(name, sizeof(name), "%s%s",
		  perf_evsel__name(evsel), csv_output ? "" : " (msec)");

	fprintf(output, fmt_v, msecs, csv_sep);

	if (csv_output)
		fprintf(output, "%s%s", evsel->unit, csv_sep);
	else
		fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep);

	fprintf(output, fmt_n, name);

	if (evsel->cgrp)
		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);

	if (csv_output || interval)
		return;

	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		fprintf(output, " # %8.3f CPUs utilized          ",
			avg / avg_stats(&walltime_nsecs_stats));
	else
		fprintf(output, "                                   ");
}

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0,  5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}
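
/*
 * Worked example (added for clarity): the sample output in the file
 * header shows 74.09% frontend cycles idle; against the
 * GRC_STALLED_CYCLES_FE thresholds {50, 30, 10} that exceeds 50.0, so
 * it would be printed in PERF_COLOR_RED.  A 15% ratio would only clear
 * the 10.0 threshold and print yellow.
 */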

static void print_stalled_cycles_frontend(int cpu,
					  struct perf_evsel *evsel
					  __maybe_unused, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " frontend cycles idle   ");
}

static void print_stalled_cycles_backend(int cpu,
					 struct perf_evsel *evsel
					 __maybe_unused, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " backend  cycles idle   ");
}

static void print_branch_misses(int cpu,
				struct perf_evsel *evsel __maybe_unused,
				double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_branches_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all branches        ");
}

static void print_l1_dcache_misses(int cpu,
				   struct perf_evsel *evsel __maybe_unused,
				   double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_dcache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all L1-dcache hits  ");
}

static void print_l1_icache_misses(int cpu,
				   struct perf_evsel *evsel __maybe_unused,
				   double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_icache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all L1-icache hits  ");
}

static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all dTLB cache hits ");
}

static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_itlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all iTLB cache hits ");
}

static void print_ll_cache_misses(int cpu,
				  struct perf_evsel *evsel __maybe_unused,
				  double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_ll_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all LL-cache hits   ");
}

static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
	double total, ratio = 0.0, total2;
	double sc = evsel->scale;
	const char *fmt;

	if (csv_output) {
		fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s";
	} else {
		if (big_num)
			fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s";
		else
			fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s";
	}

	aggr_printout(evsel, cpu, nr);

	if (aggr_mode == AGGR_GLOBAL)
		cpu = 0;

	fprintf(output, fmt, avg, csv_sep);

	if (evsel->unit)
		fprintf(output, "%-*s%s",
			csv_output ? 0 : unit_width,
			evsel->unit, csv_sep);

	fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel));

	if (evsel->cgrp)
		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);

	if (csv_output || interval)
		return;
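
	/*
	 * Derived-metric annotations follow.  Worked examples (added for
	 * clarity), using the sample run from the file header:
	 * insns per cycle = 2,603,501,247 instructions / 5,205,202,243
	 * cycles ~= 0.50; GHz = 5,205,202,243 cycles / 1,708,761,321
	 * task-clock nsecs ~= 3.046; the generic rate fallback for e.g.
	 * branches is 1000 * 484,357,498 / 1,708,761,321 ~= 283.455 M/sec.
	 */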
	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		if (total) {
			ratio = avg / total;
			fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
		}
		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));

		if (total && avg) {
			ratio = total / avg;
			fprintf(output, "\n");
			if (aggr_mode == AGGR_NONE)
				fprintf(output, "        ");
			fprintf(output, "                                                  #   %5.2f  stalled cycles per insn", ratio);
		}

	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
			runtime_branches_stats[cpu].n != 0) {
		print_branch_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_l1_dcache_stats[cpu].n != 0) {
		print_l1_dcache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_l1_icache_stats[cpu].n != 0) {
		print_l1_icache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_dtlb_cache_stats[cpu].n != 0) {
		print_dtlb_cache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_itlb_cache_stats[cpu].n != 0) {
		print_itlb_cache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
			runtime_ll_cache_stats[cpu].n != 0) {
		print_ll_cache_misses(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
			runtime_cacherefs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cacherefs_stats[cpu]);

		if (total)
			ratio = avg * 100 / total;

		fprintf(output, " # %8.3f %% of all cache refs    ", ratio);

	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total) {
			ratio = avg / total;
			fprintf(output, " # %8.3f GHz                    ", ratio);
		}
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		if (total)
			fprintf(output,
				" #   %5.2f%% transactional cycles   ",
				100.0 * (avg / total));
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
		if (total2 < avg)
			total2 = avg;
		if (total)
			fprintf(output,
				" #   %5.2f%% aborted cycles         ",
				100.0 * ((total2-avg) / total));
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
		   avg > 0 &&
		   runtime_cycles_in_tx_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);

		if (total)
			ratio = total / avg;

		fprintf(output, " # %8.0f cycles / transaction   ", ratio);
	} else if (transaction_run &&
		   perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
		   avg > 0 &&
		   runtime_cycles_in_tx_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);

		if (total)
			ratio = total / avg;

		fprintf(output, " # %8.0f cycles / elision       ", ratio);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		char unit = 'M';

		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}

		fprintf(output, " # %8.3f %c/sec                  ", ratio, unit);
	} else {
		fprintf(output, "                                   ");
	}
}

static void print_aggr(char *prefix)
{
	struct perf_evsel *counter;
	int cpu, cpu2, s, s2, id, nr;
	double uval;
	u64 ena, run, val;

	if (!(aggr_map || aggr_get_id))
		return;

	for (s = 0; s < aggr_map->nr; s++) {
		id = aggr_map->map[s];
		evlist__for_each(evsel_list, counter) {
			val = ena = run = 0;
			nr = 0;
			for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
				cpu2 = perf_evsel__cpus(counter)->map[cpu];
				s2 = aggr_get_id(evsel_list->cpus, cpu2);
				if (s2 != id)
					continue;
				val += counter->counts->cpu[cpu].val;
				ena += counter->counts->cpu[cpu].ena;
				run += counter->counts->cpu[cpu].run;
				nr++;
			}
			if (prefix)
				fprintf(output, "%s", prefix);

			if (run == 0 || ena == 0) {
				aggr_printout(counter, id, nr);

				fprintf(output, "%*s%s",
					csv_output ? 0 : 18,
					counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
					csv_sep);

				fprintf(output, "%-*s%s",
					csv_output ? 0 : unit_width,
					counter->unit, csv_sep);

				fprintf(output, "%*s",
					csv_output ? 0 : -25,
					perf_evsel__name(counter));

				if (counter->cgrp)
					fprintf(output, "%s%s",
						csv_sep, counter->cgrp->name);

				fputc('\n', output);
				continue;
			}
			uval = val * counter->scale;

			if (nsec_counter(counter))
				nsec_printout(id, nr, counter, uval);
			else
				abs_printout(id, nr, counter, uval);

			if (!csv_output) {
				print_noise(counter, 1.0);

				if (run != ena)
					fprintf(output, "  (%.2f%%)",
						100.0 * run / ena);
			}
			fputc('\n', output);
		}
	}
}
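
/*
 * Note (added for clarity): a counter whose running or enabled time is
 * zero prints as "<not counted>" when the event opened fine but never
 * got scheduled onto the PMU, and as "<not supported>" when the open
 * itself failed (counter->supported == false).
 */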

/*
 * Print out the results of a single counter:
 * aggregated counts in system-wide mode
 */
static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
{
	struct perf_stat *ps = counter->priv;
	double avg = avg_stats(&ps->res_stats[0]);
	int scaled = counter->counts->scaled;
	double uval;

	if (prefix)
		fprintf(output, "%s", prefix);

	if (scaled == -1) {
		fprintf(output, "%*s%s",
			csv_output ? 0 : 18,
			counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
			csv_sep);
		fprintf(output, "%-*s%s",
			csv_output ? 0 : unit_width,
			counter->unit, csv_sep);
		fprintf(output, "%*s",
			csv_output ? 0 : -25,
			perf_evsel__name(counter));

		if (counter->cgrp)
			fprintf(output, "%s%s", csv_sep, counter->cgrp->name);

		fputc('\n', output);
		return;
	}

	uval = avg * counter->scale;

	if (nsec_counter(counter))
		nsec_printout(-1, 0, counter, uval);
	else
		abs_printout(-1, 0, counter, uval);

	print_noise(counter, avg);

	if (csv_output) {
		fputc('\n', output);
		return;
	}

	if (scaled) {
		double avg_enabled, avg_running;

		avg_enabled = avg_stats(&ps->res_stats[1]);
		avg_running = avg_stats(&ps->res_stats[2]);

		fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled);
	}
	fprintf(output, "\n");
}
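
/*
 * Illustration (added for clarity): when events were multiplexed, the
 * trailing "[NN.NN%]" above shows running/enabled time -- e.g.
 * "[66.67%]" if the counter was on the PMU for two thirds of the
 * measurement and the printed count was extrapolated accordingly.
 */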

/*
 * Print out the results of a single counter:
 * does not use aggregated count in system-wide
 */
static void print_counter(struct perf_evsel *counter, char *prefix)
{
	u64 ena, run, val;
	double uval;
	int cpu;

	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
		val = counter->counts->cpu[cpu].val;
		ena = counter->counts->cpu[cpu].ena;
		run = counter->counts->cpu[cpu].run;

		if (prefix)
			fprintf(output, "%s", prefix);

		if (run == 0 || ena == 0) {
			fprintf(output, "CPU%*d%s%*s%s",
				csv_output ? 0 : -4,
				perf_evsel__cpus(counter)->map[cpu], csv_sep,
				csv_output ? 0 : 18,
				counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
				csv_sep);

			fprintf(output, "%-*s%s",
				csv_output ? 0 : unit_width,
				counter->unit, csv_sep);

			fprintf(output, "%*s",
				csv_output ? 0 : -25,
				perf_evsel__name(counter));

			if (counter->cgrp)
				fprintf(output, "%s%s",
					csv_sep, counter->cgrp->name);

			fputc('\n', output);
			continue;
		}

		uval = val * counter->scale;

		if (nsec_counter(counter))
			nsec_printout(cpu, 0, counter, uval);
		else
			abs_printout(cpu, 0, counter, uval);

		if (!csv_output) {
			print_noise(counter, 1.0);

			if (run != ena)
				fprintf(output, "  (%.2f%%)",
					100.0 * run / ena);
		}
		fputc('\n', output);
	}
}

static void print_stat(int argc, const char **argv)
{
	struct perf_evsel *counter;
	int i;

	fflush(stdout);

	if (!csv_output) {
		fprintf(output, "\n");
		fprintf(output, " Performance counter stats for ");
		if (target.system_wide)
			fprintf(output, "\'system wide");
		else if (target.cpu_list)
			fprintf(output, "\'CPU(s) %s", target.cpu_list);
		else if (!target__has_task(&target)) {
			fprintf(output, "\'%s", argv[0]);
			for (i = 1; i < argc; i++)
				fprintf(output, " %s", argv[i]);
		} else if (target.pid)
			fprintf(output, "process id \'%s", target.pid);
		else
			fprintf(output, "thread id \'%s", target.tid);

		fprintf(output, "\'");
		if (run_count > 1)
			fprintf(output, " (%d runs)", run_count);
		fprintf(output, ":\n\n");
	}

	switch (aggr_mode) {
	case AGGR_CORE:
	case AGGR_SOCKET:
		print_aggr(NULL);
		break;
	case AGGR_GLOBAL:
		evlist__for_each(evsel_list, counter)
			print_counter_aggr(counter, NULL);
		break;
	case AGGR_NONE:
		evlist__for_each(evsel_list, counter)
			print_counter(counter, NULL);
		break;
	default:
		break;
	}

	if (!csv_output) {
		if (!null_run)
			fprintf(output, "\n");
		fprintf(output, " %17.9f seconds time elapsed",
				avg_stats(&walltime_nsecs_stats)/1e9);
		if (run_count > 1) {
			fprintf(output, "                                        ");
			print_noise_pct(stddev_stats(&walltime_nsecs_stats),
					avg_stats(&walltime_nsecs_stats));
		}
		fprintf(output, "\n\n");
	}
}

static volatile int signr = -1;

static void skip_signal(int signo)
{
	if ((child_pid == -1) || interval)
		done = 1;

	signr = signo;
	/*
	 * Render child_pid harmless: don't send SIGTERM to a random
	 * process in case of a race condition and fast PID recycling.
	 */
	child_pid = -1;
}

static void sig_atexit(void)
{
	sigset_t set, oset;

	/*
	 * Avoid a race condition with the SIGCHLD handler in
	 * skip_signal(), which modifies child_pid; the goal is to avoid
	 * sending SIGTERM to a random process.
	 */
	sigemptyset(&set);
	sigaddset(&set, SIGCHLD);
	sigprocmask(SIG_BLOCK, &set, &oset);

	if (child_pid != -1)
		kill(child_pid, SIGTERM);

	sigprocmask(SIG_SETMASK, &oset, NULL);

	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static int stat__set_big_num(const struct option *opt __maybe_unused,
			     const char *s __maybe_unused, int unset)
{
	big_num_opt = unset ? 0 : 1;
	return 0;
}
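
/*
 * Note (added for clarity): big_num drives the "%'18.0f" formats in
 * abs_printout(); the apostrophe flag groups digits with the locale's
 * thousands separator (enabled by the setlocale(LC_ALL, "") call in
 * cmd_stat()), so 5205202243 prints as "5,205,202,243" in an
 * en_US-style locale.  --no-big-num or -x (CSV) turns this off.
 */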

static int perf_stat_init_aggr_mode(void)
{
	switch (aggr_mode) {
	case AGGR_SOCKET:
		if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
			perror("cannot build socket map");
			return -1;
		}
		aggr_get_id = cpu_map__get_socket;
		break;
	case AGGR_CORE:
		if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
			perror("cannot build core map");
			return -1;
		}
		aggr_get_id = cpu_map__get_core;
		break;
	case AGGR_NONE:
	case AGGR_GLOBAL:
	default:
		break;
	}
	return 0;
}
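
/*
 * Illustration (added, not from the original source): for --per-socket
 * on a two-socket machine, aggr_map would hold the socket ids {0, 1}
 * and aggr_get_id maps each online CPU to its socket, which is how
 * print_aggr() folds per-CPU counts into one line per socket.
 */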

static int setup_events(const char * const *attrs, unsigned len)
{
	unsigned i;

	for (i = 0; i < len; i++) {
		if (parse_events(evsel_list, attrs[i]))
			return -1;
	}
	return 0;
}

/*
 * Add default attributes, if there were no attributes specified or
 * if -d/--detailed, -d -d or -d -d -d is used:
 */
static int add_default_attributes(void)
{
	struct perf_event_attr default_attrs[] = {

		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },

		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },

	};

	/*
	 * Detailed stats (-d), covering the L1 and last level data caches:
	 */
	struct perf_event_attr detailed_attrs[] = {

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_LL << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_LL << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
	};

	/*
	 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
	 */
	struct perf_event_attr very_detailed_attrs[] = {

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_L1I << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_L1I << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_DTLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_DTLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_ITLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_ITLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

	};

	/*
	 * Very, very detailed stats (-d -d -d), adding prefetch events:
	 */
	struct perf_event_attr very_very_detailed_attrs[] = {

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			 PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
	};

	/* Set attrs if no event is selected and !null_run: */
	if (null_run)
		return 0;

	if (transaction_run) {
		int err;
		if (pmu_have_event("cpu", "cycles-ct") &&
		    pmu_have_event("cpu", "el-start"))
			err = setup_events(transaction_attrs,
					   ARRAY_SIZE(transaction_attrs));
		else
			err = setup_events(transaction_limited_attrs,
					   ARRAY_SIZE(transaction_limited_attrs));
		if (err < 0) {
			fprintf(stderr, "Cannot set up transaction events\n");
			return -1;
		}
		return 0;
	}

	if (!evsel_list->nr_entries) {
		if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
			return -1;
	}

	/* Detailed events get appended to the event list: */

	if (detailed_run < 1)
		return 0;

	/* Append detailed run extra attributes: */
	if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0)
		return -1;

	if (detailed_run < 2)
		return 0;

	/* Append very detailed run extra attributes: */
	if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0)
		return -1;

	if (detailed_run < 3)
		return 0;

	/* Append very, very detailed run extra attributes: */
	return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs);
}
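
/*
 * Illustration (added for clarity): "perf stat ./cmd" uses
 * default_attrs only; "perf stat -d ./cmd" appends the L1-dcache/LLC
 * read events, "-d -d" additionally appends the L1-icache and d/iTLB
 * events, and "-d -d -d" also adds the L1-dcache prefetch events.
 */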

int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
{
	bool append_file = false;
	int output_fd = 0;
	const char *output_name	= NULL;
	const struct option options[] = {
	OPT_BOOLEAN('T', "transaction", &transaction_run,
		    "hardware transaction statistics"),
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_STRING('p', "pid", &target.pid, "pid",
		   "stat events on existing process id"),
	OPT_STRING('t', "tid", &target.tid, "tid",
		   "stat events on existing thread id"),
	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('g', "group", &group,
		    "put the counters into a counter group"),
	OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_INTEGER('r', "repeat", &run_count,
		    "repeat command and print average + stddev (max: 100, forever: 0)"),
	OPT_BOOLEAN('n', "null", &null_run,
		    "null run - don't start any counters"),
	OPT_INCR('d', "detailed", &detailed_run,
		 "detailed run - start a lot of events"),
	OPT_BOOLEAN('S', "sync", &sync_run,
		    "call sync() before starting a run"),
	OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
			   "print large numbers with thousands\' separators",
			   stat__set_big_num),
	OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
		   "list of cpus to monitor in system-wide"),
	OPT_SET_UINT('A', "no-aggr", &aggr_mode,
		     "disable CPU count aggregation", AGGR_NONE),
	OPT_STRING('x', "field-separator", &csv_sep, "separator",
		   "print counts with custom separator"),
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only", parse_cgroups),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_BOOLEAN(0, "append", &append_file, "append to the output file"),
	OPT_INTEGER(0, "log-fd", &output_fd,
		    "log output to fd, instead of stderr"),
	OPT_STRING(0, "pre", &pre_cmd, "command",
		   "command to run prior to the measured command"),
	OPT_STRING(0, "post", &post_cmd, "command",
		   "command to run after the measured command"),
	OPT_UINTEGER('I', "interval-print", &interval,
		     "print counts at regular interval in ms (>= 100)"),
	OPT_SET_UINT(0, "per-socket", &aggr_mode,
		     "aggregate counts per processor socket", AGGR_SOCKET),
	OPT_SET_UINT(0, "per-core", &aggr_mode,
		     "aggregate counts per physical processor core", AGGR_CORE),
	OPT_UINTEGER('D', "delay", &initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_END()
	};
	const char * const stat_usage[] = {
		"perf stat [<options>] [<command>]",
		NULL
	};
	int status = -EINVAL, run_idx;
	const char *mode;

	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new();
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, options, stat_usage,
		PARSE_OPT_STOP_AT_NON_OPTION);

	output = stderr;
	if (output_name && strcmp(output_name, "-"))
		output = NULL;

	if (output_name && output_fd) {
		fprintf(stderr, "cannot use both --output and --log-fd\n");
		parse_options_usage(stat_usage, options, "o", 1);
		parse_options_usage(NULL, options, "log-fd", 0);
		goto out;
	}

	if (output_fd < 0) {
		fprintf(stderr, "argument to --log-fd must be > 0\n");
		parse_options_usage(stat_usage, options, "log-fd", 0);
		goto out;
	}
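
	/*
	 * Usage sketch (added, not from the original source): --log-fd
	 * expects a descriptor the caller already set up, e.g. in a
	 * shell:
	 *
	 *   3>counts.txt perf stat --log-fd 3 -- ./workload
	 *
	 * which keeps stderr free for the workload's own output.
	 */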
	if (!output) {
		struct timespec tm;
		mode = append_file ? "a" : "w";

		output = fopen(output_name, mode);
		if (!output) {
			perror("failed to create output file");
			return -1;
		}
		clock_gettime(CLOCK_REALTIME, &tm);
		fprintf(output, "# started on %s\n", ctime(&tm.tv_sec));
	} else if (output_fd > 0) {
		mode = append_file ? "a" : "w";
		output = fdopen(output_fd, mode);
		if (!output) {
			perror("Failed opening logfd");
			return -errno;
		}
	}

	if (csv_sep) {
		csv_output = true;
		if (!strcmp(csv_sep, "\\t"))
			csv_sep = "\t";
	} else
		csv_sep = DEFAULT_SEPARATOR;

	/*
	 * let the spreadsheet do the pretty-printing
	 */
	if (csv_output) {
		/* User explicitly passed -B? */
		if (big_num_opt == 1) {
			fprintf(stderr, "-B option not supported with -x\n");
			parse_options_usage(stat_usage, options, "B", 1);
			parse_options_usage(NULL, options, "x", 1);
			goto out;
		} else /* Nope, so disable big number formatting */
			big_num = false;
	} else if (big_num_opt == 0) /* User passed --no-big-num */
		big_num = false;

	if (!argc && target__none(&target))
		usage_with_options(stat_usage, options);

	if (run_count < 0) {
		pr_err("Run count must be a positive number\n");
		parse_options_usage(stat_usage, options, "r", 1);
		goto out;
	} else if (run_count == 0) {
		forever = true;
		run_count = 1;
	}

	/* no_aggr, cgroup are for system-wide only */
	if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) &&
	    !target__has_cpu(&target)) {
		fprintf(stderr, "both cgroup and no-aggregation "
			"modes only available in system-wide mode\n");

		parse_options_usage(stat_usage, options, "G", 1);
		parse_options_usage(NULL, options, "A", 1);
		parse_options_usage(NULL, options, "a", 1);
		goto out;
	}

	if (add_default_attributes())
		goto out;

	target__validate(&target);

	if (perf_evlist__create_maps(evsel_list, &target) < 0) {
		if (target__has_task(&target)) {
			pr_err("Problems finding threads to monitor\n");
			parse_options_usage(stat_usage, options, "p", 1);
			parse_options_usage(NULL, options, "t", 1);
		} else if (target__has_cpu(&target)) {
			perror("failed to parse CPUs map");
			parse_options_usage(stat_usage, options, "C", 1);
			parse_options_usage(NULL, options, "a", 1);
		}
		goto out;
	}
	if (interval && interval < 100) {
		pr_err("print interval must be >= 100ms\n");
		parse_options_usage(stat_usage, options, "I", 1);
		goto out;
	}

	if (perf_evlist__alloc_stats(evsel_list, interval))
		goto out;

	if (perf_stat_init_aggr_mode())
		goto out;

	/*
	 * We don't want to block the signals - that would cause
	 * child tasks to inherit that and Ctrl-C would not work.
	 * What we want is for Ctrl-C to work in the exec()-ed
	 * task, but being ignored by perf stat itself:
	 */
	atexit(sig_atexit);
	if (!forever)
		signal(SIGINT,  skip_signal);
	signal(SIGCHLD, skip_signal);
	signal(SIGALRM, skip_signal);
	signal(SIGABRT, skip_signal);

	status = 0;
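
	/*
	 * Note (added for clarity): with -r 0 ("forever"), the loop below
	 * runs until interrupted, printing and resetting the stats after
	 * every run; SIGINT is left at its default in that mode so Ctrl-C
	 * actually terminates the loop.
	 */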
]\n", 1818 run_idx + 1); 1819 1820 status = run_perf_stat(argc, argv); 1821 if (forever && status != -1) { 1822 print_stat(argc, argv); 1823 perf_stat__reset_stats(evsel_list); 1824 } 1825 } 1826 1827 if (!forever && status != -1 && !interval) 1828 print_stat(argc, argv); 1829 1830 perf_evlist__free_stats(evsel_list); 1831 out: 1832 perf_evlist__delete(evsel_list); 1833 return status; 1834 } 1835