1 /* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ~/hackbench 10 10 Time: 0.104 11 12 Performance counter stats for '/home/mingo/hackbench': 13 14 1255.538611 task clock ticks # 10.143 CPU utilization factor 15 54011 context switches # 0.043 M/sec 16 385 CPU migrations # 0.000 M/sec 17 17755 pagefaults # 0.014 M/sec 18 3808323185 CPU cycles # 3033.219 M/sec 19 1575111190 instructions # 1254.530 M/sec 20 17367895 cache references # 13.833 M/sec 21 7674421 cache misses # 6.112 M/sec 22 23 Wall-clock time elapsed: 123.786620 msecs 24 25 * 26 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 27 * 28 * Improvements and fixes by: 29 * 30 * Arjan van de Ven <arjan@linux.intel.com> 31 * Yanmin Zhang <yanmin.zhang@intel.com> 32 * Wu Fengguang <fengguang.wu@intel.com> 33 * Mike Galbraith <efault@gmx.de> 34 * Paul Mackerras <paulus@samba.org> 35 * Jaswinder Singh Rajput <jaswinder@kernel.org> 36 * 37 * Released under the GPL v2. (and only v2, not any later version) 38 */ 39 40 #include "perf.h" 41 #include "builtin.h" 42 #include "util/util.h" 43 #include "util/parse-options.h" 44 #include "util/parse-events.h" 45 46 #include <sys/prctl.h> 47 #include <math.h> 48 49 static struct perf_counter_attr default_attrs[] = { 50 51 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 52 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, 53 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 54 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 55 56 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 57 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 58 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES}, 59 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, 60 61 }; 62 63 #define MAX_RUN 100 64 65 static int system_wide = 0; 66 static int verbose = 0; 67 static unsigned int nr_cpus = 0; 68 static int run_idx = 0; 69 70 static int run_count = 1; 71 static int inherit = 1; 72 static int scale = 1; 73 static int target_pid = -1; 74 static int null_run = 0; 75 76 static int fd[MAX_NR_CPUS][MAX_COUNTERS]; 77 78 static u64 runtime_nsecs[MAX_RUN]; 79 static u64 walltime_nsecs[MAX_RUN]; 80 static u64 runtime_cycles[MAX_RUN]; 81 82 static u64 event_res[MAX_RUN][MAX_COUNTERS][3]; 83 static u64 event_scaled[MAX_RUN][MAX_COUNTERS]; 84 85 static u64 event_res_avg[MAX_COUNTERS][3]; 86 static u64 event_res_noise[MAX_COUNTERS][3]; 87 88 static u64 event_scaled_avg[MAX_COUNTERS]; 89 90 static u64 runtime_nsecs_avg; 91 static u64 runtime_nsecs_noise; 92 93 static u64 walltime_nsecs_avg; 94 static u64 walltime_nsecs_noise; 95 96 static u64 runtime_cycles_avg; 97 static u64 runtime_cycles_noise; 98 99 #define MATCH_EVENT(t, c, counter) \ 100 (attrs[counter].type == PERF_TYPE_##t && \ 101 attrs[counter].config == PERF_COUNT_##c) 102 103 #define ERR_PERF_OPEN \ 104 "Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n" 105 106 static void create_perf_stat_counter(int counter, int pid) 107 { 108 struct perf_counter_attr *attr = attrs + counter; 109 110 if (scale) 111 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 112 PERF_FORMAT_TOTAL_TIME_RUNNING; 113 114 if (system_wide) { 115 unsigned int cpu; 116 117 for (cpu = 0; cpu < nr_cpus; cpu++) { 118 fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); 119 if (fd[cpu][counter] < 0 && verbose) 120 fprintf(stderr, ERR_PERF_OPEN, counter, 121 fd[cpu][counter], strerror(errno)); 122 } 123 } else { 124 attr->inherit = inherit; 125 attr->disabled = 1; 126 attr->enable_on_exec = 1; 127 128 fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0); 129 if (fd[0][counter] < 0 && verbose) 130 fprintf(stderr, ERR_PERF_OPEN, counter, 131 fd[0][counter], strerror(errno)); 132 } 133 } 134 135 /* 136 * Does the counter have nsecs as a unit? 137 */ 138 static inline int nsec_counter(int counter) 139 { 140 if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) || 141 MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) 142 return 1; 143 144 return 0; 145 } 146 147 /* 148 * Read out the results of a single counter: 149 */ 150 static void read_counter(int counter) 151 { 152 u64 *count, single_count[3]; 153 unsigned int cpu; 154 size_t res, nv; 155 int scaled; 156 157 count = event_res[run_idx][counter]; 158 159 count[0] = count[1] = count[2] = 0; 160 161 nv = scale ? 3 : 1; 162 for (cpu = 0; cpu < nr_cpus; cpu++) { 163 if (fd[cpu][counter] < 0) 164 continue; 165 166 res = read(fd[cpu][counter], single_count, nv * sizeof(u64)); 167 assert(res == nv * sizeof(u64)); 168 169 close(fd[cpu][counter]); 170 fd[cpu][counter] = -1; 171 172 count[0] += single_count[0]; 173 if (scale) { 174 count[1] += single_count[1]; 175 count[2] += single_count[2]; 176 } 177 } 178 179 scaled = 0; 180 if (scale) { 181 if (count[2] == 0) { 182 event_scaled[run_idx][counter] = -1; 183 count[0] = 0; 184 return; 185 } 186 187 if (count[2] < count[1]) { 188 event_scaled[run_idx][counter] = 1; 189 count[0] = (unsigned long long) 190 ((double)count[0] * count[1] / count[2] + 0.5); 191 } 192 } 193 /* 194 * Save the full runtime - to allow normalization during printout: 195 */ 196 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) 197 runtime_nsecs[run_idx] = count[0]; 198 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) 199 runtime_cycles[run_idx] = count[0]; 200 } 201 202 static int run_perf_stat(int argc __used, const char **argv) 203 { 204 unsigned long long t0, t1; 205 int status = 0; 206 int counter; 207 int pid; 208 int child_ready_pipe[2], go_pipe[2]; 209 char buf; 210 211 if (!system_wide) 212 nr_cpus = 1; 213 214 if (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0) { 215 perror("failed to create pipes"); 216 exit(1); 217 } 218 219 if ((pid = fork()) < 0) 220 perror("failed to fork"); 221 222 if (!pid) { 223 close(child_ready_pipe[0]); 224 close(go_pipe[1]); 225 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); 226 227 /* 228 * Do a dummy execvp to get the PLT entry resolved, 229 * so we avoid the resolver overhead on the real 230 * execvp call. 231 */ 232 execvp("", (char **)argv); 233 234 /* 235 * Tell the parent we're ready to go 236 */ 237 close(child_ready_pipe[1]); 238 239 /* 240 * Wait until the parent tells us to go. 241 */ 242 if (read(go_pipe[0], &buf, 1) == -1) 243 perror("unable to read pipe"); 244 245 execvp(argv[0], (char **)argv); 246 247 perror(argv[0]); 248 exit(-1); 249 } 250 251 /* 252 * Wait for the child to be ready to exec. 253 */ 254 close(child_ready_pipe[1]); 255 close(go_pipe[0]); 256 if (read(child_ready_pipe[0], &buf, 1) == -1) 257 perror("unable to read pipe"); 258 close(child_ready_pipe[0]); 259 260 for (counter = 0; counter < nr_counters; counter++) 261 create_perf_stat_counter(counter, pid); 262 263 /* 264 * Enable counters and exec the command: 265 */ 266 t0 = rdclock(); 267 268 close(go_pipe[1]); 269 wait(&status); 270 271 t1 = rdclock(); 272 273 walltime_nsecs[run_idx] = t1 - t0; 274 275 for (counter = 0; counter < nr_counters; counter++) 276 read_counter(counter); 277 278 return WEXITSTATUS(status); 279 } 280 281 static void print_noise(u64 *count, u64 *noise) 282 { 283 if (run_count > 1) 284 fprintf(stderr, " ( +- %7.3f%% )", 285 (double)noise[0]/(count[0]+1)*100.0); 286 } 287 288 static void nsec_printout(int counter, u64 *count, u64 *noise) 289 { 290 double msecs = (double)count[0] / 1000000; 291 292 fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter)); 293 294 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { 295 if (walltime_nsecs_avg) 296 fprintf(stderr, " # %10.3f CPUs ", 297 (double)count[0] / (double)walltime_nsecs_avg); 298 } 299 print_noise(count, noise); 300 } 301 302 static void abs_printout(int counter, u64 *count, u64 *noise) 303 { 304 fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter)); 305 306 if (runtime_cycles_avg && 307 MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { 308 fprintf(stderr, " # %10.3f IPC ", 309 (double)count[0] / (double)runtime_cycles_avg); 310 } else { 311 if (runtime_nsecs_avg) { 312 fprintf(stderr, " # %10.3f M/sec", 313 (double)count[0]/runtime_nsecs_avg*1000.0); 314 } 315 } 316 print_noise(count, noise); 317 } 318 319 /* 320 * Print out the results of a single counter: 321 */ 322 static void print_counter(int counter) 323 { 324 u64 *count, *noise; 325 int scaled; 326 327 count = event_res_avg[counter]; 328 noise = event_res_noise[counter]; 329 scaled = event_scaled_avg[counter]; 330 331 if (scaled == -1) { 332 fprintf(stderr, " %14s %-24s\n", 333 "<not counted>", event_name(counter)); 334 return; 335 } 336 337 if (nsec_counter(counter)) 338 nsec_printout(counter, count, noise); 339 else 340 abs_printout(counter, count, noise); 341 342 if (scaled) 343 fprintf(stderr, " (scaled from %.2f%%)", 344 (double) count[2] / count[1] * 100); 345 346 fprintf(stderr, "\n"); 347 } 348 349 /* 350 * normalize_noise noise values down to stddev: 351 */ 352 static void normalize_noise(u64 *val) 353 { 354 double res; 355 356 res = (double)*val / (run_count * sqrt((double)run_count)); 357 358 *val = (u64)res; 359 } 360 361 static void update_avg(const char *name, int idx, u64 *avg, u64 *val) 362 { 363 *avg += *val; 364 365 if (verbose > 1) 366 fprintf(stderr, "debug: %20s[%d]: %Ld\n", name, idx, *val); 367 } 368 /* 369 * Calculate the averages and noises: 370 */ 371 static void calc_avg(void) 372 { 373 int i, j; 374 375 if (verbose > 1) 376 fprintf(stderr, "\n"); 377 378 for (i = 0; i < run_count; i++) { 379 update_avg("runtime", 0, &runtime_nsecs_avg, runtime_nsecs + i); 380 update_avg("walltime", 0, &walltime_nsecs_avg, walltime_nsecs + i); 381 update_avg("runtime_cycles", 0, &runtime_cycles_avg, runtime_cycles + i); 382 383 for (j = 0; j < nr_counters; j++) { 384 update_avg("counter/0", j, 385 event_res_avg[j]+0, event_res[i][j]+0); 386 update_avg("counter/1", j, 387 event_res_avg[j]+1, event_res[i][j]+1); 388 update_avg("counter/2", j, 389 event_res_avg[j]+2, event_res[i][j]+2); 390 if (event_scaled[i][j] != (u64)-1) 391 update_avg("scaled", j, 392 event_scaled_avg + j, event_scaled[i]+j); 393 else 394 event_scaled_avg[j] = -1; 395 } 396 } 397 runtime_nsecs_avg /= run_count; 398 walltime_nsecs_avg /= run_count; 399 runtime_cycles_avg /= run_count; 400 401 for (j = 0; j < nr_counters; j++) { 402 event_res_avg[j][0] /= run_count; 403 event_res_avg[j][1] /= run_count; 404 event_res_avg[j][2] /= run_count; 405 } 406 407 for (i = 0; i < run_count; i++) { 408 runtime_nsecs_noise += 409 abs((s64)(runtime_nsecs[i] - runtime_nsecs_avg)); 410 walltime_nsecs_noise += 411 abs((s64)(walltime_nsecs[i] - walltime_nsecs_avg)); 412 runtime_cycles_noise += 413 abs((s64)(runtime_cycles[i] - runtime_cycles_avg)); 414 415 for (j = 0; j < nr_counters; j++) { 416 event_res_noise[j][0] += 417 abs((s64)(event_res[i][j][0] - event_res_avg[j][0])); 418 event_res_noise[j][1] += 419 abs((s64)(event_res[i][j][1] - event_res_avg[j][1])); 420 event_res_noise[j][2] += 421 abs((s64)(event_res[i][j][2] - event_res_avg[j][2])); 422 } 423 } 424 425 normalize_noise(&runtime_nsecs_noise); 426 normalize_noise(&walltime_nsecs_noise); 427 normalize_noise(&runtime_cycles_noise); 428 429 for (j = 0; j < nr_counters; j++) { 430 normalize_noise(&event_res_noise[j][0]); 431 normalize_noise(&event_res_noise[j][1]); 432 normalize_noise(&event_res_noise[j][2]); 433 } 434 } 435 436 static void print_stat(int argc, const char **argv) 437 { 438 int i, counter; 439 440 calc_avg(); 441 442 fflush(stdout); 443 444 fprintf(stderr, "\n"); 445 fprintf(stderr, " Performance counter stats for \'%s", argv[0]); 446 447 for (i = 1; i < argc; i++) 448 fprintf(stderr, " %s", argv[i]); 449 450 fprintf(stderr, "\'"); 451 if (run_count > 1) 452 fprintf(stderr, " (%d runs)", run_count); 453 fprintf(stderr, ":\n\n"); 454 455 for (counter = 0; counter < nr_counters; counter++) 456 print_counter(counter); 457 458 fprintf(stderr, "\n"); 459 fprintf(stderr, " %14.9f seconds time elapsed", 460 (double)walltime_nsecs_avg/1e9); 461 if (run_count > 1) { 462 fprintf(stderr, " ( +- %7.3f%% )", 463 100.0*(double)walltime_nsecs_noise/(double)walltime_nsecs_avg); 464 } 465 fprintf(stderr, "\n\n"); 466 } 467 468 static volatile int signr = -1; 469 470 static void skip_signal(int signo) 471 { 472 signr = signo; 473 } 474 475 static void sig_atexit(void) 476 { 477 if (signr == -1) 478 return; 479 480 signal(signr, SIG_DFL); 481 kill(getpid(), signr); 482 } 483 484 static const char * const stat_usage[] = { 485 "perf stat [<options>] <command>", 486 NULL 487 }; 488 489 static const struct option options[] = { 490 OPT_CALLBACK('e', "event", NULL, "event", 491 "event selector. use 'perf list' to list available events", 492 parse_events), 493 OPT_BOOLEAN('i', "inherit", &inherit, 494 "child tasks inherit counters"), 495 OPT_INTEGER('p', "pid", &target_pid, 496 "stat events on existing pid"), 497 OPT_BOOLEAN('a', "all-cpus", &system_wide, 498 "system-wide collection from all CPUs"), 499 OPT_BOOLEAN('S', "scale", &scale, 500 "scale/normalize counters"), 501 OPT_BOOLEAN('v', "verbose", &verbose, 502 "be more verbose (show counter open errors, etc)"), 503 OPT_INTEGER('r', "repeat", &run_count, 504 "repeat command and print average + stddev (max: 100)"), 505 OPT_BOOLEAN('n', "null", &null_run, 506 "null run - dont start any counters"), 507 OPT_END() 508 }; 509 510 int cmd_stat(int argc, const char **argv, const char *prefix __used) 511 { 512 int status; 513 514 argc = parse_options(argc, argv, options, stat_usage, 515 PARSE_OPT_STOP_AT_NON_OPTION); 516 if (!argc) 517 usage_with_options(stat_usage, options); 518 if (run_count <= 0 || run_count > MAX_RUN) 519 usage_with_options(stat_usage, options); 520 521 /* Set attrs and nr_counters if no event is selected and !null_run */ 522 if (!null_run && !nr_counters) { 523 memcpy(attrs, default_attrs, sizeof(default_attrs)); 524 nr_counters = ARRAY_SIZE(default_attrs); 525 } 526 527 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 528 assert(nr_cpus <= MAX_NR_CPUS); 529 assert((int)nr_cpus >= 0); 530 531 /* 532 * We dont want to block the signals - that would cause 533 * child tasks to inherit that and Ctrl-C would not work. 534 * What we want is for Ctrl-C to work in the exec()-ed 535 * task, but being ignored by perf stat itself: 536 */ 537 atexit(sig_atexit); 538 signal(SIGINT, skip_signal); 539 signal(SIGALRM, skip_signal); 540 signal(SIGABRT, skip_signal); 541 542 status = 0; 543 for (run_idx = 0; run_idx < run_count; run_idx++) { 544 if (run_count != 1 && verbose) 545 fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); 546 status = run_perf_stat(argc, argv); 547 } 548 549 print_stat(argc, argv); 550 551 return status; 552 } 553