1 /* 2 * builtin-record.c 3 * 4 * Builtin record command: Record the profile of a workload 5 * (or a CPU, or a PID) into the perf.data output file - for 6 * later analysis via perf report. 7 */ 8 #include "builtin.h" 9 10 #include "perf.h" 11 12 #include "util/util.h" 13 #include "util/parse-options.h" 14 #include "util/parse-events.h" 15 #include "util/string.h" 16 17 #include "util/header.h" 18 19 #include <unistd.h> 20 #include <sched.h> 21 22 #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) 23 #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) 24 25 static int fd[MAX_NR_CPUS][MAX_COUNTERS]; 26 27 static long default_interval = 100000; 28 29 static int nr_cpus = 0; 30 static unsigned int page_size; 31 static unsigned int mmap_pages = 128; 32 static int freq = 0; 33 static int output; 34 static const char *output_name = "perf.data"; 35 static int group = 0; 36 static unsigned int realtime_prio = 0; 37 static int system_wide = 0; 38 static pid_t target_pid = -1; 39 static int inherit = 1; 40 static int force = 0; 41 static int append_file = 0; 42 static int call_graph = 0; 43 static int verbose = 0; 44 static int inherit_stat = 0; 45 static int no_samples = 0; 46 47 static long samples; 48 static struct timeval last_read; 49 static struct timeval this_read; 50 51 static u64 bytes_written; 52 53 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; 54 55 static int nr_poll; 56 static int nr_cpu; 57 58 static int file_new = 1; 59 60 struct perf_header *header; 61 62 struct mmap_event { 63 struct perf_event_header header; 64 u32 pid; 65 u32 tid; 66 u64 start; 67 u64 len; 68 u64 pgoff; 69 char filename[PATH_MAX]; 70 }; 71 72 struct comm_event { 73 struct perf_event_header header; 74 u32 pid; 75 u32 tid; 76 char comm[16]; 77 }; 78 79 80 struct mmap_data { 81 int counter; 82 void *base; 83 unsigned int mask; 84 unsigned int prev; 85 }; 86 87 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; 88 89 static unsigned long mmap_read_head(struct mmap_data *md) 90 { 91 struct perf_counter_mmap_page *pc = md->base; 92 long head; 93 94 head = pc->data_head; 95 rmb(); 96 97 return head; 98 } 99 100 static void mmap_write_tail(struct mmap_data *md, unsigned long tail) 101 { 102 struct perf_counter_mmap_page *pc = md->base; 103 104 /* 105 * ensure all reads are done before we write the tail out. 106 */ 107 /* mb(); */ 108 pc->data_tail = tail; 109 } 110 111 static void write_output(void *buf, size_t size) 112 { 113 while (size) { 114 int ret = write(output, buf, size); 115 116 if (ret < 0) 117 die("failed to write"); 118 119 size -= ret; 120 buf += ret; 121 122 bytes_written += ret; 123 } 124 } 125 126 static void mmap_read(struct mmap_data *md) 127 { 128 unsigned int head = mmap_read_head(md); 129 unsigned int old = md->prev; 130 unsigned char *data = md->base + page_size; 131 unsigned long size; 132 void *buf; 133 int diff; 134 135 gettimeofday(&this_read, NULL); 136 137 /* 138 * If we're further behind than half the buffer, there's a chance 139 * the writer will bite our tail and mess up the samples under us. 140 * 141 * If we somehow ended up ahead of the head, we got messed up. 142 * 143 * In either case, truncate and restart at head. 144 */ 145 diff = head - old; 146 if (diff < 0) { 147 struct timeval iv; 148 unsigned long msecs; 149 150 timersub(&this_read, &last_read, &iv); 151 msecs = iv.tv_sec*1000 + iv.tv_usec/1000; 152 153 fprintf(stderr, "WARNING: failed to keep up with mmap data." 154 " Last read %lu msecs ago.\n", msecs); 155 156 /* 157 * head points to a known good entry, start there. 158 */ 159 old = head; 160 } 161 162 last_read = this_read; 163 164 if (old != head) 165 samples++; 166 167 size = head - old; 168 169 if ((old & md->mask) + size != (head & md->mask)) { 170 buf = &data[old & md->mask]; 171 size = md->mask + 1 - (old & md->mask); 172 old += size; 173 174 write_output(buf, size); 175 } 176 177 buf = &data[old & md->mask]; 178 size = head - old; 179 old += size; 180 181 write_output(buf, size); 182 183 md->prev = old; 184 mmap_write_tail(md, old); 185 } 186 187 static volatile int done = 0; 188 static volatile int signr = -1; 189 190 static void sig_handler(int sig) 191 { 192 done = 1; 193 signr = sig; 194 } 195 196 static void sig_atexit(void) 197 { 198 if (signr == -1) 199 return; 200 201 signal(signr, SIG_DFL); 202 kill(getpid(), signr); 203 } 204 205 static void pid_synthesize_comm_event(pid_t pid, int full) 206 { 207 struct comm_event comm_ev; 208 char filename[PATH_MAX]; 209 char bf[BUFSIZ]; 210 int fd; 211 size_t size; 212 char *field, *sep; 213 DIR *tasks; 214 struct dirent dirent, *next; 215 216 snprintf(filename, sizeof(filename), "/proc/%d/stat", pid); 217 218 fd = open(filename, O_RDONLY); 219 if (fd < 0) { 220 /* 221 * We raced with a task exiting - just return: 222 */ 223 if (verbose) 224 fprintf(stderr, "couldn't open %s\n", filename); 225 return; 226 } 227 if (read(fd, bf, sizeof(bf)) < 0) { 228 fprintf(stderr, "couldn't read %s\n", filename); 229 exit(EXIT_FAILURE); 230 } 231 close(fd); 232 233 /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */ 234 memset(&comm_ev, 0, sizeof(comm_ev)); 235 field = strchr(bf, '('); 236 if (field == NULL) 237 goto out_failure; 238 sep = strchr(++field, ')'); 239 if (sep == NULL) 240 goto out_failure; 241 size = sep - field; 242 memcpy(comm_ev.comm, field, size++); 243 244 comm_ev.pid = pid; 245 comm_ev.header.type = PERF_EVENT_COMM; 246 size = ALIGN(size, sizeof(u64)); 247 comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); 248 249 if (!full) { 250 comm_ev.tid = pid; 251 252 write_output(&comm_ev, comm_ev.header.size); 253 return; 254 } 255 256 snprintf(filename, sizeof(filename), "/proc/%d/task", pid); 257 258 tasks = opendir(filename); 259 while (!readdir_r(tasks, &dirent, &next) && next) { 260 char *end; 261 pid = strtol(dirent.d_name, &end, 10); 262 if (*end) 263 continue; 264 265 comm_ev.tid = pid; 266 267 write_output(&comm_ev, comm_ev.header.size); 268 } 269 closedir(tasks); 270 return; 271 272 out_failure: 273 fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n", 274 filename); 275 exit(EXIT_FAILURE); 276 } 277 278 static void pid_synthesize_mmap_samples(pid_t pid) 279 { 280 char filename[PATH_MAX]; 281 FILE *fp; 282 283 snprintf(filename, sizeof(filename), "/proc/%d/maps", pid); 284 285 fp = fopen(filename, "r"); 286 if (fp == NULL) { 287 /* 288 * We raced with a task exiting - just return: 289 */ 290 if (verbose) 291 fprintf(stderr, "couldn't open %s\n", filename); 292 return; 293 } 294 while (1) { 295 char bf[BUFSIZ], *pbf = bf; 296 struct mmap_event mmap_ev = { 297 .header = { .type = PERF_EVENT_MMAP }, 298 }; 299 int n; 300 size_t size; 301 if (fgets(bf, sizeof(bf), fp) == NULL) 302 break; 303 304 /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ 305 n = hex2u64(pbf, &mmap_ev.start); 306 if (n < 0) 307 continue; 308 pbf += n + 1; 309 n = hex2u64(pbf, &mmap_ev.len); 310 if (n < 0) 311 continue; 312 pbf += n + 3; 313 if (*pbf == 'x') { /* vm_exec */ 314 char *execname = strchr(bf, '/'); 315 316 if (execname == NULL) 317 continue; 318 319 size = strlen(execname); 320 execname[size - 1] = '\0'; /* Remove \n */ 321 memcpy(mmap_ev.filename, execname, size); 322 size = ALIGN(size, sizeof(u64)); 323 mmap_ev.len -= mmap_ev.start; 324 mmap_ev.header.size = (sizeof(mmap_ev) - 325 (sizeof(mmap_ev.filename) - size)); 326 mmap_ev.pid = pid; 327 mmap_ev.tid = pid; 328 329 write_output(&mmap_ev, mmap_ev.header.size); 330 } 331 } 332 333 fclose(fp); 334 } 335 336 static void synthesize_all(void) 337 { 338 DIR *proc; 339 struct dirent dirent, *next; 340 341 proc = opendir("/proc"); 342 343 while (!readdir_r(proc, &dirent, &next) && next) { 344 char *end; 345 pid_t pid; 346 347 pid = strtol(dirent.d_name, &end, 10); 348 if (*end) /* only interested in proper numerical dirents */ 349 continue; 350 351 pid_synthesize_comm_event(pid, 1); 352 pid_synthesize_mmap_samples(pid); 353 } 354 355 closedir(proc); 356 } 357 358 static int group_fd; 359 360 static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr) 361 { 362 struct perf_header_attr *h_attr; 363 364 if (nr < header->attrs) { 365 h_attr = header->attr[nr]; 366 } else { 367 h_attr = perf_header_attr__new(a); 368 perf_header__add_attr(header, h_attr); 369 } 370 371 return h_attr; 372 } 373 374 static void create_counter(int counter, int cpu, pid_t pid) 375 { 376 struct perf_counter_attr *attr = attrs + counter; 377 struct perf_header_attr *h_attr; 378 int track = !counter; /* only the first counter needs these */ 379 struct { 380 u64 count; 381 u64 time_enabled; 382 u64 time_running; 383 u64 id; 384 } read_data; 385 386 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 387 PERF_FORMAT_TOTAL_TIME_RUNNING | 388 PERF_FORMAT_ID; 389 390 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; 391 392 if (freq) { 393 attr->sample_type |= PERF_SAMPLE_PERIOD; 394 attr->freq = 1; 395 attr->sample_freq = freq; 396 } 397 398 if (no_samples) 399 attr->sample_freq = 0; 400 401 if (inherit_stat) 402 attr->inherit_stat = 1; 403 404 if (call_graph) 405 attr->sample_type |= PERF_SAMPLE_CALLCHAIN; 406 407 attr->mmap = track; 408 attr->comm = track; 409 attr->inherit = (cpu < 0) && inherit; 410 attr->disabled = 1; 411 412 try_again: 413 fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); 414 415 if (fd[nr_cpu][counter] < 0) { 416 int err = errno; 417 418 if (err == EPERM) 419 die("Permission error - are you root?\n"); 420 421 /* 422 * If it's cycles then fall back to hrtimer 423 * based cpu-clock-tick sw counter, which 424 * is always available even if no PMU support: 425 */ 426 if (attr->type == PERF_TYPE_HARDWARE 427 && attr->config == PERF_COUNT_HW_CPU_CYCLES) { 428 429 if (verbose) 430 warning(" ... trying to fall back to cpu-clock-ticks\n"); 431 attr->type = PERF_TYPE_SOFTWARE; 432 attr->config = PERF_COUNT_SW_CPU_CLOCK; 433 goto try_again; 434 } 435 printf("\n"); 436 error("perfcounter syscall returned with %d (%s)\n", 437 fd[nr_cpu][counter], strerror(err)); 438 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); 439 exit(-1); 440 } 441 442 h_attr = get_header_attr(attr, counter); 443 444 if (!file_new) { 445 if (memcmp(&h_attr->attr, attr, sizeof(*attr))) { 446 fprintf(stderr, "incompatible append\n"); 447 exit(-1); 448 } 449 } 450 451 if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) { 452 perror("Unable to read perf file descriptor\n"); 453 exit(-1); 454 } 455 456 perf_header_attr__add_id(h_attr, read_data.id); 457 458 assert(fd[nr_cpu][counter] >= 0); 459 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK); 460 461 /* 462 * First counter acts as the group leader: 463 */ 464 if (group && group_fd == -1) 465 group_fd = fd[nr_cpu][counter]; 466 467 event_array[nr_poll].fd = fd[nr_cpu][counter]; 468 event_array[nr_poll].events = POLLIN; 469 nr_poll++; 470 471 mmap_array[nr_cpu][counter].counter = counter; 472 mmap_array[nr_cpu][counter].prev = 0; 473 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; 474 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, 475 PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0); 476 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { 477 error("failed to mmap with %d (%s)\n", errno, strerror(errno)); 478 exit(-1); 479 } 480 481 ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); 482 } 483 484 static void open_counters(int cpu, pid_t pid) 485 { 486 int counter; 487 488 group_fd = -1; 489 for (counter = 0; counter < nr_counters; counter++) 490 create_counter(counter, cpu, pid); 491 492 nr_cpu++; 493 } 494 495 static void atexit_header(void) 496 { 497 header->data_size += bytes_written; 498 499 perf_header__write(header, output); 500 } 501 502 static int __cmd_record(int argc, const char **argv) 503 { 504 int i, counter; 505 struct stat st; 506 pid_t pid = 0; 507 int flags; 508 int ret; 509 510 page_size = sysconf(_SC_PAGE_SIZE); 511 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 512 assert(nr_cpus <= MAX_NR_CPUS); 513 assert(nr_cpus >= 0); 514 515 atexit(sig_atexit); 516 signal(SIGCHLD, sig_handler); 517 signal(SIGINT, sig_handler); 518 519 if (!stat(output_name, &st) && !force && !append_file) { 520 fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", 521 output_name); 522 exit(-1); 523 } 524 525 flags = O_CREAT|O_RDWR; 526 if (append_file) 527 file_new = 0; 528 else 529 flags |= O_TRUNC; 530 531 output = open(output_name, flags, S_IRUSR|S_IWUSR); 532 if (output < 0) { 533 perror("failed to create output file"); 534 exit(-1); 535 } 536 537 if (!file_new) 538 header = perf_header__read(output); 539 else 540 header = perf_header__new(); 541 542 atexit(atexit_header); 543 544 if (!system_wide) { 545 pid = target_pid; 546 if (pid == -1) 547 pid = getpid(); 548 549 open_counters(-1, pid); 550 } else for (i = 0; i < nr_cpus; i++) 551 open_counters(i, target_pid); 552 553 if (file_new) 554 perf_header__write(header, output); 555 556 if (!system_wide) { 557 pid_synthesize_comm_event(pid, 0); 558 pid_synthesize_mmap_samples(pid); 559 } else 560 synthesize_all(); 561 562 if (target_pid == -1 && argc) { 563 pid = fork(); 564 if (pid < 0) 565 perror("failed to fork"); 566 567 if (!pid) { 568 if (execvp(argv[0], (char **)argv)) { 569 perror(argv[0]); 570 exit(-1); 571 } 572 } 573 } 574 575 if (realtime_prio) { 576 struct sched_param param; 577 578 param.sched_priority = realtime_prio; 579 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 580 printf("Could not set realtime priority.\n"); 581 exit(-1); 582 } 583 } 584 585 for (;;) { 586 int hits = samples; 587 588 for (i = 0; i < nr_cpu; i++) { 589 for (counter = 0; counter < nr_counters; counter++) 590 mmap_read(&mmap_array[i][counter]); 591 } 592 593 if (hits == samples) { 594 if (done) 595 break; 596 ret = poll(event_array, nr_poll, 100); 597 } 598 } 599 600 /* 601 * Approximate RIP event size: 24 bytes. 602 */ 603 fprintf(stderr, 604 "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n", 605 (double)bytes_written / 1024.0 / 1024.0, 606 output_name, 607 bytes_written / 24); 608 609 return 0; 610 } 611 612 static const char * const record_usage[] = { 613 "perf record [<options>] [<command>]", 614 "perf record [<options>] -- <command> [<options>]", 615 NULL 616 }; 617 618 static const struct option options[] = { 619 OPT_CALLBACK('e', "event", NULL, "event", 620 "event selector. use 'perf list' to list available events", 621 parse_events), 622 OPT_INTEGER('p', "pid", &target_pid, 623 "record events on existing pid"), 624 OPT_INTEGER('r', "realtime", &realtime_prio, 625 "collect data with this RT SCHED_FIFO priority"), 626 OPT_BOOLEAN('a', "all-cpus", &system_wide, 627 "system-wide collection from all CPUs"), 628 OPT_BOOLEAN('A', "append", &append_file, 629 "append to the output file to do incremental profiling"), 630 OPT_BOOLEAN('f', "force", &force, 631 "overwrite existing data file"), 632 OPT_LONG('c', "count", &default_interval, 633 "event period to sample"), 634 OPT_STRING('o', "output", &output_name, "file", 635 "output file name"), 636 OPT_BOOLEAN('i', "inherit", &inherit, 637 "child tasks inherit counters"), 638 OPT_INTEGER('F', "freq", &freq, 639 "profile at this frequency"), 640 OPT_INTEGER('m', "mmap-pages", &mmap_pages, 641 "number of mmap data pages"), 642 OPT_BOOLEAN('g', "call-graph", &call_graph, 643 "do call-graph (stack chain/backtrace) recording"), 644 OPT_BOOLEAN('v', "verbose", &verbose, 645 "be more verbose (show counter open errors, etc)"), 646 OPT_BOOLEAN('s', "stat", &inherit_stat, 647 "per thread counts"), 648 OPT_BOOLEAN('n', "no-samples", &no_samples, 649 "don't sample"), 650 OPT_END() 651 }; 652 653 int cmd_record(int argc, const char **argv, const char *prefix __used) 654 { 655 int counter; 656 657 argc = parse_options(argc, argv, options, record_usage, 0); 658 if (!argc && target_pid == -1 && !system_wide) 659 usage_with_options(record_usage, options); 660 661 if (!nr_counters) { 662 nr_counters = 1; 663 attrs[0].type = PERF_TYPE_HARDWARE; 664 attrs[0].config = PERF_COUNT_HW_CPU_CYCLES; 665 } 666 667 for (counter = 0; counter < nr_counters; counter++) { 668 if (attrs[counter].sample_period) 669 continue; 670 671 attrs[counter].sample_period = default_interval; 672 } 673 674 return __cmd_record(argc, argv); 675 } 676