1 /* 2 * builtin-record.c 3 * 4 * Builtin record command: Record the profile of a workload 5 * (or a CPU, or a PID) into the perf.data output file - for 6 * later analysis via perf report. 7 */ 8 #include "builtin.h" 9 10 #include "perf.h" 11 12 #include "util/util.h" 13 #include "util/parse-options.h" 14 #include "util/parse-events.h" 15 #include "util/string.h" 16 17 #include "util/header.h" 18 #include "util/event.h" 19 #include "util/debug.h" 20 #include "util/symbol.h" 21 22 #include <unistd.h> 23 #include <sched.h> 24 25 static int fd[MAX_NR_CPUS][MAX_COUNTERS]; 26 27 static long default_interval = 0; 28 29 static int nr_cpus = 0; 30 static unsigned int page_size; 31 static unsigned int mmap_pages = 128; 32 static int freq = 1000; 33 static int output; 34 static const char *output_name = "perf.data"; 35 static int group = 0; 36 static unsigned int realtime_prio = 0; 37 static int raw_samples = 0; 38 static int system_wide = 0; 39 static int profile_cpu = -1; 40 static pid_t target_pid = -1; 41 static pid_t child_pid = -1; 42 static int inherit = 1; 43 static int force = 0; 44 static int append_file = 0; 45 static int call_graph = 0; 46 static int inherit_stat = 0; 47 static int no_samples = 0; 48 static int sample_address = 0; 49 static int multiplex = 0; 50 static int multiplex_fd = -1; 51 52 static long samples = 0; 53 static struct timeval last_read; 54 static struct timeval this_read; 55 56 static u64 bytes_written = 0; 57 58 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; 59 60 static int nr_poll = 0; 61 static int nr_cpu = 0; 62 63 static int file_new = 1; 64 65 struct perf_header *header = NULL; 66 67 struct mmap_data { 68 int counter; 69 void *base; 70 unsigned int mask; 71 unsigned int prev; 72 }; 73 74 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; 75 76 static unsigned long mmap_read_head(struct mmap_data *md) 77 { 78 struct perf_event_mmap_page *pc = md->base; 79 long head; 80 81 head = pc->data_head; 82 rmb(); 83 84 return head; 85 } 86 87 static void mmap_write_tail(struct mmap_data *md, unsigned long tail) 88 { 89 struct perf_event_mmap_page *pc = md->base; 90 91 /* 92 * ensure all reads are done before we write the tail out. 93 */ 94 /* mb(); */ 95 pc->data_tail = tail; 96 } 97 98 static void write_output(void *buf, size_t size) 99 { 100 while (size) { 101 int ret = write(output, buf, size); 102 103 if (ret < 0) 104 die("failed to write"); 105 106 size -= ret; 107 buf += ret; 108 109 bytes_written += ret; 110 } 111 } 112 113 static void write_event(event_t *buf, size_t size) 114 { 115 /* 116 * Add it to the list of DSOs, so that when we finish this 117 * record session we can pick the available build-ids. 118 */ 119 if (buf->header.type == PERF_RECORD_MMAP) 120 dsos__findnew(buf->mmap.filename); 121 122 write_output(buf, size); 123 } 124 125 static int process_synthesized_event(event_t *event) 126 { 127 write_event(event, event->header.size); 128 return 0; 129 } 130 131 static void mmap_read(struct mmap_data *md) 132 { 133 unsigned int head = mmap_read_head(md); 134 unsigned int old = md->prev; 135 unsigned char *data = md->base + page_size; 136 unsigned long size; 137 void *buf; 138 int diff; 139 140 gettimeofday(&this_read, NULL); 141 142 /* 143 * If we're further behind than half the buffer, there's a chance 144 * the writer will bite our tail and mess up the samples under us. 145 * 146 * If we somehow ended up ahead of the head, we got messed up. 147 * 148 * In either case, truncate and restart at head. 149 */ 150 diff = head - old; 151 if (diff < 0) { 152 struct timeval iv; 153 unsigned long msecs; 154 155 timersub(&this_read, &last_read, &iv); 156 msecs = iv.tv_sec*1000 + iv.tv_usec/1000; 157 158 fprintf(stderr, "WARNING: failed to keep up with mmap data." 159 " Last read %lu msecs ago.\n", msecs); 160 161 /* 162 * head points to a known good entry, start there. 163 */ 164 old = head; 165 } 166 167 last_read = this_read; 168 169 if (old != head) 170 samples++; 171 172 size = head - old; 173 174 if ((old & md->mask) + size != (head & md->mask)) { 175 buf = &data[old & md->mask]; 176 size = md->mask + 1 - (old & md->mask); 177 old += size; 178 179 write_event(buf, size); 180 } 181 182 buf = &data[old & md->mask]; 183 size = head - old; 184 old += size; 185 186 write_event(buf, size); 187 188 md->prev = old; 189 mmap_write_tail(md, old); 190 } 191 192 static volatile int done = 0; 193 static volatile int signr = -1; 194 195 static void sig_handler(int sig) 196 { 197 done = 1; 198 signr = sig; 199 } 200 201 static void sig_atexit(void) 202 { 203 if (child_pid != -1) 204 kill(child_pid, SIGTERM); 205 206 if (signr == -1) 207 return; 208 209 signal(signr, SIG_DFL); 210 kill(getpid(), signr); 211 } 212 213 static int group_fd; 214 215 static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr) 216 { 217 struct perf_header_attr *h_attr; 218 219 if (nr < header->attrs) { 220 h_attr = header->attr[nr]; 221 } else { 222 h_attr = perf_header_attr__new(a); 223 if (h_attr != NULL) 224 if (perf_header__add_attr(header, h_attr) < 0) { 225 perf_header_attr__delete(h_attr); 226 h_attr = NULL; 227 } 228 } 229 230 return h_attr; 231 } 232 233 static void create_counter(int counter, int cpu, pid_t pid) 234 { 235 char *filter = filters[counter]; 236 struct perf_event_attr *attr = attrs + counter; 237 struct perf_header_attr *h_attr; 238 int track = !counter; /* only the first counter needs these */ 239 int ret; 240 struct { 241 u64 count; 242 u64 time_enabled; 243 u64 time_running; 244 u64 id; 245 } read_data; 246 247 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 248 PERF_FORMAT_TOTAL_TIME_RUNNING | 249 PERF_FORMAT_ID; 250 251 attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; 252 253 if (freq) { 254 attr->sample_type |= PERF_SAMPLE_PERIOD; 255 attr->freq = 1; 256 attr->sample_freq = freq; 257 } 258 259 if (no_samples) 260 attr->sample_freq = 0; 261 262 if (inherit_stat) 263 attr->inherit_stat = 1; 264 265 if (sample_address) 266 attr->sample_type |= PERF_SAMPLE_ADDR; 267 268 if (call_graph) 269 attr->sample_type |= PERF_SAMPLE_CALLCHAIN; 270 271 if (raw_samples) { 272 attr->sample_type |= PERF_SAMPLE_TIME; 273 attr->sample_type |= PERF_SAMPLE_RAW; 274 attr->sample_type |= PERF_SAMPLE_CPU; 275 } 276 277 attr->mmap = track; 278 attr->comm = track; 279 attr->inherit = (cpu < 0) && inherit; 280 attr->disabled = 1; 281 282 try_again: 283 fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0); 284 285 if (fd[nr_cpu][counter] < 0) { 286 int err = errno; 287 288 if (err == EPERM || err == EACCES) 289 die("Permission error - are you root?\n"); 290 else if (err == ENODEV && profile_cpu != -1) 291 die("No such device - did you specify an out-of-range profile CPU?\n"); 292 293 /* 294 * If it's cycles then fall back to hrtimer 295 * based cpu-clock-tick sw counter, which 296 * is always available even if no PMU support: 297 */ 298 if (attr->type == PERF_TYPE_HARDWARE 299 && attr->config == PERF_COUNT_HW_CPU_CYCLES) { 300 301 if (verbose) 302 warning(" ... trying to fall back to cpu-clock-ticks\n"); 303 attr->type = PERF_TYPE_SOFTWARE; 304 attr->config = PERF_COUNT_SW_CPU_CLOCK; 305 goto try_again; 306 } 307 printf("\n"); 308 error("perfcounter syscall returned with %d (%s)\n", 309 fd[nr_cpu][counter], strerror(err)); 310 311 #if defined(__i386__) || defined(__x86_64__) 312 if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP) 313 die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n"); 314 #endif 315 316 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); 317 exit(-1); 318 } 319 320 h_attr = get_header_attr(attr, counter); 321 if (h_attr == NULL) 322 die("nomem\n"); 323 324 if (!file_new) { 325 if (memcmp(&h_attr->attr, attr, sizeof(*attr))) { 326 fprintf(stderr, "incompatible append\n"); 327 exit(-1); 328 } 329 } 330 331 if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) { 332 perror("Unable to read perf file descriptor\n"); 333 exit(-1); 334 } 335 336 if (perf_header_attr__add_id(h_attr, read_data.id) < 0) { 337 pr_warning("Not enough memory to add id\n"); 338 exit(-1); 339 } 340 341 assert(fd[nr_cpu][counter] >= 0); 342 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK); 343 344 /* 345 * First counter acts as the group leader: 346 */ 347 if (group && group_fd == -1) 348 group_fd = fd[nr_cpu][counter]; 349 if (multiplex && multiplex_fd == -1) 350 multiplex_fd = fd[nr_cpu][counter]; 351 352 if (multiplex && fd[nr_cpu][counter] != multiplex_fd) { 353 354 ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd); 355 assert(ret != -1); 356 } else { 357 event_array[nr_poll].fd = fd[nr_cpu][counter]; 358 event_array[nr_poll].events = POLLIN; 359 nr_poll++; 360 361 mmap_array[nr_cpu][counter].counter = counter; 362 mmap_array[nr_cpu][counter].prev = 0; 363 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; 364 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, 365 PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0); 366 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { 367 error("failed to mmap with %d (%s)\n", errno, strerror(errno)); 368 exit(-1); 369 } 370 } 371 372 if (filter != NULL) { 373 ret = ioctl(fd[nr_cpu][counter], 374 PERF_EVENT_IOC_SET_FILTER, filter); 375 if (ret) { 376 error("failed to set filter with %d (%s)\n", errno, 377 strerror(errno)); 378 exit(-1); 379 } 380 } 381 382 ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE); 383 } 384 385 static void open_counters(int cpu, pid_t pid) 386 { 387 int counter; 388 389 group_fd = -1; 390 for (counter = 0; counter < nr_counters; counter++) 391 create_counter(counter, cpu, pid); 392 393 nr_cpu++; 394 } 395 396 static void atexit_header(void) 397 { 398 header->data_size += bytes_written; 399 400 perf_header__write(header, output, true); 401 } 402 403 static int __cmd_record(int argc, const char **argv) 404 { 405 int i, counter; 406 struct stat st; 407 pid_t pid = 0; 408 int flags; 409 int err; 410 unsigned long waking = 0; 411 412 page_size = sysconf(_SC_PAGE_SIZE); 413 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 414 assert(nr_cpus <= MAX_NR_CPUS); 415 assert(nr_cpus >= 0); 416 417 atexit(sig_atexit); 418 signal(SIGCHLD, sig_handler); 419 signal(SIGINT, sig_handler); 420 421 if (!stat(output_name, &st) && st.st_size) { 422 if (!force && !append_file) { 423 fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", 424 output_name); 425 exit(-1); 426 } 427 } else { 428 append_file = 0; 429 } 430 431 flags = O_CREAT|O_RDWR; 432 if (append_file) 433 file_new = 0; 434 else 435 flags |= O_TRUNC; 436 437 output = open(output_name, flags, S_IRUSR|S_IWUSR); 438 if (output < 0) { 439 perror("failed to create output file"); 440 exit(-1); 441 } 442 443 header = perf_header__new(); 444 if (header == NULL) { 445 pr_err("Not enough memory for reading perf file header\n"); 446 return -1; 447 } 448 449 if (!file_new) { 450 err = perf_header__read(header, output); 451 if (err < 0) 452 return err; 453 } 454 455 if (raw_samples) { 456 perf_header__set_feat(header, HEADER_TRACE_INFO); 457 } else { 458 for (i = 0; i < nr_counters; i++) { 459 if (attrs[i].sample_type & PERF_SAMPLE_RAW) { 460 perf_header__set_feat(header, HEADER_TRACE_INFO); 461 break; 462 } 463 } 464 } 465 466 atexit(atexit_header); 467 468 if (!system_wide) { 469 pid = target_pid; 470 if (pid == -1) 471 pid = getpid(); 472 473 open_counters(profile_cpu, pid); 474 } else { 475 if (profile_cpu != -1) { 476 open_counters(profile_cpu, target_pid); 477 } else { 478 for (i = 0; i < nr_cpus; i++) 479 open_counters(i, target_pid); 480 } 481 } 482 483 if (file_new) { 484 err = perf_header__write(header, output, false); 485 if (err < 0) 486 return err; 487 } 488 489 if (!system_wide) 490 event__synthesize_thread(pid, process_synthesized_event); 491 else 492 event__synthesize_threads(process_synthesized_event); 493 494 if (target_pid == -1 && argc) { 495 pid = fork(); 496 if (pid < 0) 497 die("failed to fork"); 498 499 if (!pid) { 500 if (execvp(argv[0], (char **)argv)) { 501 perror(argv[0]); 502 exit(-1); 503 } 504 } else { 505 /* 506 * Wait a bit for the execv'ed child to appear 507 * and be updated in /proc 508 * FIXME: Do you know a less heuristical solution? 509 */ 510 usleep(1000); 511 event__synthesize_thread(pid, 512 process_synthesized_event); 513 } 514 515 child_pid = pid; 516 } 517 518 if (realtime_prio) { 519 struct sched_param param; 520 521 param.sched_priority = realtime_prio; 522 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 523 pr_err("Could not set realtime priority.\n"); 524 exit(-1); 525 } 526 } 527 528 for (;;) { 529 int hits = samples; 530 531 for (i = 0; i < nr_cpu; i++) { 532 for (counter = 0; counter < nr_counters; counter++) { 533 if (mmap_array[i][counter].base) 534 mmap_read(&mmap_array[i][counter]); 535 } 536 } 537 538 if (hits == samples) { 539 if (done) 540 break; 541 err = poll(event_array, nr_poll, -1); 542 waking++; 543 } 544 545 if (done) { 546 for (i = 0; i < nr_cpu; i++) { 547 for (counter = 0; counter < nr_counters; counter++) 548 ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE); 549 } 550 } 551 } 552 553 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking); 554 555 /* 556 * Approximate RIP event size: 24 bytes. 557 */ 558 fprintf(stderr, 559 "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n", 560 (double)bytes_written / 1024.0 / 1024.0, 561 output_name, 562 bytes_written / 24); 563 564 return 0; 565 } 566 567 static const char * const record_usage[] = { 568 "perf record [<options>] [<command>]", 569 "perf record [<options>] -- <command> [<options>]", 570 NULL 571 }; 572 573 static const struct option options[] = { 574 OPT_CALLBACK('e', "event", NULL, "event", 575 "event selector. use 'perf list' to list available events", 576 parse_events), 577 OPT_CALLBACK(0, "filter", NULL, "filter", 578 "event filter", parse_filter), 579 OPT_INTEGER('p', "pid", &target_pid, 580 "record events on existing pid"), 581 OPT_INTEGER('r', "realtime", &realtime_prio, 582 "collect data with this RT SCHED_FIFO priority"), 583 OPT_BOOLEAN('R', "raw-samples", &raw_samples, 584 "collect raw sample records from all opened counters"), 585 OPT_BOOLEAN('a', "all-cpus", &system_wide, 586 "system-wide collection from all CPUs"), 587 OPT_BOOLEAN('A', "append", &append_file, 588 "append to the output file to do incremental profiling"), 589 OPT_INTEGER('C', "profile_cpu", &profile_cpu, 590 "CPU to profile on"), 591 OPT_BOOLEAN('f', "force", &force, 592 "overwrite existing data file"), 593 OPT_LONG('c', "count", &default_interval, 594 "event period to sample"), 595 OPT_STRING('o', "output", &output_name, "file", 596 "output file name"), 597 OPT_BOOLEAN('i', "inherit", &inherit, 598 "child tasks inherit counters"), 599 OPT_INTEGER('F', "freq", &freq, 600 "profile at this frequency"), 601 OPT_INTEGER('m', "mmap-pages", &mmap_pages, 602 "number of mmap data pages"), 603 OPT_BOOLEAN('g', "call-graph", &call_graph, 604 "do call-graph (stack chain/backtrace) recording"), 605 OPT_BOOLEAN('v', "verbose", &verbose, 606 "be more verbose (show counter open errors, etc)"), 607 OPT_BOOLEAN('s', "stat", &inherit_stat, 608 "per thread counts"), 609 OPT_BOOLEAN('d', "data", &sample_address, 610 "Sample addresses"), 611 OPT_BOOLEAN('n', "no-samples", &no_samples, 612 "don't sample"), 613 OPT_BOOLEAN('M', "multiplex", &multiplex, 614 "multiplex counter output in a single channel"), 615 OPT_END() 616 }; 617 618 int cmd_record(int argc, const char **argv, const char *prefix __used) 619 { 620 int counter; 621 622 symbol__init(0); 623 624 argc = parse_options(argc, argv, options, record_usage, 625 PARSE_OPT_STOP_AT_NON_OPTION); 626 if (!argc && target_pid == -1 && !system_wide) 627 usage_with_options(record_usage, options); 628 629 if (!nr_counters) { 630 nr_counters = 1; 631 attrs[0].type = PERF_TYPE_HARDWARE; 632 attrs[0].config = PERF_COUNT_HW_CPU_CYCLES; 633 } 634 635 /* 636 * User specified count overrides default frequency. 637 */ 638 if (default_interval) 639 freq = 0; 640 else if (freq) { 641 default_interval = freq; 642 } else { 643 fprintf(stderr, "frequency and count are zero, aborting\n"); 644 exit(EXIT_FAILURE); 645 } 646 647 for (counter = 0; counter < nr_counters; counter++) { 648 if (attrs[counter].sample_period) 649 continue; 650 651 attrs[counter].sample_period = default_interval; 652 } 653 654 return __cmd_record(argc, argv); 655 } 656