1 /* 2 * builtin-record.c 3 * 4 * Builtin record command: Record the profile of a workload 5 * (or a CPU, or a PID) into the perf.data output file - for 6 * later analysis via perf report. 7 */ 8 #include "builtin.h" 9 10 #include "perf.h" 11 12 #include "util/util.h" 13 #include "util/parse-options.h" 14 #include "util/parse-events.h" 15 #include "util/string.h" 16 17 #include <unistd.h> 18 #include <sched.h> 19 20 #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) 21 #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) 22 23 static int fd[MAX_NR_CPUS][MAX_COUNTERS]; 24 25 static long default_interval = 100000; 26 27 static int nr_cpus = 0; 28 static unsigned int page_size; 29 static unsigned int mmap_pages = 128; 30 static int freq = 0; 31 static int output; 32 static const char *output_name = "perf.data"; 33 static int group = 0; 34 static unsigned int realtime_prio = 0; 35 static int system_wide = 0; 36 static pid_t target_pid = -1; 37 static int inherit = 1; 38 static int force = 0; 39 static int append_file = 0; 40 static int call_graph = 0; 41 static int verbose = 0; 42 43 static long samples; 44 static struct timeval last_read; 45 static struct timeval this_read; 46 47 static u64 bytes_written; 48 49 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; 50 51 static int nr_poll; 52 static int nr_cpu; 53 54 static int file_new = 1; 55 static struct perf_file_header file_header; 56 57 struct mmap_event { 58 struct perf_event_header header; 59 u32 pid; 60 u32 tid; 61 u64 start; 62 u64 len; 63 u64 pgoff; 64 char filename[PATH_MAX]; 65 }; 66 67 struct comm_event { 68 struct perf_event_header header; 69 u32 pid; 70 u32 tid; 71 char comm[16]; 72 }; 73 74 75 struct mmap_data { 76 int counter; 77 void *base; 78 unsigned int mask; 79 unsigned int prev; 80 }; 81 82 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; 83 84 static unsigned long mmap_read_head(struct mmap_data *md) 85 { 86 struct perf_counter_mmap_page *pc = md->base; 87 long head; 88 89 head = pc->data_head; 90 rmb(); 91 92 return head; 93 } 94 95 static void mmap_write_tail(struct mmap_data *md, unsigned long tail) 96 { 97 struct perf_counter_mmap_page *pc = md->base; 98 99 /* 100 * ensure all reads are done before we write the tail out. 101 */ 102 /* mb(); */ 103 pc->data_tail = tail; 104 } 105 106 static void write_output(void *buf, size_t size) 107 { 108 while (size) { 109 int ret = write(output, buf, size); 110 111 if (ret < 0) 112 die("failed to write"); 113 114 size -= ret; 115 buf += ret; 116 117 bytes_written += ret; 118 } 119 } 120 121 static void mmap_read(struct mmap_data *md) 122 { 123 unsigned int head = mmap_read_head(md); 124 unsigned int old = md->prev; 125 unsigned char *data = md->base + page_size; 126 unsigned long size; 127 void *buf; 128 int diff; 129 130 gettimeofday(&this_read, NULL); 131 132 /* 133 * If we're further behind than half the buffer, there's a chance 134 * the writer will bite our tail and mess up the samples under us. 135 * 136 * If we somehow ended up ahead of the head, we got messed up. 137 * 138 * In either case, truncate and restart at head. 139 */ 140 diff = head - old; 141 if (diff < 0) { 142 struct timeval iv; 143 unsigned long msecs; 144 145 timersub(&this_read, &last_read, &iv); 146 msecs = iv.tv_sec*1000 + iv.tv_usec/1000; 147 148 fprintf(stderr, "WARNING: failed to keep up with mmap data." 149 " Last read %lu msecs ago.\n", msecs); 150 151 /* 152 * head points to a known good entry, start there. 153 */ 154 old = head; 155 } 156 157 last_read = this_read; 158 159 if (old != head) 160 samples++; 161 162 size = head - old; 163 164 if ((old & md->mask) + size != (head & md->mask)) { 165 buf = &data[old & md->mask]; 166 size = md->mask + 1 - (old & md->mask); 167 old += size; 168 169 write_output(buf, size); 170 } 171 172 buf = &data[old & md->mask]; 173 size = head - old; 174 old += size; 175 176 write_output(buf, size); 177 178 md->prev = old; 179 mmap_write_tail(md, old); 180 } 181 182 static volatile int done = 0; 183 static volatile int signr = -1; 184 185 static void sig_handler(int sig) 186 { 187 done = 1; 188 signr = sig; 189 } 190 191 static void sig_atexit(void) 192 { 193 if (signr == -1) 194 return; 195 196 signal(signr, SIG_DFL); 197 kill(getpid(), signr); 198 } 199 200 static void pid_synthesize_comm_event(pid_t pid, int full) 201 { 202 struct comm_event comm_ev; 203 char filename[PATH_MAX]; 204 char bf[BUFSIZ]; 205 int fd; 206 size_t size; 207 char *field, *sep; 208 DIR *tasks; 209 struct dirent dirent, *next; 210 211 snprintf(filename, sizeof(filename), "/proc/%d/stat", pid); 212 213 fd = open(filename, O_RDONLY); 214 if (fd < 0) { 215 /* 216 * We raced with a task exiting - just return: 217 */ 218 if (verbose) 219 fprintf(stderr, "couldn't open %s\n", filename); 220 return; 221 } 222 if (read(fd, bf, sizeof(bf)) < 0) { 223 fprintf(stderr, "couldn't read %s\n", filename); 224 exit(EXIT_FAILURE); 225 } 226 close(fd); 227 228 /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */ 229 memset(&comm_ev, 0, sizeof(comm_ev)); 230 field = strchr(bf, '('); 231 if (field == NULL) 232 goto out_failure; 233 sep = strchr(++field, ')'); 234 if (sep == NULL) 235 goto out_failure; 236 size = sep - field; 237 memcpy(comm_ev.comm, field, size++); 238 239 comm_ev.pid = pid; 240 comm_ev.header.type = PERF_EVENT_COMM; 241 size = ALIGN(size, sizeof(u64)); 242 comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); 243 244 if (!full) { 245 comm_ev.tid = pid; 246 247 write_output(&comm_ev, comm_ev.header.size); 248 return; 249 } 250 251 snprintf(filename, sizeof(filename), "/proc/%d/task", pid); 252 253 tasks = opendir(filename); 254 while (!readdir_r(tasks, &dirent, &next) && next) { 255 char *end; 256 pid = strtol(dirent.d_name, &end, 10); 257 if (*end) 258 continue; 259 260 comm_ev.tid = pid; 261 262 write_output(&comm_ev, comm_ev.header.size); 263 } 264 closedir(tasks); 265 return; 266 267 out_failure: 268 fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n", 269 filename); 270 exit(EXIT_FAILURE); 271 } 272 273 static void pid_synthesize_mmap_samples(pid_t pid) 274 { 275 char filename[PATH_MAX]; 276 FILE *fp; 277 278 snprintf(filename, sizeof(filename), "/proc/%d/maps", pid); 279 280 fp = fopen(filename, "r"); 281 if (fp == NULL) { 282 /* 283 * We raced with a task exiting - just return: 284 */ 285 if (verbose) 286 fprintf(stderr, "couldn't open %s\n", filename); 287 return; 288 } 289 while (1) { 290 char bf[BUFSIZ], *pbf = bf; 291 struct mmap_event mmap_ev = { 292 .header.type = PERF_EVENT_MMAP, 293 }; 294 int n; 295 size_t size; 296 if (fgets(bf, sizeof(bf), fp) == NULL) 297 break; 298 299 /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ 300 n = hex2u64(pbf, &mmap_ev.start); 301 if (n < 0) 302 continue; 303 pbf += n + 1; 304 n = hex2u64(pbf, &mmap_ev.len); 305 if (n < 0) 306 continue; 307 pbf += n + 3; 308 if (*pbf == 'x') { /* vm_exec */ 309 char *execname = strrchr(bf, ' '); 310 311 if (execname == NULL || execname[1] != '/') 312 continue; 313 314 execname += 1; 315 size = strlen(execname); 316 execname[size - 1] = '\0'; /* Remove \n */ 317 memcpy(mmap_ev.filename, execname, size); 318 size = ALIGN(size, sizeof(u64)); 319 mmap_ev.len -= mmap_ev.start; 320 mmap_ev.header.size = (sizeof(mmap_ev) - 321 (sizeof(mmap_ev.filename) - size)); 322 mmap_ev.pid = pid; 323 mmap_ev.tid = pid; 324 325 write_output(&mmap_ev, mmap_ev.header.size); 326 } 327 } 328 329 fclose(fp); 330 } 331 332 static void synthesize_samples(void) 333 { 334 DIR *proc; 335 struct dirent dirent, *next; 336 337 proc = opendir("/proc"); 338 339 while (!readdir_r(proc, &dirent, &next) && next) { 340 char *end; 341 pid_t pid; 342 343 pid = strtol(dirent.d_name, &end, 10); 344 if (*end) /* only interested in proper numerical dirents */ 345 continue; 346 347 pid_synthesize_comm_event(pid, 1); 348 pid_synthesize_mmap_samples(pid); 349 } 350 351 closedir(proc); 352 } 353 354 static int group_fd; 355 356 static void create_counter(int counter, int cpu, pid_t pid) 357 { 358 struct perf_counter_attr *attr = attrs + counter; 359 int track = 1; 360 361 attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; 362 363 if (freq) { 364 attr->sample_type |= PERF_SAMPLE_PERIOD; 365 attr->freq = 1; 366 attr->sample_freq = freq; 367 } 368 369 if (call_graph) 370 attr->sample_type |= PERF_SAMPLE_CALLCHAIN; 371 372 if (file_new) { 373 file_header.sample_type = attr->sample_type; 374 } else { 375 if (file_header.sample_type != attr->sample_type) { 376 fprintf(stderr, "incompatible append\n"); 377 exit(-1); 378 } 379 } 380 381 attr->mmap = track; 382 attr->comm = track; 383 attr->inherit = (cpu < 0) && inherit; 384 attr->disabled = 1; 385 386 track = 0; /* only the first counter needs these */ 387 388 try_again: 389 fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); 390 391 if (fd[nr_cpu][counter] < 0) { 392 int err = errno; 393 394 if (err == EPERM) 395 die("Permission error - are you root?\n"); 396 397 /* 398 * If it's cycles then fall back to hrtimer 399 * based cpu-clock-tick sw counter, which 400 * is always available even if no PMU support: 401 */ 402 if (attr->type == PERF_TYPE_HARDWARE 403 && attr->config == PERF_COUNT_HW_CPU_CYCLES) { 404 405 if (verbose) 406 warning(" ... trying to fall back to cpu-clock-ticks\n"); 407 attr->type = PERF_TYPE_SOFTWARE; 408 attr->config = PERF_COUNT_SW_CPU_CLOCK; 409 goto try_again; 410 } 411 printf("\n"); 412 error("perfcounter syscall returned with %d (%s)\n", 413 fd[nr_cpu][counter], strerror(err)); 414 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); 415 exit(-1); 416 } 417 418 assert(fd[nr_cpu][counter] >= 0); 419 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK); 420 421 /* 422 * First counter acts as the group leader: 423 */ 424 if (group && group_fd == -1) 425 group_fd = fd[nr_cpu][counter]; 426 427 event_array[nr_poll].fd = fd[nr_cpu][counter]; 428 event_array[nr_poll].events = POLLIN; 429 nr_poll++; 430 431 mmap_array[nr_cpu][counter].counter = counter; 432 mmap_array[nr_cpu][counter].prev = 0; 433 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; 434 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, 435 PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0); 436 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { 437 error("failed to mmap with %d (%s)\n", errno, strerror(errno)); 438 exit(-1); 439 } 440 441 ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); 442 } 443 444 static void open_counters(int cpu, pid_t pid) 445 { 446 int counter; 447 448 if (pid > 0) { 449 pid_synthesize_comm_event(pid, 0); 450 pid_synthesize_mmap_samples(pid); 451 } 452 453 group_fd = -1; 454 for (counter = 0; counter < nr_counters; counter++) 455 create_counter(counter, cpu, pid); 456 457 nr_cpu++; 458 } 459 460 static void atexit_header(void) 461 { 462 file_header.data_size += bytes_written; 463 464 if (pwrite(output, &file_header, sizeof(file_header), 0) == -1) 465 perror("failed to write on file headers"); 466 } 467 468 static int __cmd_record(int argc, const char **argv) 469 { 470 int i, counter; 471 struct stat st; 472 pid_t pid; 473 int flags; 474 int ret; 475 476 page_size = sysconf(_SC_PAGE_SIZE); 477 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 478 assert(nr_cpus <= MAX_NR_CPUS); 479 assert(nr_cpus >= 0); 480 481 atexit(sig_atexit); 482 signal(SIGCHLD, sig_handler); 483 signal(SIGINT, sig_handler); 484 485 if (!stat(output_name, &st) && !force && !append_file) { 486 fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", 487 output_name); 488 exit(-1); 489 } 490 491 flags = O_CREAT|O_RDWR; 492 if (append_file) 493 file_new = 0; 494 else 495 flags |= O_TRUNC; 496 497 output = open(output_name, flags, S_IRUSR|S_IWUSR); 498 if (output < 0) { 499 perror("failed to create output file"); 500 exit(-1); 501 } 502 503 if (!file_new) { 504 if (read(output, &file_header, sizeof(file_header)) == -1) { 505 perror("failed to read file headers"); 506 exit(-1); 507 } 508 509 lseek(output, file_header.data_size, SEEK_CUR); 510 } 511 512 atexit(atexit_header); 513 514 if (!system_wide) { 515 open_counters(-1, target_pid != -1 ? target_pid : getpid()); 516 } else for (i = 0; i < nr_cpus; i++) 517 open_counters(i, target_pid); 518 519 if (target_pid == -1 && argc) { 520 pid = fork(); 521 if (pid < 0) 522 perror("failed to fork"); 523 524 if (!pid) { 525 if (execvp(argv[0], (char **)argv)) { 526 perror(argv[0]); 527 exit(-1); 528 } 529 } 530 } 531 532 if (realtime_prio) { 533 struct sched_param param; 534 535 param.sched_priority = realtime_prio; 536 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 537 printf("Could not set realtime priority.\n"); 538 exit(-1); 539 } 540 } 541 542 if (system_wide) 543 synthesize_samples(); 544 545 while (!done) { 546 int hits = samples; 547 548 for (i = 0; i < nr_cpu; i++) { 549 for (counter = 0; counter < nr_counters; counter++) 550 mmap_read(&mmap_array[i][counter]); 551 } 552 553 if (hits == samples) 554 ret = poll(event_array, nr_poll, 100); 555 } 556 557 /* 558 * Approximate RIP event size: 24 bytes. 559 */ 560 fprintf(stderr, 561 "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n", 562 (double)bytes_written / 1024.0 / 1024.0, 563 output_name, 564 bytes_written / 24); 565 566 return 0; 567 } 568 569 static const char * const record_usage[] = { 570 "perf record [<options>] [<command>]", 571 "perf record [<options>] -- <command> [<options>]", 572 NULL 573 }; 574 575 static const struct option options[] = { 576 OPT_CALLBACK('e', "event", NULL, "event", 577 "event selector. use 'perf list' to list available events", 578 parse_events), 579 OPT_INTEGER('p', "pid", &target_pid, 580 "record events on existing pid"), 581 OPT_INTEGER('r', "realtime", &realtime_prio, 582 "collect data with this RT SCHED_FIFO priority"), 583 OPT_BOOLEAN('a', "all-cpus", &system_wide, 584 "system-wide collection from all CPUs"), 585 OPT_BOOLEAN('A', "append", &append_file, 586 "append to the output file to do incremental profiling"), 587 OPT_BOOLEAN('f', "force", &force, 588 "overwrite existing data file"), 589 OPT_LONG('c', "count", &default_interval, 590 "event period to sample"), 591 OPT_STRING('o', "output", &output_name, "file", 592 "output file name"), 593 OPT_BOOLEAN('i', "inherit", &inherit, 594 "child tasks inherit counters"), 595 OPT_INTEGER('F', "freq", &freq, 596 "profile at this frequency"), 597 OPT_INTEGER('m', "mmap-pages", &mmap_pages, 598 "number of mmap data pages"), 599 OPT_BOOLEAN('g', "call-graph", &call_graph, 600 "do call-graph (stack chain/backtrace) recording"), 601 OPT_BOOLEAN('v', "verbose", &verbose, 602 "be more verbose (show counter open errors, etc)"), 603 OPT_END() 604 }; 605 606 int cmd_record(int argc, const char **argv, const char *prefix) 607 { 608 int counter; 609 610 argc = parse_options(argc, argv, options, record_usage, 0); 611 if (!argc && target_pid == -1 && !system_wide) 612 usage_with_options(record_usage, options); 613 614 if (!nr_counters) { 615 nr_counters = 1; 616 attrs[0].type = PERF_TYPE_HARDWARE; 617 attrs[0].config = PERF_COUNT_HW_CPU_CYCLES; 618 } 619 620 for (counter = 0; counter < nr_counters; counter++) { 621 if (attrs[counter].sample_period) 622 continue; 623 624 attrs[counter].sample_period = default_interval; 625 } 626 627 return __cmd_record(argc, argv); 628 } 629