1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * builtin-record.c 4 * 5 * Builtin record command: Record the profile of a workload 6 * (or a CPU, or a PID) into the perf.data output file - for 7 * later analysis via perf report. 8 */ 9 #include "builtin.h" 10 11 #include "util/build-id.h" 12 #include <subcmd/parse-options.h> 13 #include "util/parse-events.h" 14 #include "util/config.h" 15 16 #include "util/callchain.h" 17 #include "util/cgroup.h" 18 #include "util/header.h" 19 #include "util/event.h" 20 #include "util/evlist.h" 21 #include "util/evsel.h" 22 #include "util/debug.h" 23 #include "util/mmap.h" 24 #include "util/target.h" 25 #include "util/session.h" 26 #include "util/tool.h" 27 #include "util/symbol.h" 28 #include "util/record.h" 29 #include "util/cpumap.h" 30 #include "util/thread_map.h" 31 #include "util/data.h" 32 #include "util/perf_regs.h" 33 #include "util/auxtrace.h" 34 #include "util/tsc.h" 35 #include "util/parse-branch-options.h" 36 #include "util/parse-regs-options.h" 37 #include "util/llvm-utils.h" 38 #include "util/bpf-loader.h" 39 #include "util/trigger.h" 40 #include "util/perf-hooks.h" 41 #include "util/cpu-set-sched.h" 42 #include "util/synthetic-events.h" 43 #include "util/time-utils.h" 44 #include "util/units.h" 45 #include "util/bpf-event.h" 46 #include "asm/bug.h" 47 #include "perf.h" 48 49 #include <errno.h> 50 #include <inttypes.h> 51 #include <locale.h> 52 #include <poll.h> 53 #include <unistd.h> 54 #include <sched.h> 55 #include <signal.h> 56 #include <sys/mman.h> 57 #include <sys/wait.h> 58 #include <sys/types.h> 59 #include <sys/stat.h> 60 #include <fcntl.h> 61 #include <linux/err.h> 62 #include <linux/string.h> 63 #include <linux/time64.h> 64 #include <linux/zalloc.h> 65 66 struct switch_output { 67 bool enabled; 68 bool signal; 69 unsigned long size; 70 unsigned long time; 71 const char *str; 72 bool set; 73 char **filenames; 74 int num_files; 75 int cur_file; 76 }; 77 78 struct record { 79 struct perf_tool tool; 80 struct record_opts opts; 81 u64 bytes_written; 82 struct perf_data data; 83 struct auxtrace_record *itr; 84 struct evlist *evlist; 85 struct perf_session *session; 86 int realtime_prio; 87 bool no_buildid; 88 bool no_buildid_set; 89 bool no_buildid_cache; 90 bool no_buildid_cache_set; 91 bool buildid_all; 92 bool timestamp_filename; 93 bool timestamp_boundary; 94 struct switch_output switch_output; 95 unsigned long long samples; 96 cpu_set_t affinity_mask; 97 unsigned long output_max_size; /* = 0: unlimited */ 98 }; 99 100 static volatile int done; 101 102 static volatile int auxtrace_record__snapshot_started; 103 static DEFINE_TRIGGER(auxtrace_snapshot_trigger); 104 static DEFINE_TRIGGER(switch_output_trigger); 105 106 static const char *affinity_tags[PERF_AFFINITY_MAX] = { 107 "SYS", "NODE", "CPU" 108 }; 109 110 static bool switch_output_signal(struct record *rec) 111 { 112 return rec->switch_output.signal && 113 trigger_is_ready(&switch_output_trigger); 114 } 115 116 static bool switch_output_size(struct record *rec) 117 { 118 return rec->switch_output.size && 119 trigger_is_ready(&switch_output_trigger) && 120 (rec->bytes_written >= rec->switch_output.size); 121 } 122 123 static bool switch_output_time(struct record *rec) 124 { 125 return rec->switch_output.time && 126 trigger_is_ready(&switch_output_trigger); 127 } 128 129 static bool record__output_max_size_exceeded(struct record *rec) 130 { 131 return rec->output_max_size && 132 (rec->bytes_written >= rec->output_max_size); 133 } 134 135 static int record__write(struct record *rec, 
struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				rec->bytes_written >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}

static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);

#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}

static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push(), so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * The aio write request may require a restart with the
		 * remainder if the kernel didn't write the whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}

static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * The started aio write is not complete yet,
				 * so it has to be waited on before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}

struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first
	 * move the part of the data from map->start to the upper bound and then
	 * the remainder from the beginning of the kernel buffer to the end of
	 * the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard the map->aio.data[] buffer
		 * from premature deallocation, because the map object can be
		 * released before the aio write request started on the
		 * map->aio.data[] buffer completes.
		 *
		 * perf_mmap__put() is done at record__aio_complete() after
		 * the started aio request completes, or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}

static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount (incremented in record__aio_pushfn())
		 * if the record__aio_write() operation failed to start; otherwise
		 * map->refcount is decremented in record__aio_complete() after the
		 * aio write operation finishes successfully.
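		 *
		 * In other words, the reference taken in record__aio_pushfn() is
		 * dropped exactly once: either here on the failed-start path, or
		 * in record__aio_complete() once the kernel has written out the
		 * whole aio buffer.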
353 */ 354 perf_mmap__put(&map->core); 355 } 356 357 return ret; 358 } 359 360 static off_t record__aio_get_pos(int trace_fd) 361 { 362 return lseek(trace_fd, 0, SEEK_CUR); 363 } 364 365 static void record__aio_set_pos(int trace_fd, off_t pos) 366 { 367 lseek(trace_fd, pos, SEEK_SET); 368 } 369 370 static void record__aio_mmap_read_sync(struct record *rec) 371 { 372 int i; 373 struct evlist *evlist = rec->evlist; 374 struct mmap *maps = evlist->mmap; 375 376 if (!record__aio_enabled(rec)) 377 return; 378 379 for (i = 0; i < evlist->core.nr_mmaps; i++) { 380 struct mmap *map = &maps[i]; 381 382 if (map->core.base) 383 record__aio_sync(map, true); 384 } 385 } 386 387 static int nr_cblocks_default = 1; 388 static int nr_cblocks_max = 4; 389 390 static int record__aio_parse(const struct option *opt, 391 const char *str, 392 int unset) 393 { 394 struct record_opts *opts = (struct record_opts *)opt->value; 395 396 if (unset) { 397 opts->nr_cblocks = 0; 398 } else { 399 if (str) 400 opts->nr_cblocks = strtol(str, NULL, 0); 401 if (!opts->nr_cblocks) 402 opts->nr_cblocks = nr_cblocks_default; 403 } 404 405 return 0; 406 } 407 #else /* HAVE_AIO_SUPPORT */ 408 static int nr_cblocks_max = 0; 409 410 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused, 411 off_t *off __maybe_unused) 412 { 413 return -1; 414 } 415 416 static off_t record__aio_get_pos(int trace_fd __maybe_unused) 417 { 418 return -1; 419 } 420 421 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused) 422 { 423 } 424 425 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused) 426 { 427 } 428 #endif 429 430 static int record__aio_enabled(struct record *rec) 431 { 432 return rec->opts.nr_cblocks > 0; 433 } 434 435 #define MMAP_FLUSH_DEFAULT 1 436 static int record__mmap_flush_parse(const struct option *opt, 437 const char *str, 438 int unset) 439 { 440 int flush_max; 441 struct record_opts *opts = (struct record_opts *)opt->value; 442 static struct parse_tag tags[] = { 443 { .tag = 'B', .mult = 1 }, 444 { .tag = 'K', .mult = 1 << 10 }, 445 { .tag = 'M', .mult = 1 << 20 }, 446 { .tag = 'G', .mult = 1 << 30 }, 447 { .tag = 0 }, 448 }; 449 450 if (unset) 451 return 0; 452 453 if (str) { 454 opts->mmap_flush = parse_tag_value(str, tags); 455 if (opts->mmap_flush == (int)-1) 456 opts->mmap_flush = strtol(str, NULL, 0); 457 } 458 459 if (!opts->mmap_flush) 460 opts->mmap_flush = MMAP_FLUSH_DEFAULT; 461 462 flush_max = evlist__mmap_size(opts->mmap_pages); 463 flush_max /= 4; 464 if (opts->mmap_flush > flush_max) 465 opts->mmap_flush = flush_max; 466 467 return 0; 468 } 469 470 #ifdef HAVE_ZSTD_SUPPORT 471 static unsigned int comp_level_default = 1; 472 473 static int record__parse_comp_level(const struct option *opt, const char *str, int unset) 474 { 475 struct record_opts *opts = opt->value; 476 477 if (unset) { 478 opts->comp_level = 0; 479 } else { 480 if (str) 481 opts->comp_level = strtol(str, NULL, 0); 482 if (!opts->comp_level) 483 opts->comp_level = comp_level_default; 484 } 485 486 return 0; 487 } 488 #endif 489 static unsigned int comp_level_max = 22; 490 491 static int record__comp_enabled(struct record *rec) 492 { 493 return rec->opts.comp_level > 0; 494 } 495 496 static int process_synthesized_event(struct perf_tool *tool, 497 union perf_event *event, 498 struct perf_sample *sample __maybe_unused, 499 struct machine *machine __maybe_unused) 500 { 501 struct record *rec = container_of(tool, struct record, tool); 502 return record__write(rec, 
NULL, event, event->header.size); 503 } 504 505 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) 506 { 507 struct record *rec = to; 508 509 if (record__comp_enabled(rec)) { 510 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size); 511 bf = map->data; 512 } 513 514 rec->samples++; 515 return record__write(rec, map, bf, size); 516 } 517 518 static volatile int signr = -1; 519 static volatile int child_finished; 520 521 static void sig_handler(int sig) 522 { 523 if (sig == SIGCHLD) 524 child_finished = 1; 525 else 526 signr = sig; 527 528 done = 1; 529 } 530 531 static void sigsegv_handler(int sig) 532 { 533 perf_hooks__recover(); 534 sighandler_dump_stack(sig); 535 } 536 537 static void record__sig_exit(void) 538 { 539 if (signr == -1) 540 return; 541 542 signal(signr, SIG_DFL); 543 raise(signr); 544 } 545 546 #ifdef HAVE_AUXTRACE_SUPPORT 547 548 static int record__process_auxtrace(struct perf_tool *tool, 549 struct mmap *map, 550 union perf_event *event, void *data1, 551 size_t len1, void *data2, size_t len2) 552 { 553 struct record *rec = container_of(tool, struct record, tool); 554 struct perf_data *data = &rec->data; 555 size_t padding; 556 u8 pad[8] = {0}; 557 558 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) { 559 off_t file_offset; 560 int fd = perf_data__fd(data); 561 int err; 562 563 file_offset = lseek(fd, 0, SEEK_CUR); 564 if (file_offset == -1) 565 return -1; 566 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index, 567 event, file_offset); 568 if (err) 569 return err; 570 } 571 572 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */ 573 padding = (len1 + len2) & 7; 574 if (padding) 575 padding = 8 - padding; 576 577 record__write(rec, map, event, event->header.size); 578 record__write(rec, map, data1, len1); 579 if (len2) 580 record__write(rec, map, data2, len2); 581 record__write(rec, map, &pad, padding); 582 583 return 0; 584 } 585 586 static int record__auxtrace_mmap_read(struct record *rec, 587 struct mmap *map) 588 { 589 int ret; 590 591 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool, 592 record__process_auxtrace); 593 if (ret < 0) 594 return ret; 595 596 if (ret) 597 rec->samples++; 598 599 return 0; 600 } 601 602 static int record__auxtrace_mmap_read_snapshot(struct record *rec, 603 struct mmap *map) 604 { 605 int ret; 606 607 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool, 608 record__process_auxtrace, 609 rec->opts.auxtrace_snapshot_size); 610 if (ret < 0) 611 return ret; 612 613 if (ret) 614 rec->samples++; 615 616 return 0; 617 } 618 619 static int record__auxtrace_read_snapshot_all(struct record *rec) 620 { 621 int i; 622 int rc = 0; 623 624 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) { 625 struct mmap *map = &rec->evlist->mmap[i]; 626 627 if (!map->auxtrace_mmap.base) 628 continue; 629 630 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) { 631 rc = -1; 632 goto out; 633 } 634 } 635 out: 636 return rc; 637 } 638 639 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit) 640 { 641 pr_debug("Recording AUX area tracing snapshot\n"); 642 if (record__auxtrace_read_snapshot_all(rec) < 0) { 643 trigger_error(&auxtrace_snapshot_trigger); 644 } else { 645 if (auxtrace_record__snapshot_finish(rec->itr, on_exit)) 646 trigger_error(&auxtrace_snapshot_trigger); 647 else 648 trigger_ready(&auxtrace_snapshot_trigger); 649 } 650 } 651 652 static int record__auxtrace_snapshot_exit(struct record *rec) 653 { 654 if 
(trigger_is_error(&auxtrace_snapshot_trigger)) 655 return 0; 656 657 if (!auxtrace_record__snapshot_started && 658 auxtrace_record__snapshot_start(rec->itr)) 659 return -1; 660 661 record__read_auxtrace_snapshot(rec, true); 662 if (trigger_is_error(&auxtrace_snapshot_trigger)) 663 return -1; 664 665 return 0; 666 } 667 668 static int record__auxtrace_init(struct record *rec) 669 { 670 int err; 671 672 if (!rec->itr) { 673 rec->itr = auxtrace_record__init(rec->evlist, &err); 674 if (err) 675 return err; 676 } 677 678 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts, 679 rec->opts.auxtrace_snapshot_opts); 680 if (err) 681 return err; 682 683 return auxtrace_parse_filters(rec->evlist); 684 } 685 686 #else 687 688 static inline 689 int record__auxtrace_mmap_read(struct record *rec __maybe_unused, 690 struct mmap *map __maybe_unused) 691 { 692 return 0; 693 } 694 695 static inline 696 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused, 697 bool on_exit __maybe_unused) 698 { 699 } 700 701 static inline 702 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused) 703 { 704 return 0; 705 } 706 707 static inline 708 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused) 709 { 710 return 0; 711 } 712 713 static int record__auxtrace_init(struct record *rec __maybe_unused) 714 { 715 return 0; 716 } 717 718 #endif 719 720 static bool record__kcore_readable(struct machine *machine) 721 { 722 char kcore[PATH_MAX]; 723 int fd; 724 725 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir); 726 727 fd = open(kcore, O_RDONLY); 728 if (fd < 0) 729 return false; 730 731 close(fd); 732 733 return true; 734 } 735 736 static int record__kcore_copy(struct machine *machine, struct perf_data *data) 737 { 738 char from_dir[PATH_MAX]; 739 char kcore_dir[PATH_MAX]; 740 int ret; 741 742 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir); 743 744 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir)); 745 if (ret) 746 return ret; 747 748 return kcore_copy(from_dir, kcore_dir); 749 } 750 751 static int record__mmap_evlist(struct record *rec, 752 struct evlist *evlist) 753 { 754 struct record_opts *opts = &rec->opts; 755 char msg[512]; 756 757 if (opts->affinity != PERF_AFFINITY_SYS) 758 cpu__setup_cpunode_map(); 759 760 if (evlist__mmap_ex(evlist, opts->mmap_pages, 761 opts->auxtrace_mmap_pages, 762 opts->auxtrace_snapshot_mode, 763 opts->nr_cblocks, opts->affinity, 764 opts->mmap_flush, opts->comp_level) < 0) { 765 if (errno == EPERM) { 766 pr_err("Permission error mapping pages.\n" 767 "Consider increasing " 768 "/proc/sys/kernel/perf_event_mlock_kb,\n" 769 "or try again with a smaller value of -m/--mmap_pages.\n" 770 "(current value: %u,%u)\n", 771 opts->mmap_pages, opts->auxtrace_mmap_pages); 772 return -errno; 773 } else { 774 pr_err("failed to mmap with %d (%s)\n", errno, 775 str_error_r(errno, msg, sizeof(msg))); 776 if (errno) 777 return -errno; 778 else 779 return -EINVAL; 780 } 781 } 782 return 0; 783 } 784 785 static int record__mmap(struct record *rec) 786 { 787 return record__mmap_evlist(rec, rec->evlist); 788 } 789 790 static int record__open(struct record *rec) 791 { 792 char msg[BUFSIZ]; 793 struct evsel *pos; 794 struct evlist *evlist = rec->evlist; 795 struct perf_session *session = rec->session; 796 struct record_opts *opts = &rec->opts; 797 int rc = 0; 798 799 /* 800 * For initial_delay we need to add a dummy event so that we can track 801 * PERF_RECORD_MMAP while we wait for the initial delay 
to enable the 802 * real events, the ones asked by the user. 803 */ 804 if (opts->initial_delay) { 805 if (perf_evlist__add_dummy(evlist)) 806 return -ENOMEM; 807 808 pos = evlist__first(evlist); 809 pos->tracking = 0; 810 pos = evlist__last(evlist); 811 pos->tracking = 1; 812 pos->core.attr.enable_on_exec = 1; 813 } 814 815 perf_evlist__config(evlist, opts, &callchain_param); 816 817 evlist__for_each_entry(evlist, pos) { 818 try_again: 819 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { 820 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { 821 if (verbose > 0) 822 ui__warning("%s\n", msg); 823 goto try_again; 824 } 825 if ((errno == EINVAL || errno == EBADF) && 826 pos->leader != pos && 827 pos->weak_group) { 828 pos = perf_evlist__reset_weak_group(evlist, pos); 829 goto try_again; 830 } 831 rc = -errno; 832 perf_evsel__open_strerror(pos, &opts->target, 833 errno, msg, sizeof(msg)); 834 ui__error("%s\n", msg); 835 goto out; 836 } 837 838 pos->supported = true; 839 } 840 841 if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) { 842 pr_warning( 843 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n" 844 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" 845 "Samples in kernel functions may not be resolved if a suitable vmlinux\n" 846 "file is not found in the buildid cache or in the vmlinux path.\n\n" 847 "Samples in kernel modules won't be resolved at all.\n\n" 848 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n" 849 "even with a suitable vmlinux or kallsyms file.\n\n"); 850 } 851 852 if (perf_evlist__apply_filters(evlist, &pos)) { 853 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", 854 pos->filter, perf_evsel__name(pos), errno, 855 str_error_r(errno, msg, sizeof(msg))); 856 rc = -1; 857 goto out; 858 } 859 860 rc = record__mmap(rec); 861 if (rc) 862 goto out; 863 864 session->evlist = evlist; 865 perf_session__set_id_hdr_size(session); 866 out: 867 return rc; 868 } 869 870 static int process_sample_event(struct perf_tool *tool, 871 union perf_event *event, 872 struct perf_sample *sample, 873 struct evsel *evsel, 874 struct machine *machine) 875 { 876 struct record *rec = container_of(tool, struct record, tool); 877 878 if (rec->evlist->first_sample_time == 0) 879 rec->evlist->first_sample_time = sample->time; 880 881 rec->evlist->last_sample_time = sample->time; 882 883 if (rec->buildid_all) 884 return 0; 885 886 rec->samples++; 887 return build_id__mark_dso_hit(tool, event, sample, evsel, machine); 888 } 889 890 static int process_buildids(struct record *rec) 891 { 892 struct perf_session *session = rec->session; 893 894 if (perf_data__size(&rec->data) == 0) 895 return 0; 896 897 /* 898 * During this process, it'll load kernel map and replace the 899 * dso->long_name to a real pathname it found. In this case 900 * we prefer the vmlinux path like 901 * /lib/modules/3.16.4/build/vmlinux 902 * 903 * rather than build-id path (in debug directory). 904 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551 905 */ 906 symbol_conf.ignore_vmlinux_buildid = true; 907 908 /* 909 * If --buildid-all is given, it marks all DSO regardless of hits, 910 * so no need to process samples. But if timestamp_boundary is enabled, 911 * it still needs to walk on all samples to get the timestamps of 912 * first/last samples. 
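	 *
	 * A sketch of the combined case (assuming both options are available
	 * in this build of perf record):
	 *
	 *   perf record --buildid-all --timestamp-boundary -- ./workload
	 *
	 * keeps the sample callback installed purely to note the first/last
	 * sample times, while build-id hit marking is still skipped in
	 * process_sample_event().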
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for the guest kernel, when processing the record and report
	 * subcommands we arrange the module mmaps prior to the guest kernel
	 * mmap and trigger a DSO preload, because by default guest module
	 * symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This method avoids missing symbols when the
	 * first address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
		CPU_ZERO(&rec->affinity_mask);
		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
	}
}

static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed += compressed;

	return compressed;
}

static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ?
evlist->overwrite_mmap : evlist->mmap; 1009 if (!maps) 1010 return 0; 1011 1012 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING) 1013 return 0; 1014 1015 if (record__aio_enabled(rec)) 1016 off = record__aio_get_pos(trace_fd); 1017 1018 for (i = 0; i < evlist->core.nr_mmaps; i++) { 1019 u64 flush = 0; 1020 struct mmap *map = &maps[i]; 1021 1022 if (map->core.base) { 1023 record__adjust_affinity(rec, map); 1024 if (synch) { 1025 flush = map->core.flush; 1026 map->core.flush = 1; 1027 } 1028 if (!record__aio_enabled(rec)) { 1029 if (perf_mmap__push(map, rec, record__pushfn) < 0) { 1030 if (synch) 1031 map->core.flush = flush; 1032 rc = -1; 1033 goto out; 1034 } 1035 } else { 1036 if (record__aio_push(rec, map, &off) < 0) { 1037 record__aio_set_pos(trace_fd, off); 1038 if (synch) 1039 map->core.flush = flush; 1040 rc = -1; 1041 goto out; 1042 } 1043 } 1044 if (synch) 1045 map->core.flush = flush; 1046 } 1047 1048 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode && 1049 record__auxtrace_mmap_read(rec, map) != 0) { 1050 rc = -1; 1051 goto out; 1052 } 1053 } 1054 1055 if (record__aio_enabled(rec)) 1056 record__aio_set_pos(trace_fd, off); 1057 1058 /* 1059 * Mark the round finished in case we wrote 1060 * at least one event. 1061 */ 1062 if (bytes_written != rec->bytes_written) 1063 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event)); 1064 1065 if (overwrite) 1066 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); 1067 out: 1068 return rc; 1069 } 1070 1071 static int record__mmap_read_all(struct record *rec, bool synch) 1072 { 1073 int err; 1074 1075 err = record__mmap_read_evlist(rec, rec->evlist, false, synch); 1076 if (err) 1077 return err; 1078 1079 return record__mmap_read_evlist(rec, rec->evlist, true, synch); 1080 } 1081 1082 static void record__init_features(struct record *rec) 1083 { 1084 struct perf_session *session = rec->session; 1085 int feat; 1086 1087 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++) 1088 perf_header__set_feat(&session->header, feat); 1089 1090 if (rec->no_buildid) 1091 perf_header__clear_feat(&session->header, HEADER_BUILD_ID); 1092 1093 if (!have_tracepoints(&rec->evlist->core.entries)) 1094 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA); 1095 1096 if (!rec->opts.branch_stack) 1097 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK); 1098 1099 if (!rec->opts.full_auxtrace) 1100 perf_header__clear_feat(&session->header, HEADER_AUXTRACE); 1101 1102 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) 1103 perf_header__clear_feat(&session->header, HEADER_CLOCKID); 1104 1105 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT); 1106 if (!record__comp_enabled(rec)) 1107 perf_header__clear_feat(&session->header, HEADER_COMPRESSED); 1108 1109 perf_header__clear_feat(&session->header, HEADER_STAT); 1110 } 1111 1112 static void 1113 record__finish_output(struct record *rec) 1114 { 1115 struct perf_data *data = &rec->data; 1116 int fd = perf_data__fd(data); 1117 1118 if (data->is_pipe) 1119 return; 1120 1121 rec->session->header.data_size += rec->bytes_written; 1122 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR); 1123 1124 if (!rec->no_buildid) { 1125 process_buildids(rec); 1126 1127 if (rec->buildid_all) 1128 dsos__hit_all(rec->session); 1129 } 1130 perf_session__write_header(rec->session, rec->evlist, fd, true); 1131 1132 return; 1133 } 1134 1135 static int record__synthesize_workload(struct record *rec, bool tail) 1136 { 1137 int err; 
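	/*
	 * Note: this synthesizes the non-sample (comm/mmap) events for just the
	 * forked workload's PID. The tail_synthesize check below makes it a
	 * no-op unless the 'tail' argument matches the --tail-synthesize
	 * setting, so the events end up either at the start or at the end of
	 * the output, not both.
	 */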
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						process_synthesized_event,
						&rec->session->machines.host,
						rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}

static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same size as "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist, which causes the newly created perf.data to
		 * lack map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}

static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload() will send a SIGUSR1
 * if the fork fails, since we asked for it by setting its
 * want_signal to true.
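 *
 * The failing child passes the exec() errno along with that signal (via a
 * queued si_value), so workload_exec_failed_signal() below can stash it in
 * workload_exec_errno and __cmd_record() can print a meaningful
 * "Workload failed: ..." message instead of dying silently.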
1233 */ 1234 static void workload_exec_failed_signal(int signo __maybe_unused, 1235 siginfo_t *info, 1236 void *ucontext __maybe_unused) 1237 { 1238 workload_exec_errno = info->si_value.sival_int; 1239 done = 1; 1240 child_finished = 1; 1241 } 1242 1243 static void snapshot_sig_handler(int sig); 1244 static void alarm_sig_handler(int sig); 1245 1246 static const struct perf_event_mmap_page * 1247 perf_evlist__pick_pc(struct evlist *evlist) 1248 { 1249 if (evlist) { 1250 if (evlist->mmap && evlist->mmap[0].core.base) 1251 return evlist->mmap[0].core.base; 1252 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base) 1253 return evlist->overwrite_mmap[0].core.base; 1254 } 1255 return NULL; 1256 } 1257 1258 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec) 1259 { 1260 const struct perf_event_mmap_page *pc; 1261 1262 pc = perf_evlist__pick_pc(rec->evlist); 1263 if (pc) 1264 return pc; 1265 return NULL; 1266 } 1267 1268 static int record__synthesize(struct record *rec, bool tail) 1269 { 1270 struct perf_session *session = rec->session; 1271 struct machine *machine = &session->machines.host; 1272 struct perf_data *data = &rec->data; 1273 struct record_opts *opts = &rec->opts; 1274 struct perf_tool *tool = &rec->tool; 1275 int fd = perf_data__fd(data); 1276 int err = 0; 1277 1278 if (rec->opts.tail_synthesize != tail) 1279 return 0; 1280 1281 if (data->is_pipe) { 1282 /* 1283 * We need to synthesize events first, because some 1284 * features works on top of them (on report side). 1285 */ 1286 err = perf_event__synthesize_attrs(tool, rec->evlist, 1287 process_synthesized_event); 1288 if (err < 0) { 1289 pr_err("Couldn't synthesize attrs.\n"); 1290 goto out; 1291 } 1292 1293 err = perf_event__synthesize_features(tool, session, rec->evlist, 1294 process_synthesized_event); 1295 if (err < 0) { 1296 pr_err("Couldn't synthesize features.\n"); 1297 return err; 1298 } 1299 1300 if (have_tracepoints(&rec->evlist->core.entries)) { 1301 /* 1302 * FIXME err <= 0 here actually means that 1303 * there were no tracepoints so its not really 1304 * an error, just that we don't need to 1305 * synthesize anything. We really have to 1306 * return this more properly and also 1307 * propagate errors that now are calling die() 1308 */ 1309 err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist, 1310 process_synthesized_event); 1311 if (err <= 0) { 1312 pr_err("Couldn't record tracing data.\n"); 1313 goto out; 1314 } 1315 rec->bytes_written += err; 1316 } 1317 } 1318 1319 err = perf_event__synth_time_conv(record__pick_pc(rec), tool, 1320 process_synthesized_event, machine); 1321 if (err) 1322 goto out; 1323 1324 if (rec->opts.full_auxtrace) { 1325 err = perf_event__synthesize_auxtrace_info(rec->itr, tool, 1326 session, process_synthesized_event); 1327 if (err) 1328 goto out; 1329 } 1330 1331 if (!perf_evlist__exclude_kernel(rec->evlist)) { 1332 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, 1333 machine); 1334 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" 1335 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 1336 "Check /proc/kallsyms permission or run as root.\n"); 1337 1338 err = perf_event__synthesize_modules(tool, process_synthesized_event, 1339 machine); 1340 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" 1341 "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" 1342 "Check /proc/modules permission or run as root.\n"); 1343 } 1344 1345 if (perf_guest) { 1346 machines__process_guests(&session->machines, 1347 perf_event__synthesize_guest_os, tool); 1348 } 1349 1350 err = perf_event__synthesize_extra_attr(&rec->tool, 1351 rec->evlist, 1352 process_synthesized_event, 1353 data->is_pipe); 1354 if (err) 1355 goto out; 1356 1357 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads, 1358 process_synthesized_event, 1359 NULL); 1360 if (err < 0) { 1361 pr_err("Couldn't synthesize thread map.\n"); 1362 return err; 1363 } 1364 1365 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus, 1366 process_synthesized_event, NULL); 1367 if (err < 0) { 1368 pr_err("Couldn't synthesize cpu map.\n"); 1369 return err; 1370 } 1371 1372 err = perf_event__synthesize_bpf_events(session, process_synthesized_event, 1373 machine, opts); 1374 if (err < 0) 1375 pr_warning("Couldn't synthesize bpf events.\n"); 1376 1377 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads, 1378 process_synthesized_event, opts->sample_address, 1379 1); 1380 out: 1381 return err; 1382 } 1383 1384 static int __cmd_record(struct record *rec, int argc, const char **argv) 1385 { 1386 int err; 1387 int status = 0; 1388 unsigned long waking = 0; 1389 const bool forks = argc > 0; 1390 struct perf_tool *tool = &rec->tool; 1391 struct record_opts *opts = &rec->opts; 1392 struct perf_data *data = &rec->data; 1393 struct perf_session *session; 1394 bool disabled = false, draining = false; 1395 struct evlist *sb_evlist = NULL; 1396 int fd; 1397 float ratio = 0; 1398 1399 atexit(record__sig_exit); 1400 signal(SIGCHLD, sig_handler); 1401 signal(SIGINT, sig_handler); 1402 signal(SIGTERM, sig_handler); 1403 signal(SIGSEGV, sigsegv_handler); 1404 1405 if (rec->opts.record_namespaces) 1406 tool->namespace_events = true; 1407 1408 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) { 1409 signal(SIGUSR2, snapshot_sig_handler); 1410 if (rec->opts.auxtrace_snapshot_mode) 1411 trigger_on(&auxtrace_snapshot_trigger); 1412 if (rec->switch_output.enabled) 1413 trigger_on(&switch_output_trigger); 1414 } else { 1415 signal(SIGUSR2, SIG_IGN); 1416 } 1417 1418 session = perf_session__new(data, false, tool); 1419 if (IS_ERR(session)) { 1420 pr_err("Perf session creation failed.\n"); 1421 return PTR_ERR(session); 1422 } 1423 1424 fd = perf_data__fd(data); 1425 rec->session = session; 1426 1427 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) { 1428 pr_err("Compression initialization failed.\n"); 1429 return -1; 1430 } 1431 1432 session->header.env.comp_type = PERF_COMP_ZSTD; 1433 session->header.env.comp_level = rec->opts.comp_level; 1434 1435 if (rec->opts.kcore && 1436 !record__kcore_readable(&session->machines.host)) { 1437 pr_err("ERROR: kcore is not readable.\n"); 1438 return -1; 1439 } 1440 1441 record__init_features(rec); 1442 1443 if (rec->opts.use_clockid && rec->opts.clockid_res_ns) 1444 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns; 1445 1446 if (forks) { 1447 err = perf_evlist__prepare_workload(rec->evlist, &opts->target, 1448 argv, data->is_pipe, 1449 workload_exec_failed_signal); 1450 if (err < 0) { 1451 pr_err("Couldn't run the workload!\n"); 1452 status = err; 1453 goto out_delete_session; 1454 } 1455 } 1456 1457 /* 1458 * If we have just single event and are sending data 1459 * through pipe, we need to force the ids allocation, 1460 * because we synthesize event name through 
the pipe 1461 * and need the id for that. 1462 */ 1463 if (data->is_pipe && rec->evlist->core.nr_entries == 1) 1464 rec->opts.sample_id = true; 1465 1466 if (record__open(rec) != 0) { 1467 err = -1; 1468 goto out_child; 1469 } 1470 session->header.env.comp_mmap_len = session->evlist->core.mmap_len; 1471 1472 if (rec->opts.kcore) { 1473 err = record__kcore_copy(&session->machines.host, data); 1474 if (err) { 1475 pr_err("ERROR: Failed to copy kcore\n"); 1476 goto out_child; 1477 } 1478 } 1479 1480 err = bpf__apply_obj_config(); 1481 if (err) { 1482 char errbuf[BUFSIZ]; 1483 1484 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); 1485 pr_err("ERROR: Apply config to BPF failed: %s\n", 1486 errbuf); 1487 goto out_child; 1488 } 1489 1490 /* 1491 * Normally perf_session__new would do this, but it doesn't have the 1492 * evlist. 1493 */ 1494 if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) { 1495 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n"); 1496 rec->tool.ordered_events = false; 1497 } 1498 1499 if (!rec->evlist->nr_groups) 1500 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC); 1501 1502 if (data->is_pipe) { 1503 err = perf_header__write_pipe(fd); 1504 if (err < 0) 1505 goto out_child; 1506 } else { 1507 err = perf_session__write_header(session, rec->evlist, fd, false); 1508 if (err < 0) 1509 goto out_child; 1510 } 1511 1512 if (!rec->no_buildid 1513 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { 1514 pr_err("Couldn't generate buildids. " 1515 "Use --no-buildid to profile anyway.\n"); 1516 err = -1; 1517 goto out_child; 1518 } 1519 1520 if (!opts->no_bpf_event) 1521 bpf_event__add_sb_event(&sb_evlist, &session->header.env); 1522 1523 if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) { 1524 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); 1525 opts->no_bpf_event = true; 1526 } 1527 1528 err = record__synthesize(rec, false); 1529 if (err < 0) 1530 goto out_child; 1531 1532 if (rec->realtime_prio) { 1533 struct sched_param param; 1534 1535 param.sched_priority = rec->realtime_prio; 1536 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 1537 pr_err("Could not set realtime priority.\n"); 1538 err = -1; 1539 goto out_child; 1540 } 1541 } 1542 1543 /* 1544 * When perf is starting the traced process, all the events 1545 * (apart from group members) have enable_on_exec=1 set, 1546 * so don't spoil it by prematurely enabling them. 1547 */ 1548 if (!target__none(&opts->target) && !opts->initial_delay) 1549 evlist__enable(rec->evlist); 1550 1551 /* 1552 * Let the child rip 1553 */ 1554 if (forks) { 1555 struct machine *machine = &session->machines.host; 1556 union perf_event *event; 1557 pid_t tgid; 1558 1559 event = malloc(sizeof(event->comm) + machine->id_hdr_size); 1560 if (event == NULL) { 1561 err = -ENOMEM; 1562 goto out_child; 1563 } 1564 1565 /* 1566 * Some H/W events are generated before COMM event 1567 * which is emitted during exec(), so perf script 1568 * cannot see a correct process name for those events. 1569 * Synthesize COMM event to prevent it. 
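		 *
		 * The same ordering concern applies to the PERF_RECORD_NAMESPACES
		 * event synthesized just below: it is emitted up front so that
		 * consumers of the stream already know the workload's namespaces
		 * when its samples start arriving.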
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize the NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap() ensures we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in the
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from the
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Re-enable events in the overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = evlist__poll(rec->evlist, -1);
			/*
			 * Propagate the error only if there's any. Ignore a
			 * positive number of returned events and the interrupt
			 * error.
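			 *
			 * (A zero return from evlist__filter_pollfd() below means
			 * every polled fd has reported POLLERR/POLLHUP, i.e. there
			 * is nothing left to wake up for, so 'draining' is set and
			 * the loop exits once no new data shows up.)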
1685 */ 1686 if (err > 0 || (err < 0 && errno == EINTR)) 1687 err = 0; 1688 waking++; 1689 1690 if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0) 1691 draining = true; 1692 } 1693 1694 /* 1695 * When perf is starting the traced process, at the end events 1696 * die with the process and we wait for that. Thus no need to 1697 * disable events in this case. 1698 */ 1699 if (done && !disabled && !target__none(&opts->target)) { 1700 trigger_off(&auxtrace_snapshot_trigger); 1701 evlist__disable(rec->evlist); 1702 disabled = true; 1703 } 1704 } 1705 1706 trigger_off(&auxtrace_snapshot_trigger); 1707 trigger_off(&switch_output_trigger); 1708 1709 if (opts->auxtrace_snapshot_on_exit) 1710 record__auxtrace_snapshot_exit(rec); 1711 1712 if (forks && workload_exec_errno) { 1713 char msg[STRERR_BUFSIZE]; 1714 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); 1715 pr_err("Workload failed: %s\n", emsg); 1716 err = -1; 1717 goto out_child; 1718 } 1719 1720 if (!quiet) 1721 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking); 1722 1723 if (target__none(&rec->opts.target)) 1724 record__synthesize_workload(rec, true); 1725 1726 out_child: 1727 record__mmap_read_all(rec, true); 1728 record__aio_mmap_read_sync(rec); 1729 1730 if (rec->session->bytes_transferred && rec->session->bytes_compressed) { 1731 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed; 1732 session->header.env.comp_ratio = ratio + 0.5; 1733 } 1734 1735 if (forks) { 1736 int exit_status; 1737 1738 if (!child_finished) 1739 kill(rec->evlist->workload.pid, SIGTERM); 1740 1741 wait(&exit_status); 1742 1743 if (err < 0) 1744 status = err; 1745 else if (WIFEXITED(exit_status)) 1746 status = WEXITSTATUS(exit_status); 1747 else if (WIFSIGNALED(exit_status)) 1748 signr = WTERMSIG(exit_status); 1749 } else 1750 status = err; 1751 1752 record__synthesize(rec, true); 1753 /* this will be recalculated during process_buildids() */ 1754 rec->samples = 0; 1755 1756 if (!err) { 1757 if (!rec->timestamp_filename) { 1758 record__finish_output(rec); 1759 } else { 1760 fd = record__switch_output(rec, true); 1761 if (fd < 0) { 1762 status = fd; 1763 goto out_delete_session; 1764 } 1765 } 1766 } 1767 1768 perf_hooks__invoke_record_end(); 1769 1770 if (!err && !quiet) { 1771 char samples[128]; 1772 const char *postfix = rec->timestamp_filename ? 
1773 ".<timestamp>" : ""; 1774 1775 if (rec->samples && !rec->opts.full_auxtrace) 1776 scnprintf(samples, sizeof(samples), 1777 " (%" PRIu64 " samples)", rec->samples); 1778 else 1779 samples[0] = '\0'; 1780 1781 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s", 1782 perf_data__size(data) / 1024.0 / 1024.0, 1783 data->path, postfix, samples); 1784 if (ratio) { 1785 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)", 1786 rec->session->bytes_transferred / 1024.0 / 1024.0, 1787 ratio); 1788 } 1789 fprintf(stderr, " ]\n"); 1790 } 1791 1792 out_delete_session: 1793 zstd_fini(&session->zstd_data); 1794 perf_session__delete(session); 1795 1796 if (!opts->no_bpf_event) 1797 perf_evlist__stop_sb_thread(sb_evlist); 1798 return status; 1799 } 1800 1801 static void callchain_debug(struct callchain_param *callchain) 1802 { 1803 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; 1804 1805 pr_debug("callchain: type %s\n", str[callchain->record_mode]); 1806 1807 if (callchain->record_mode == CALLCHAIN_DWARF) 1808 pr_debug("callchain: stack dump size %d\n", 1809 callchain->dump_size); 1810 } 1811 1812 int record_opts__parse_callchain(struct record_opts *record, 1813 struct callchain_param *callchain, 1814 const char *arg, bool unset) 1815 { 1816 int ret; 1817 callchain->enabled = !unset; 1818 1819 /* --no-call-graph */ 1820 if (unset) { 1821 callchain->record_mode = CALLCHAIN_NONE; 1822 pr_debug("callchain: disabled\n"); 1823 return 0; 1824 } 1825 1826 ret = parse_callchain_record_opt(arg, callchain); 1827 if (!ret) { 1828 /* Enable data address sampling for DWARF unwind. */ 1829 if (callchain->record_mode == CALLCHAIN_DWARF) 1830 record->sample_address = true; 1831 callchain_debug(callchain); 1832 } 1833 1834 return ret; 1835 } 1836 1837 int record_parse_callchain_opt(const struct option *opt, 1838 const char *arg, 1839 int unset) 1840 { 1841 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); 1842 } 1843 1844 int record_callchain_opt(const struct option *opt, 1845 const char *arg __maybe_unused, 1846 int unset __maybe_unused) 1847 { 1848 struct callchain_param *callchain = opt->value; 1849 1850 callchain->enabled = true; 1851 1852 if (callchain->record_mode == CALLCHAIN_NONE) 1853 callchain->record_mode = CALLCHAIN_FP; 1854 1855 callchain_debug(callchain); 1856 return 0; 1857 } 1858 1859 static int perf_record_config(const char *var, const char *value, void *cb) 1860 { 1861 struct record *rec = cb; 1862 1863 if (!strcmp(var, "record.build-id")) { 1864 if (!strcmp(value, "cache")) 1865 rec->no_buildid_cache = false; 1866 else if (!strcmp(value, "no-cache")) 1867 rec->no_buildid_cache = true; 1868 else if (!strcmp(value, "skip")) 1869 rec->no_buildid = true; 1870 else 1871 return -1; 1872 return 0; 1873 } 1874 if (!strcmp(var, "record.call-graph")) { 1875 var = "call-graph.record-mode"; 1876 return perf_default_config(var, value, cb); 1877 } 1878 #ifdef HAVE_AIO_SUPPORT 1879 if (!strcmp(var, "record.aio")) { 1880 rec->opts.nr_cblocks = strtol(value, NULL, 0); 1881 if (!rec->opts.nr_cblocks) 1882 rec->opts.nr_cblocks = nr_cblocks_default; 1883 } 1884 #endif 1885 1886 return 0; 1887 } 1888 1889 struct clockid_map { 1890 const char *name; 1891 int clockid; 1892 }; 1893 1894 #define CLOCKID_MAP(n, c) \ 1895 { .name = n, .clockid = (c), } 1896 1897 #define CLOCKID_END { .name = NULL, } 1898 1899 1900 /* 1901 * Add the missing ones, we need to build on many distros... 
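 *
 * The fallback values below mirror the kernel UAPI clock ids
 * (CLOCK_MONOTONIC_RAW = 4, CLOCK_BOOTTIME = 7, CLOCK_TAI = 11), so
 * parse_clockid() keeps working even when the libc headers predate them.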
1902 */ 1903 #ifndef CLOCK_MONOTONIC_RAW 1904 #define CLOCK_MONOTONIC_RAW 4 1905 #endif 1906 #ifndef CLOCK_BOOTTIME 1907 #define CLOCK_BOOTTIME 7 1908 #endif 1909 #ifndef CLOCK_TAI 1910 #define CLOCK_TAI 11 1911 #endif 1912 1913 static const struct clockid_map clockids[] = { 1914 /* available for all events, NMI safe */ 1915 CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), 1916 CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), 1917 1918 /* available for some events */ 1919 CLOCKID_MAP("realtime", CLOCK_REALTIME), 1920 CLOCKID_MAP("boottime", CLOCK_BOOTTIME), 1921 CLOCKID_MAP("tai", CLOCK_TAI), 1922 1923 /* available for the lazy */ 1924 CLOCKID_MAP("mono", CLOCK_MONOTONIC), 1925 CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), 1926 CLOCKID_MAP("real", CLOCK_REALTIME), 1927 CLOCKID_MAP("boot", CLOCK_BOOTTIME), 1928 1929 CLOCKID_END, 1930 }; 1931 1932 static int get_clockid_res(clockid_t clk_id, u64 *res_ns) 1933 { 1934 struct timespec res; 1935 1936 *res_ns = 0; 1937 if (!clock_getres(clk_id, &res)) 1938 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC; 1939 else 1940 pr_warning("WARNING: Failed to determine specified clock resolution.\n"); 1941 1942 return 0; 1943 } 1944 1945 static int parse_clockid(const struct option *opt, const char *str, int unset) 1946 { 1947 struct record_opts *opts = (struct record_opts *)opt->value; 1948 const struct clockid_map *cm; 1949 const char *ostr = str; 1950 1951 if (unset) { 1952 opts->use_clockid = 0; 1953 return 0; 1954 } 1955 1956 /* no arg passed */ 1957 if (!str) 1958 return 0; 1959 1960 /* no setting it twice */ 1961 if (opts->use_clockid) 1962 return -1; 1963 1964 opts->use_clockid = true; 1965 1966 /* if its a number, we're done */ 1967 if (sscanf(str, "%d", &opts->clockid) == 1) 1968 return get_clockid_res(opts->clockid, &opts->clockid_res_ns); 1969 1970 /* allow a "CLOCK_" prefix to the name */ 1971 if (!strncasecmp(str, "CLOCK_", 6)) 1972 str += 6; 1973 1974 for (cm = clockids; cm->name; cm++) { 1975 if (!strcasecmp(str, cm->name)) { 1976 opts->clockid = cm->clockid; 1977 return get_clockid_res(opts->clockid, 1978 &opts->clockid_res_ns); 1979 } 1980 } 1981 1982 opts->use_clockid = false; 1983 ui__warning("unknown clockid %s, check man page\n", ostr); 1984 return -1; 1985 } 1986 1987 static int record__parse_affinity(const struct option *opt, const char *str, int unset) 1988 { 1989 struct record_opts *opts = (struct record_opts *)opt->value; 1990 1991 if (unset || !str) 1992 return 0; 1993 1994 if (!strcasecmp(str, "node")) 1995 opts->affinity = PERF_AFFINITY_NODE; 1996 else if (!strcasecmp(str, "cpu")) 1997 opts->affinity = PERF_AFFINITY_CPU; 1998 1999 return 0; 2000 } 2001 2002 static int parse_output_max_size(const struct option *opt, 2003 const char *str, int unset) 2004 { 2005 unsigned long *s = (unsigned long *)opt->value; 2006 static struct parse_tag tags_size[] = { 2007 { .tag = 'B', .mult = 1 }, 2008 { .tag = 'K', .mult = 1 << 10 }, 2009 { .tag = 'M', .mult = 1 << 20 }, 2010 { .tag = 'G', .mult = 1 << 30 }, 2011 { .tag = 0 }, 2012 }; 2013 unsigned long val; 2014 2015 if (unset) { 2016 *s = 0; 2017 return 0; 2018 } 2019 2020 val = parse_tag_value(str, tags_size); 2021 if (val != (unsigned long) -1) { 2022 *s = val; 2023 return 0; 2024 } 2025 2026 return -1; 2027 } 2028 2029 static int record__parse_mmap_pages(const struct option *opt, 2030 const char *str, 2031 int unset __maybe_unused) 2032 { 2033 struct record_opts *opts = opt->value; 2034 char *s, *p; 2035 unsigned int mmap_pages; 2036 int ret; 2037 2038 if (!str) 2039 return -EINVAL; 2040 2041 
s = strdup(str); 2042 if (!s) 2043 return -ENOMEM; 2044 2045 p = strchr(s, ','); 2046 if (p) 2047 *p = '\0'; 2048 2049 if (*s) { 2050 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s); 2051 if (ret) 2052 goto out_free; 2053 opts->mmap_pages = mmap_pages; 2054 } 2055 2056 if (!p) { 2057 ret = 0; 2058 goto out_free; 2059 } 2060 2061 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1); 2062 if (ret) 2063 goto out_free; 2064 2065 opts->auxtrace_mmap_pages = mmap_pages; 2066 2067 out_free: 2068 free(s); 2069 return ret; 2070 } 2071 2072 static void switch_output_size_warn(struct record *rec) 2073 { 2074 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages); 2075 struct switch_output *s = &rec->switch_output; 2076 2077 wakeup_size /= 2; 2078 2079 if (s->size < wakeup_size) { 2080 char buf[100]; 2081 2082 unit_number__scnprintf(buf, sizeof(buf), wakeup_size); 2083 pr_warning("WARNING: switch-output data size lower than " 2084 "wakeup kernel buffer size (%s) " 2085 "expect bigger perf.data sizes\n", buf); 2086 } 2087 } 2088 2089 static int switch_output_setup(struct record *rec) 2090 { 2091 struct switch_output *s = &rec->switch_output; 2092 static struct parse_tag tags_size[] = { 2093 { .tag = 'B', .mult = 1 }, 2094 { .tag = 'K', .mult = 1 << 10 }, 2095 { .tag = 'M', .mult = 1 << 20 }, 2096 { .tag = 'G', .mult = 1 << 30 }, 2097 { .tag = 0 }, 2098 }; 2099 static struct parse_tag tags_time[] = { 2100 { .tag = 's', .mult = 1 }, 2101 { .tag = 'm', .mult = 60 }, 2102 { .tag = 'h', .mult = 60*60 }, 2103 { .tag = 'd', .mult = 60*60*24 }, 2104 { .tag = 0 }, 2105 }; 2106 unsigned long val; 2107 2108 if (!s->set) 2109 return 0; 2110 2111 if (!strcmp(s->str, "signal")) { 2112 s->signal = true; 2113 pr_debug("switch-output with SIGUSR2 signal\n"); 2114 goto enabled; 2115 } 2116 2117 val = parse_tag_value(s->str, tags_size); 2118 if (val != (unsigned long) -1) { 2119 s->size = val; 2120 pr_debug("switch-output with %s size threshold\n", s->str); 2121 goto enabled; 2122 } 2123 2124 val = parse_tag_value(s->str, tags_time); 2125 if (val != (unsigned long) -1) { 2126 s->time = val; 2127 pr_debug("switch-output with %s time threshold (%lu seconds)\n", 2128 s->str, s->time); 2129 goto enabled; 2130 } 2131 2132 return -1; 2133 2134 enabled: 2135 rec->timestamp_filename = true; 2136 s->enabled = true; 2137 2138 if (s->size && !rec->opts.no_buffering) 2139 switch_output_size_warn(rec); 2140 2141 return 0; 2142 } 2143 2144 static const char * const __record_usage[] = { 2145 "perf record [<options>] [<command>]", 2146 "perf record [<options>] -- <command> [<options>]", 2147 NULL 2148 }; 2149 const char * const *record_usage = __record_usage; 2150 2151 /* 2152 * XXX Ideally would be local to cmd_record() and passed to a record__new 2153 * because we need to have access to it in record__exit, that is called 2154 * after cmd_record() exits, but since record_options need to be accessible to 2155 * builtin-script, leave it here. 2156 * 2157 * At least we don't ouch it in all the other functions here directly. 2158 * 2159 * Just say no to tons of global variables, sigh. 
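 *
 * The UINT_MAX/ULLONG_MAX initializers below act as "not set by the user"
 * sentinels: option parsing overwrites them, and later setup code is expected
 * to fall back to the defaults (e.g. the 4000 Hz sample frequency) when they
 * are left untouched.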
 */
static struct record record = {
	.opts = {
		.sample_time = true,
		.mmap_pages = UINT_MAX,
		.user_freq = UINT_MAX,
		.user_interval = ULLONG_MAX,
		.freq = 4000,
		.target = {
			.uses_mmap = true,
			.default_per_cpu = true,
		},
		.mmap_flush = MMAP_FLUSH_DEFAULT,
	},
	.tool = {
		.sample = process_sample_event,
		.fork = perf_event__process_fork,
		.exit = perf_event__process_exit,
		.comm = perf_event__process_comm,
		.namespaces = perf_event__process_namespaces,
		.mmap = perf_event__process_mmap,
		.mmap2 = perf_event__process_mmap2,
		.ordered_events = true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		   "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
2243 "put the counters into a counter group"), 2244 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, 2245 NULL, "enables call-graph recording" , 2246 &record_callchain_opt), 2247 OPT_CALLBACK(0, "call-graph", &record.opts, 2248 "record_mode[,record_size]", record_callchain_help, 2249 &record_parse_callchain_opt), 2250 OPT_INCR('v', "verbose", &verbose, 2251 "be more verbose (show counter open errors, etc)"), 2252 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"), 2253 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, 2254 "per thread counts"), 2255 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), 2256 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, 2257 "Record the sample physical addresses"), 2258 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), 2259 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, 2260 &record.opts.sample_time_set, 2261 "Record the sample timestamps"), 2262 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set, 2263 "Record the sample period"), 2264 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples, 2265 "don't sample"), 2266 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache, 2267 &record.no_buildid_cache_set, 2268 "do not update the buildid cache"), 2269 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid, 2270 &record.no_buildid_set, 2271 "do not collect buildids in perf.data"), 2272 OPT_CALLBACK('G', "cgroup", &record.evlist, "name", 2273 "monitor event in cgroup name only", 2274 parse_cgroups), 2275 OPT_UINTEGER('D', "delay", &record.opts.initial_delay, 2276 "ms to wait before starting measurement after program start"), 2277 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"), 2278 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user", 2279 "user to profile"), 2280 2281 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack, 2282 "branch any", "sample any taken branches", 2283 parse_branch_stack), 2284 2285 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack, 2286 "branch filter mask", "branch stack filter modes", 2287 parse_branch_stack), 2288 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight, 2289 "sample by weight (on special events only)"), 2290 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction, 2291 "sample transaction flags (special events only)"), 2292 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread, 2293 "use per-thread mmaps"), 2294 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register", 2295 "sample selected machine registers on interrupt," 2296 " use '-I?' to list register names", parse_intr_regs), 2297 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register", 2298 "sample selected machine registers on interrupt," 2299 " use '--user-regs=?' 
to list register names", parse_user_regs), 2300 OPT_BOOLEAN(0, "running-time", &record.opts.running_time, 2301 "Record running/enabled time of read (:S) events"), 2302 OPT_CALLBACK('k', "clockid", &record.opts, 2303 "clockid", "clockid to use for events, see clock_gettime()", 2304 parse_clockid), 2305 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts, 2306 "opts", "AUX area tracing Snapshot Mode", ""), 2307 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout, 2308 "per thread proc mmap processing timeout in ms"), 2309 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces, 2310 "Record namespaces events"), 2311 OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events, 2312 "Record context switch events"), 2313 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel, 2314 "Configure all used events to run in kernel space.", 2315 PARSE_OPT_EXCLUSIVE), 2316 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user, 2317 "Configure all used events to run in user space.", 2318 PARSE_OPT_EXCLUSIVE), 2319 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains, 2320 "collect kernel callchains"), 2321 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains, 2322 "collect user callchains"), 2323 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path", 2324 "clang binary to use for compiling BPF scriptlets"), 2325 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options", 2326 "options passed to clang when compiling BPF scriptlets"), 2327 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name, 2328 "file", "vmlinux pathname"), 2329 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all, 2330 "Record build-id of all DSOs regardless of hits"), 2331 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, 2332 "append timestamp to output filename"), 2333 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary, 2334 "Record timestamp boundary (time of first/last samples)"), 2335 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str, 2336 &record.switch_output.set, "signal or size[BKMG] or time[smhd]", 2337 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", 2338 "signal"), 2339 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, 2340 "Limit number of switch output generated files"), 2341 OPT_BOOLEAN(0, "dry-run", &dry_run, 2342 "Parse options then exit"), 2343 #ifdef HAVE_AIO_SUPPORT 2344 OPT_CALLBACK_OPTARG(0, "aio", &record.opts, 2345 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)", 2346 record__aio_parse), 2347 #endif 2348 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu", 2349 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer", 2350 record__parse_affinity), 2351 #ifdef HAVE_ZSTD_SUPPORT 2352 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, 2353 "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)", 2354 record__parse_comp_level), 2355 #endif 2356 OPT_CALLBACK(0, "max-size", &record.output_max_size, 2357 "size", "Limit the maximum size of the output file", parse_output_max_size), 2358 OPT_END() 2359 }; 2360 2361 struct option *record_options = __record_options; 2362 2363 int cmd_record(int argc, const char **argv) 2364 { 2365 int err; 2366 struct record *rec = &record; 2367 char errbuf[BUFSIZ]; 2368 2369 setlocale(LC_ALL, ""); 2370 2371 #ifndef HAVE_LIBBPF_SUPPORT 2372 # 
define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c) 2373 set_nobuild('\0', "clang-path", true); 2374 set_nobuild('\0', "clang-opt", true); 2375 # undef set_nobuild 2376 #endif 2377 2378 #ifndef HAVE_BPF_PROLOGUE 2379 # if !defined (HAVE_DWARF_SUPPORT) 2380 # define REASON "NO_DWARF=1" 2381 # elif !defined (HAVE_LIBBPF_SUPPORT) 2382 # define REASON "NO_LIBBPF=1" 2383 # else 2384 # define REASON "this architecture doesn't support BPF prologue" 2385 # endif 2386 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c) 2387 set_nobuild('\0', "vmlinux", true); 2388 # undef set_nobuild 2389 # undef REASON 2390 #endif 2391 2392 CPU_ZERO(&rec->affinity_mask); 2393 rec->opts.affinity = PERF_AFFINITY_SYS; 2394 2395 rec->evlist = evlist__new(); 2396 if (rec->evlist == NULL) 2397 return -ENOMEM; 2398 2399 err = perf_config(perf_record_config, rec); 2400 if (err) 2401 return err; 2402 2403 argc = parse_options(argc, argv, record_options, record_usage, 2404 PARSE_OPT_STOP_AT_NON_OPTION); 2405 if (quiet) 2406 perf_quiet_option(); 2407 2408 /* Make system wide (-a) the default target. */ 2409 if (!argc && target__none(&rec->opts.target)) 2410 rec->opts.target.system_wide = true; 2411 2412 if (nr_cgroups && !rec->opts.target.system_wide) { 2413 usage_with_options_msg(record_usage, record_options, 2414 "cgroup monitoring only available in system-wide mode"); 2415 2416 } 2417 2418 if (rec->opts.kcore) 2419 rec->data.is_dir = true; 2420 2421 if (rec->opts.comp_level != 0) { 2422 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n"); 2423 rec->no_buildid = true; 2424 } 2425 2426 if (rec->opts.record_switch_events && 2427 !perf_can_record_switch_events()) { 2428 ui__error("kernel does not support recording context switch events\n"); 2429 parse_options_usage(record_usage, record_options, "switch-events", 0); 2430 return -EINVAL; 2431 } 2432 2433 if (switch_output_setup(rec)) { 2434 parse_options_usage(record_usage, record_options, "switch-output", 0); 2435 return -EINVAL; 2436 } 2437 2438 if (rec->switch_output.time) { 2439 signal(SIGALRM, alarm_sig_handler); 2440 alarm(rec->switch_output.time); 2441 } 2442 2443 if (rec->switch_output.num_files) { 2444 rec->switch_output.filenames = calloc(sizeof(char *), 2445 rec->switch_output.num_files); 2446 if (!rec->switch_output.filenames) 2447 return -EINVAL; 2448 } 2449 2450 /* 2451 * Allow aliases to facilitate the lookup of symbols for address 2452 * filters. Refer to auxtrace_parse_filters(). 2453 */ 2454 symbol_conf.allow_aliases = true; 2455 2456 symbol__init(NULL); 2457 2458 err = record__auxtrace_init(rec); 2459 if (err) 2460 goto out; 2461 2462 if (dry_run) 2463 goto out; 2464 2465 err = bpf__setup_stdout(rec->evlist); 2466 if (err) { 2467 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf)); 2468 pr_err("ERROR: Setup BPF stdout failed: %s\n", 2469 errbuf); 2470 goto out; 2471 } 2472 2473 err = -ENOMEM; 2474 2475 if (rec->no_buildid_cache || rec->no_buildid) { 2476 disable_buildid_cache(); 2477 } else if (rec->switch_output.enabled) { 2478 /* 2479 * In 'perf record --switch-output', disable buildid 2480 * generation by default to reduce data file switching 2481 * overhead. 
Still generate buildid if they are required 2482 * explicitly using 2483 * 2484 * perf record --switch-output --no-no-buildid \ 2485 * --no-no-buildid-cache 2486 * 2487 * Following code equals to: 2488 * 2489 * if ((rec->no_buildid || !rec->no_buildid_set) && 2490 * (rec->no_buildid_cache || !rec->no_buildid_cache_set)) 2491 * disable_buildid_cache(); 2492 */ 2493 bool disable = true; 2494 2495 if (rec->no_buildid_set && !rec->no_buildid) 2496 disable = false; 2497 if (rec->no_buildid_cache_set && !rec->no_buildid_cache) 2498 disable = false; 2499 if (disable) { 2500 rec->no_buildid = true; 2501 rec->no_buildid_cache = true; 2502 disable_buildid_cache(); 2503 } 2504 } 2505 2506 if (record.opts.overwrite) 2507 record.opts.tail_synthesize = true; 2508 2509 if (rec->evlist->core.nr_entries == 0 && 2510 __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) { 2511 pr_err("Not enough memory for event selector list\n"); 2512 goto out; 2513 } 2514 2515 if (rec->opts.target.tid && !rec->opts.no_inherit_set) 2516 rec->opts.no_inherit = true; 2517 2518 err = target__validate(&rec->opts.target); 2519 if (err) { 2520 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ); 2521 ui__warning("%s\n", errbuf); 2522 } 2523 2524 err = target__parse_uid(&rec->opts.target); 2525 if (err) { 2526 int saved_errno = errno; 2527 2528 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ); 2529 ui__error("%s", errbuf); 2530 2531 err = -saved_errno; 2532 goto out; 2533 } 2534 2535 /* Enable ignoring missing threads when -u/-p option is defined. */ 2536 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid; 2537 2538 err = -ENOMEM; 2539 if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0) 2540 usage_with_options(record_usage, record_options); 2541 2542 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts); 2543 if (err) 2544 goto out; 2545 2546 /* 2547 * We take all buildids when the file contains 2548 * AUX area tracing data because we do not decode the 2549 * trace because it would take too long. 2550 */ 2551 if (rec->opts.full_auxtrace) 2552 rec->buildid_all = true; 2553 2554 if (record_opts__config(&rec->opts)) { 2555 err = -EINVAL; 2556 goto out; 2557 } 2558 2559 if (rec->opts.nr_cblocks > nr_cblocks_max) 2560 rec->opts.nr_cblocks = nr_cblocks_max; 2561 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks); 2562 2563 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]); 2564 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush); 2565 2566 if (rec->opts.comp_level > comp_level_max) 2567 rec->opts.comp_level = comp_level_max; 2568 pr_debug("comp level: %d\n", rec->opts.comp_level); 2569 2570 err = __cmd_record(&record, argc, argv); 2571 out: 2572 evlist__delete(rec->evlist); 2573 symbol__exit(); 2574 auxtrace_record__free(rec->itr); 2575 return err; 2576 } 2577 2578 static void snapshot_sig_handler(int sig __maybe_unused) 2579 { 2580 struct record *rec = &record; 2581 2582 if (trigger_is_ready(&auxtrace_snapshot_trigger)) { 2583 trigger_hit(&auxtrace_snapshot_trigger); 2584 auxtrace_record__snapshot_started = 1; 2585 if (auxtrace_record__snapshot_start(record.itr)) 2586 trigger_error(&auxtrace_snapshot_trigger); 2587 } 2588 2589 if (switch_output_signal(rec)) 2590 trigger_hit(&switch_output_trigger); 2591 } 2592 2593 static void alarm_sig_handler(int sig __maybe_unused) 2594 { 2595 struct record *rec = &record; 2596 2597 if (switch_output_time(rec)) 2598 trigger_hit(&switch_output_trigger); 2599 } 2600
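
/*
 * Note on the switch-output flow (assuming SIGUSR2 is wired up to
 * snapshot_sig_handler() elsewhere in this file): --switch-output=signal is
 * driven by SIGUSR2 hitting switch_output_trigger above, while
 * --switch-output=<time> arms the SIGALRM serviced by alarm_sig_handler(),
 * set up via the signal()/alarm() calls in cmd_record().
 */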