1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * builtin-record.c 4 * 5 * Builtin record command: Record the profile of a workload 6 * (or a CPU, or a PID) into the perf.data output file - for 7 * later analysis via perf report. 8 */ 9 #include "builtin.h" 10 11 #include "util/build-id.h" 12 #include <subcmd/parse-options.h> 13 #include <internal/xyarray.h> 14 #include "util/parse-events.h" 15 #include "util/config.h" 16 17 #include "util/arm64-frame-pointer-unwind-support.h" 18 #include "util/callchain.h" 19 #include "util/cgroup.h" 20 #include "util/header.h" 21 #include "util/event.h" 22 #include "util/evlist.h" 23 #include "util/evsel.h" 24 #include "util/debug.h" 25 #include "util/mmap.h" 26 #include "util/mutex.h" 27 #include "util/target.h" 28 #include "util/session.h" 29 #include "util/tool.h" 30 #include "util/stat.h" 31 #include "util/symbol.h" 32 #include "util/record.h" 33 #include "util/cpumap.h" 34 #include "util/thread_map.h" 35 #include "util/data.h" 36 #include "util/perf_regs.h" 37 #include "util/auxtrace.h" 38 #include "util/tsc.h" 39 #include "util/parse-branch-options.h" 40 #include "util/parse-regs-options.h" 41 #include "util/perf_api_probe.h" 42 #include "util/trigger.h" 43 #include "util/perf-hooks.h" 44 #include "util/synthetic-events.h" 45 #include "util/time-utils.h" 46 #include "util/units.h" 47 #include "util/bpf-event.h" 48 #include "util/util.h" 49 #include "util/pfm.h" 50 #include "util/pmu.h" 51 #include "util/pmus.h" 52 #include "util/clockid.h" 53 #include "util/off_cpu.h" 54 #include "util/bpf-filter.h" 55 #include "util/strbuf.h" 56 #include "asm/bug.h" 57 #include "perf.h" 58 #include "cputopo.h" 59 #include "dwarf-regs.h" 60 61 #include <errno.h> 62 #include <inttypes.h> 63 #include <locale.h> 64 #include <poll.h> 65 #include <pthread.h> 66 #include <unistd.h> 67 #ifndef HAVE_GETTID 68 #include <syscall.h> 69 #endif 70 #include <sched.h> 71 #include <signal.h> 72 #ifdef HAVE_EVENTFD_SUPPORT 73 #include <sys/eventfd.h> 74 #endif 
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
#include <linux/bitmap.h>
#include <sys/time.h>

/*
 * Output switching configuration. The signal, size and time members are the
 * conditions tested by switch_output_signal(), switch_output_size() and
 * switch_output_time() below.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;
	unsigned long	 size;	/* threshold compared against record::bytes_written */
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};

/* Per-thread CPU masks; 'maps' selects which evlist mmaps a thread services. */
struct thread_mask {
	struct mmap_cpu_mask	maps;
	struct mmap_cpu_mask	affinity;
};

/* State owned by one streaming thread (entry 0 describes the main thread). */
struct record_thread {
	pid_t			tid;	/* gettid() for entry 0, -1 otherwise at allocation */
	struct thread_mask	*mask;
	struct {
		int		msg[2];	/* msg[0] is polled by the thread (see ctlfd_pos) */
		int		ack[2];
	} pipes;
	struct fdarray		pollfd;
	int			ctlfd_pos;	/* index of pipes.msg[0] in pollfd, -1 if unused */
	int			nr_mmaps;	/* number of entries in maps[]/overwrite_maps[] */
	struct mmap		**maps;
	struct mmap		**overwrite_maps;
	struct record		*rec;
	unsigned long long	samples;	/* bumped by record__pushfn() */
	unsigned long		waking;
	u64			bytes_written;	/* bytes written to per-mmap files by this thread */
	u64			bytes_transferred;
	u64			bytes_compressed;
};

/* Thread-local pointer to the calling thread's record_thread. */
static __thread struct record_thread *thread;

enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable names, indexed by enum thread_msg. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	"UNDEFINED", "READY"
};

enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Printable names, indexed by enum thread_spec. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	"undefined", "cpu", "core", "package", "numa", "user"
};

/* Pairs an evlist pollfd slot with its duplicate in a thread's pollfd. */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};

/* Top-level state of one 'perf record' invocation. */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;		/* bytes written to the main data file */
	u64			thread_bytes_written;	/* bytes written to per-mmap files */
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			latency;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			buildid_mmap_set;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	const char		*filter_action;
	const char		*uid_str;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	int			nr_threads;
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;
	struct pollfd_index_map	*index_map;	/* grown on demand, see index_map_sz/cnt */
	size_t			index_map_sz;
	size_t			index_map_cnt;
};

/* Set to 1 by sig_handler() and when the output size limit is reached. */
static volatile int done;

static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};

/* Forward declarations of handlers defined later in this file. */
static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine);
static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine);
static int process_timestamp_boundary(const struct perf_tool *tool,
				      union perf_event *event,
				      struct perf_sample *sample,
				      struct machine *machine);

#ifndef HAVE_GETTID
/* Fallback used when the C library provides no gettid() wrapper. */
static inline pid_t gettid(void)
{
	return (pid_t)syscall(__NR_gettid);
}
#endif

/* Non-zero when a threads spec was given, i.e. parallel streaming is in use. */
static int record__threads_enabled(struct record *rec)
{
	return rec->opts.threads_spec;
}

/* True when signal-driven output switching is on and the trigger is ready. */
static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
return rec->switch_output.size && 233 trigger_is_ready(&switch_output_trigger) && 234 (rec->bytes_written >= rec->switch_output.size); 235 } 236 237 static bool switch_output_time(struct record *rec) 238 { 239 return rec->switch_output.time && 240 trigger_is_ready(&switch_output_trigger); 241 } 242 243 static u64 record__bytes_written(struct record *rec) 244 { 245 return rec->bytes_written + rec->thread_bytes_written; 246 } 247 248 static bool record__output_max_size_exceeded(struct record *rec) 249 { 250 return rec->output_max_size && 251 (record__bytes_written(rec) >= rec->output_max_size); 252 } 253 254 static int record__write(struct record *rec, struct mmap *map __maybe_unused, 255 void *bf, size_t size) 256 { 257 struct perf_data_file *file = &rec->session->data->file; 258 259 if (map && map->file) 260 file = map->file; 261 262 if (perf_data_file__write(file, bf, size) < 0) { 263 pr_err("failed to write perf data, error: %m\n"); 264 return -1; 265 } 266 267 if (map && map->file) { 268 thread->bytes_written += size; 269 rec->thread_bytes_written += size; 270 } else { 271 rec->bytes_written += size; 272 } 273 274 if (record__output_max_size_exceeded(rec) && !done) { 275 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB)," 276 " stopping session ]\n", 277 record__bytes_written(rec) >> 10); 278 done = 1; 279 } 280 281 if (switch_output_size(rec)) 282 trigger_hit(&switch_output_trigger); 283 284 return 0; 285 } 286 287 static int record__aio_enabled(struct record *rec); 288 static int record__comp_enabled(struct record *rec); 289 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map, 290 void *dst, size_t dst_size, void *src, size_t src_size); 291 292 #ifdef HAVE_AIO_SUPPORT 293 static int record__aio_write(struct aiocb *cblock, int trace_fd, 294 void *buf, size_t size, off_t off) 295 { 296 int rc; 297 298 cblock->aio_fildes = trace_fd; 299 cblock->aio_buf = buf; 300 cblock->aio_nbytes = size; 301 cblock->aio_offset = 
off; 302 cblock->aio_sigevent.sigev_notify = SIGEV_NONE; 303 304 do { 305 rc = aio_write(cblock); 306 if (rc == 0) { 307 break; 308 } else if (errno != EAGAIN) { 309 cblock->aio_fildes = -1; 310 pr_err("failed to queue perf data, error: %m\n"); 311 break; 312 } 313 } while (1); 314 315 return rc; 316 } 317 318 static int record__aio_complete(struct mmap *md, struct aiocb *cblock) 319 { 320 void *rem_buf; 321 off_t rem_off; 322 size_t rem_size; 323 int rc, aio_errno; 324 ssize_t aio_ret, written; 325 326 aio_errno = aio_error(cblock); 327 if (aio_errno == EINPROGRESS) 328 return 0; 329 330 written = aio_ret = aio_return(cblock); 331 if (aio_ret < 0) { 332 if (aio_errno != EINTR) 333 pr_err("failed to write perf data, error: %m\n"); 334 written = 0; 335 } 336 337 rem_size = cblock->aio_nbytes - written; 338 339 if (rem_size == 0) { 340 cblock->aio_fildes = -1; 341 /* 342 * md->refcount is incremented in record__aio_pushfn() for 343 * every aio write request started in record__aio_push() so 344 * decrement it because the request is now complete. 345 */ 346 perf_mmap__put(&md->core); 347 rc = 1; 348 } else { 349 /* 350 * aio write request may require restart with the 351 * remainder if the kernel didn't write whole 352 * chunk at once. 
353 */ 354 rem_off = cblock->aio_offset + written; 355 rem_buf = (void *)(cblock->aio_buf + written); 356 record__aio_write(cblock, cblock->aio_fildes, 357 rem_buf, rem_size, rem_off); 358 rc = 0; 359 } 360 361 return rc; 362 } 363 364 static int record__aio_sync(struct mmap *md, bool sync_all) 365 { 366 struct aiocb **aiocb = md->aio.aiocb; 367 struct aiocb *cblocks = md->aio.cblocks; 368 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */ 369 int i, do_suspend; 370 371 do { 372 do_suspend = 0; 373 for (i = 0; i < md->aio.nr_cblocks; ++i) { 374 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) { 375 if (sync_all) 376 aiocb[i] = NULL; 377 else 378 return i; 379 } else { 380 /* 381 * Started aio write is not complete yet 382 * so it has to be waited before the 383 * next allocation. 384 */ 385 aiocb[i] = &cblocks[i]; 386 do_suspend = 1; 387 } 388 } 389 if (!do_suspend) 390 return -1; 391 392 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) { 393 if (!(errno == EAGAIN || errno == EINTR)) 394 pr_err("failed to sync perf data, error: %m\n"); 395 } 396 } while (1); 397 } 398 399 struct record_aio { 400 struct record *rec; 401 void *data; 402 size_t size; 403 }; 404 405 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size) 406 { 407 struct record_aio *aio = to; 408 409 /* 410 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer 411 * to release space in the kernel buffer as fast as possible, calling 412 * perf_mmap__consume() from perf_mmap__push() function. 413 * 414 * That lets the kernel to proceed with storing more profiling data into 415 * the kernel buffer earlier than other per-cpu kernel buffers are handled. 416 * 417 * Coping can be done in two steps in case the chunk of profiling data 418 * crosses the upper bound of the kernel buffer. 
In this case we first move 419 * part of data from map->start till the upper bound and then the remainder 420 * from the beginning of the kernel buffer till the end of the data chunk. 421 */ 422 423 if (record__comp_enabled(aio->rec)) { 424 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size, 425 mmap__mmap_len(map) - aio->size, 426 buf, size); 427 if (compressed < 0) 428 return (int)compressed; 429 430 size = compressed; 431 } else { 432 memcpy(aio->data + aio->size, buf, size); 433 } 434 435 if (!aio->size) { 436 /* 437 * Increment map->refcount to guard map->aio.data[] buffer 438 * from premature deallocation because map object can be 439 * released earlier than aio write request started on 440 * map->aio.data[] buffer is complete. 441 * 442 * perf_mmap__put() is done at record__aio_complete() 443 * after started aio request completion or at record__aio_push() 444 * if the request failed to start. 445 */ 446 perf_mmap__get(&map->core); 447 } 448 449 aio->size += size; 450 451 return size; 452 } 453 454 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off) 455 { 456 int ret, idx; 457 int trace_fd = perf_data__fd(rec->session->data); 458 struct record_aio aio = { .rec = rec, .size = 0 }; 459 460 /* 461 * Call record__aio_sync() to wait till map->aio.data[] buffer 462 * becomes available after previous aio write operation. 
463 */ 464 465 idx = record__aio_sync(map, false); 466 aio.data = map->aio.data[idx]; 467 ret = perf_mmap__push(map, &aio, record__aio_pushfn); 468 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */ 469 return ret; 470 471 rec->samples++; 472 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off); 473 if (!ret) { 474 *off += aio.size; 475 rec->bytes_written += aio.size; 476 if (switch_output_size(rec)) 477 trigger_hit(&switch_output_trigger); 478 } else { 479 /* 480 * Decrement map->refcount incremented in record__aio_pushfn() 481 * back if record__aio_write() operation failed to start, otherwise 482 * map->refcount is decremented in record__aio_complete() after 483 * aio write operation finishes successfully. 484 */ 485 perf_mmap__put(&map->core); 486 } 487 488 return ret; 489 } 490 491 static off_t record__aio_get_pos(int trace_fd) 492 { 493 return lseek(trace_fd, 0, SEEK_CUR); 494 } 495 496 static void record__aio_set_pos(int trace_fd, off_t pos) 497 { 498 lseek(trace_fd, pos, SEEK_SET); 499 } 500 501 static void record__aio_mmap_read_sync(struct record *rec) 502 { 503 int i; 504 struct evlist *evlist = rec->evlist; 505 struct mmap *maps = evlist->mmap; 506 507 if (!record__aio_enabled(rec)) 508 return; 509 510 for (i = 0; i < evlist->core.nr_mmaps; i++) { 511 struct mmap *map = &maps[i]; 512 513 if (map->core.base) 514 record__aio_sync(map, true); 515 } 516 } 517 518 static int nr_cblocks_default = 1; 519 static int nr_cblocks_max = 4; 520 521 static int record__aio_parse(const struct option *opt, 522 const char *str, 523 int unset) 524 { 525 struct record_opts *opts = (struct record_opts *)opt->value; 526 527 if (unset) { 528 opts->nr_cblocks = 0; 529 } else { 530 if (str) 531 opts->nr_cblocks = strtol(str, NULL, 0); 532 if (!opts->nr_cblocks) 533 opts->nr_cblocks = nr_cblocks_default; 534 } 535 536 return 0; 537 } 538 #else /* HAVE_AIO_SUPPORT */ 539 static int nr_cblocks_max = 0; 540 541 static int record__aio_push(struct 
record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	/* Built without aio support: pushing always fails. */
	return -1;
}

/* Built without aio support: no position to report. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

/* Built without aio support: nothing to do. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

/* Built without aio support: nothing to wait for. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

/* True when at least one aio control block was requested. */
static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}

#define MMAP_FLUSH_DEFAULT 1
/*
 * Option callback for the mmap flush threshold.
 *
 * Accepts a number with a B/K/M/G suffix and falls back to a plain strtol()
 * parse when the tagged parse yields -1. A zero result becomes
 * MMAP_FLUSH_DEFAULT, and the value is capped at a quarter of the mmap size
 * derived from opts->mmap_pages. Always returns 0.
 */
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag  = 'B', .mult = 1       },
			{ .tag  = 'K', .mult = 1 << 10 },
			{ .tag  = 'M', .mult = 1 << 20 },
			{ .tag  = 'G', .mult = 1 << 30 },
			{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	/* Never let the threshold exceed a quarter of the ring buffer. */
	flush_max = evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}

#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

/*
 * Option callback for the compression level: 0 when the option is negated,
 * otherwise the parsed value, or comp_level_default when the result is still
 * zero.
 */
static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

/* True when a non-zero compression level is configured. */
static int record__comp_enabled(struct record *rec)
{
	return
rec->opts.comp_level > 0; 625 } 626 627 static int process_synthesized_event(const struct perf_tool *tool, 628 union perf_event *event, 629 struct perf_sample *sample __maybe_unused, 630 struct machine *machine __maybe_unused) 631 { 632 struct record *rec = container_of(tool, struct record, tool); 633 return record__write(rec, NULL, event, event->header.size); 634 } 635 636 static struct mutex synth_lock; 637 638 static int process_locked_synthesized_event(const struct perf_tool *tool, 639 union perf_event *event, 640 struct perf_sample *sample __maybe_unused, 641 struct machine *machine __maybe_unused) 642 { 643 int ret; 644 645 mutex_lock(&synth_lock); 646 ret = process_synthesized_event(tool, event, sample, machine); 647 mutex_unlock(&synth_lock); 648 return ret; 649 } 650 651 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) 652 { 653 struct record *rec = to; 654 655 if (record__comp_enabled(rec)) { 656 struct perf_record_compressed2 *event = map->data; 657 size_t padding = 0; 658 u8 pad[8] = {0}; 659 ssize_t compressed = zstd_compress(rec->session, map, map->data, 660 mmap__mmap_len(map), bf, size); 661 662 if (compressed < 0) 663 return (int)compressed; 664 665 bf = event; 666 thread->samples++; 667 668 /* 669 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan 670 * error. We make it aligned here. 
671 */ 672 event->data_size = compressed - sizeof(struct perf_record_compressed2); 673 event->header.size = PERF_ALIGN(compressed, sizeof(u64)); 674 padding = event->header.size - compressed; 675 return record__write(rec, map, bf, compressed) || 676 record__write(rec, map, &pad, padding); 677 } 678 679 thread->samples++; 680 return record__write(rec, map, bf, size); 681 } 682 683 static volatile sig_atomic_t signr = -1; 684 static volatile sig_atomic_t child_finished; 685 #ifdef HAVE_EVENTFD_SUPPORT 686 static volatile sig_atomic_t done_fd = -1; 687 #endif 688 689 static void sig_handler(int sig) 690 { 691 if (sig == SIGCHLD) 692 child_finished = 1; 693 else 694 signr = sig; 695 696 done = 1; 697 #ifdef HAVE_EVENTFD_SUPPORT 698 if (done_fd >= 0) { 699 u64 tmp = 1; 700 int orig_errno = errno; 701 702 /* 703 * It is possible for this signal handler to run after done is 704 * checked in the main loop, but before the perf counter fds are 705 * polled. If this happens, the poll() will continue to wait 706 * even though done is set, and will only break out if either 707 * another signal is received, or the counters are ready for 708 * read. To ensure the poll() doesn't sleep when done is set, 709 * use an eventfd (done_fd) to wake up the poll(). 
710 */ 711 if (write(done_fd, &tmp, sizeof(tmp)) < 0) 712 pr_err("failed to signal wakeup fd, error: %m\n"); 713 714 errno = orig_errno; 715 } 716 #endif // HAVE_EVENTFD_SUPPORT 717 } 718 719 static void sigsegv_handler(int sig) 720 { 721 perf_hooks__recover(); 722 sighandler_dump_stack(sig); 723 } 724 725 static void record__sig_exit(void) 726 { 727 if (signr == -1) 728 return; 729 730 signal(signr, SIG_DFL); 731 raise(signr); 732 } 733 734 static int record__process_auxtrace(const struct perf_tool *tool, 735 struct mmap *map, 736 union perf_event *event, void *data1, 737 size_t len1, void *data2, size_t len2) 738 { 739 struct record *rec = container_of(tool, struct record, tool); 740 struct perf_data *data = &rec->data; 741 size_t padding; 742 u8 pad[8] = {0}; 743 744 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) { 745 off_t file_offset; 746 int fd = perf_data__fd(data); 747 int err; 748 749 file_offset = lseek(fd, 0, SEEK_CUR); 750 if (file_offset == -1) 751 return -1; 752 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index, 753 event, file_offset); 754 if (err) 755 return err; 756 } 757 758 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */ 759 padding = (len1 + len2) & 7; 760 if (padding) 761 padding = 8 - padding; 762 763 record__write(rec, map, event, event->header.size); 764 record__write(rec, map, data1, len1); 765 if (len2) 766 record__write(rec, map, data2, len2); 767 record__write(rec, map, &pad, padding); 768 769 return 0; 770 } 771 772 static int record__auxtrace_mmap_read(struct record *rec, 773 struct mmap *map) 774 { 775 int ret; 776 777 ret = auxtrace_mmap__read(map, rec->itr, 778 perf_session__env(rec->session), 779 &rec->tool, 780 record__process_auxtrace); 781 if (ret < 0) 782 return ret; 783 784 if (ret) 785 rec->samples++; 786 787 return 0; 788 } 789 790 static int record__auxtrace_mmap_read_snapshot(struct record *rec, 791 struct mmap *map) 792 { 793 int ret; 794 795 ret = 
auxtrace_mmap__read_snapshot(map, rec->itr, 796 perf_session__env(rec->session), 797 &rec->tool, 798 record__process_auxtrace, 799 rec->opts.auxtrace_snapshot_size); 800 if (ret < 0) 801 return ret; 802 803 if (ret) 804 rec->samples++; 805 806 return 0; 807 } 808 809 static int record__auxtrace_read_snapshot_all(struct record *rec) 810 { 811 int i; 812 int rc = 0; 813 814 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) { 815 struct mmap *map = &rec->evlist->mmap[i]; 816 817 if (!map->auxtrace_mmap.base) 818 continue; 819 820 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) { 821 rc = -1; 822 goto out; 823 } 824 } 825 out: 826 return rc; 827 } 828 829 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit) 830 { 831 pr_debug("Recording AUX area tracing snapshot\n"); 832 if (record__auxtrace_read_snapshot_all(rec) < 0) { 833 trigger_error(&auxtrace_snapshot_trigger); 834 } else { 835 if (auxtrace_record__snapshot_finish(rec->itr, on_exit)) 836 trigger_error(&auxtrace_snapshot_trigger); 837 else 838 trigger_ready(&auxtrace_snapshot_trigger); 839 } 840 } 841 842 static int record__auxtrace_snapshot_exit(struct record *rec) 843 { 844 if (trigger_is_error(&auxtrace_snapshot_trigger)) 845 return 0; 846 847 if (!auxtrace_record__snapshot_started && 848 auxtrace_record__snapshot_start(rec->itr)) 849 return -1; 850 851 record__read_auxtrace_snapshot(rec, true); 852 if (trigger_is_error(&auxtrace_snapshot_trigger)) 853 return -1; 854 855 return 0; 856 } 857 858 static int record__auxtrace_init(struct record *rec) 859 { 860 int err; 861 862 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts) 863 && record__threads_enabled(rec)) { 864 pr_err("AUX area tracing options are not available in parallel streaming mode.\n"); 865 return -EINVAL; 866 } 867 868 if (!rec->itr) { 869 err = -EINVAL; 870 rec->itr = auxtrace_record__init(rec->evlist, &err); 871 if (err) 872 return err; 873 } 874 875 err = 
auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	err = auxtrace_parse_aux_action(rec->evlist);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}

/*
 * Make sure the evlist carries an event with text_poke set, adding a dummy
 * event on all CPUs when none does. Returns 0 on success, -ENOMEM if the
 * dummy event could not be added.
 */
static int record__config_text_poke(struct evlist *evlist)
{
	struct evsel *evsel;

	/* Nothing to do if text poke is already configured */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.attr.text_poke)
			return 0;
	}

	evsel = evlist__add_dummy_on_all_cpus(evlist);
	if (!evsel)
		return -ENOMEM;

	evsel->core.attr.text_poke = 1;
	evsel->core.attr.ksymbol = 1;
	evsel->immediate = true;
	evsel__set_sample_bit(evsel, TIME);

	return 0;
}

/* Thin wrapper: prepare off-CPU profiling for this session's evlist and target. */
static int record__config_off_cpu(struct record *rec)
{
	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
}

/* True when the evlist contains at least one non-dummy event. */
static bool record__tracking_system_wide(struct record *rec)
{
	struct evlist *evlist = rec->evlist;
	struct evsel *evsel;

	/*
	 * If non-dummy evsel exists, system_wide sideband is needed to
	 * help parse sample information.
	 * For example, PERF_EVENT_MMAP event to help parse symbol,
	 * and PERF_EVENT_COMM event to help parse task executable name.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (!evsel__is_dummy_event(evsel))
			return true;
	}

	return false;
}

/*
 * Find or add the tracking event when one is required and decide whether it
 * is enabled on exec or immediately. Returns 0 on success, -ENOMEM if the
 * tracking event could not be obtained.
 */
static int record__config_tracking_events(struct record *rec)
{
	struct record_opts *opts = &rec->opts;
	struct evlist *evlist = rec->evlist;
	bool system_wide = false;
	struct evsel *evsel;

	/*
	 * For initial_delay, system wide or a hybrid system, we need to add
	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
	 * delay of waiting or event synthesis.
	 */
	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmus__num_core_pmus() > 1) {
		/*
		 * User space tasks can migrate between CPUs, so when tracing
		 * selected CPUs, sideband for all CPUs is still needed.
		 */
		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
			system_wide = true;

		evsel = evlist__findnew_tracking_event(evlist, system_wide);
		if (!evsel)
			return -ENOMEM;

		/*
		 * Enable the tracking event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->target.initial_delay && !evsel->immediate &&
		    !target__has_cpu(&opts->target))
			evsel->core.attr.enable_on_exec = 1;
		else
			evsel->immediate = 1;
	}

	return 0;
}

/* True when <root_dir>/proc/kcore of @machine can be opened for reading. */
static bool record__kcore_readable(struct machine *machine)
{
	char kcore[PATH_MAX];
	int fd;

	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);

	fd = open(kcore, O_RDONLY);
	if (fd < 0)
		return false;

	close(fd);

	return true;
}

/*
 * Copy kcore from <root_dir>/proc of @machine into a kcore directory created
 * for @data. Returns 0 on success or the error from the directory creation
 * or the copy.
 */
static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
	char from_dir[PATH_MAX];
	char kcore_dir[PATH_MAX];
	int ret;

	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
	if (ret)
		return ret;

	return kcore_copy(from_dir, kcore_dir);
}

/* Mark all four pipe descriptors as not open. */
static void record__thread_data_init_pipes(struct record_thread *thread_data)
{
	thread_data->pipes.msg[0] = -1;
	thread_data->pipes.msg[1] = -1;
	thread_data->pipes.ack[0] = -1;
	thread_data->pipes.ack[1] = -1;
}

/*
 * Open the msg and ack pipes of a thread. Returns 0 on success and -EINVAL
 * if either pipe() fails; the msg pipe is closed again when only the ack
 * pipe failed.
 */
static int record__thread_data_open_pipes(struct record_thread *thread_data)
{
	if (pipe(thread_data->pipes.msg))
		return -EINVAL;

	if (pipe(thread_data->pipes.ack)) {
		close(thread_data->pipes.msg[0]);
		thread_data->pipes.msg[0] = -1;
		close(thread_data->pipes.msg[1]);
		thread_data->pipes.msg[1] = -1;
		return -EINVAL;
	}

	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);

	return 0;
}

/* Close whichever pipe descriptors are open and mark them as not open. */
static void record__thread_data_close_pipes(struct record_thread *thread_data)
{
	if (thread_data->pipes.msg[0] != -1) {
		close(thread_data->pipes.msg[0]);
		thread_data->pipes.msg[0] = -1;
}
	if (thread_data->pipes.msg[1] != -1) {
		close(thread_data->pipes.msg[1]);
		thread_data->pipes.msg[1] = -1;
	}
	if (thread_data->pipes.ack[0] != -1) {
		close(thread_data->pipes.ack[0]);
		thread_data->pipes.ack[0] = -1;
	}
	if (thread_data->pipes.ack[1] != -1) {
		close(thread_data->pipes.ack[1]);
		thread_data->pipes.ack[1] = -1;
	}
}

/* True when the user-requested CPU map of @evlist is the dummy map. */
static bool evlist__per_thread(struct evlist *evlist)
{
	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
}

/*
 * Build the thread's maps[] / overwrite_maps[] arrays of pointers into the
 * evlist's mmap arrays.
 *
 * In per-thread mode the thread gets every mmap; otherwise it gets the mmaps
 * whose CPU is set in thread_data->mask->maps. Returns 0 on success, -ENOMEM
 * if an array could not be allocated.
 */
static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
{
	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
	struct mmap *mmap = evlist->mmap;
	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
	struct perf_cpu_map *cpus = evlist->core.all_cpus;
	bool per_thread = evlist__per_thread(evlist);

	if (per_thread)
		thread_data->nr_mmaps = nr_mmaps;
	else
		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
						      thread_data->mask->maps.nbits);
	if (mmap) {
		thread_data->maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
		if (!thread_data->maps)
			return -ENOMEM;
	}
	if (overwrite_mmap) {
		thread_data->overwrite_maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
		if (!thread_data->overwrite_maps) {
			zfree(&thread_data->maps);
			return -ENOMEM;
		}
	}
	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);

	/* m walks the evlist mmaps, tm the slots of this thread's arrays. */
	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
		if (per_thread ||
		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
			if (thread_data->maps) {
				thread_data->maps[tm] = &mmap[m];
				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
			}
			if (thread_data->overwrite_maps) {
				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
			}
			tm++;
		}
	}

	return 0;
}

/*
 * Initialize the thread's pollfd array and duplicate into it every evlist
 * pollfd entry whose private pointer is one of the thread's mmaps.
 * Returns 0 on success or the negative error from the duplication.
 */
static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
{
	int f, tm, pos;
	struct mmap *map, *overwrite_map;

	fdarray__init(&thread_data->pollfd, 64);

	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
		map = thread_data->maps ? thread_data->maps[tm] : NULL;
		overwrite_map = thread_data->overwrite_maps ?
				thread_data->overwrite_maps[tm] : NULL;

		for (f = 0; f < evlist->core.pollfd.nr; f++) {
			void *ptr = evlist->core.pollfd.priv[f].ptr;

			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
							      &evlist->core.pollfd);
				if (pos < 0)
					return pos;
				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
			}
		}
	}

	return 0;
}

/* Release everything owned by rec->thread_data, then the array itself. */
static void record__free_thread_data(struct record *rec)
{
	int t;
	struct record_thread *thread_data = rec->thread_data;

	if (thread_data == NULL)
		return;

	for (t = 0; t < rec->nr_threads; t++) {
		record__thread_data_close_pipes(&thread_data[t]);
		zfree(&thread_data[t].maps);
		zfree(&thread_data[t].overwrite_maps);
		fdarray__exit(&thread_data[t].pollfd);
	}

	zfree(&rec->thread_data);
}

/*
 * Append an (evlist index, thread index) pair to rec->index_map, growing the
 * array as needed. Returns 0 on success, -ENOMEM if it could not be grown.
 */
static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
						    int evlist_pollfd_index,
						    int thread_pollfd_index)
{
	size_t x = rec->index_map_cnt;

	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
		return -ENOMEM;
rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
	rec->index_map_cnt += 1;
	return 0;
}

/*
 * Copy revents from the thread's pollfd entries back to the evlist entries
 * they were duplicated from, using rec->index_map.
 *
 * A pair whose fd or events no longer match is reported and skipped; the
 * remaining pairs are still processed. Returns 0 if every pair matched,
 * -EINVAL otherwise.
 */
static int record__update_evlist_pollfd_from_thread(struct record *rec,
						    struct evlist *evlist,
						    struct record_thread *thread_data)
{
	struct pollfd *e_entries = evlist->core.pollfd.entries;
	struct pollfd *t_entries = thread_data->pollfd.entries;
	int err = 0;
	size_t i;

	for (i = 0; i < rec->index_map_cnt; i++) {
		int e_pos = rec->index_map[i].evlist_pollfd_index;
		int t_pos = rec->index_map[i].thread_pollfd_index;

		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
		    e_entries[e_pos].events != t_entries[t_pos].events) {
			pr_err("Thread and evlist pollfd index mismatch\n");
			err = -EINVAL;
			continue;
		}
		e_entries[e_pos].revents = t_entries[t_pos].revents;
	}
	return err;
}

/*
 * Duplicate every evlist pollfd entry flagged as a non-perf event into the
 * thread's pollfd and remember the index pairing in rec->index_map.
 * Returns 0 on success or a negative error from either step.
 */
static int record__dup_non_perf_events(struct record *rec,
				       struct evlist *evlist,
				       struct record_thread *thread_data)
{
	struct fdarray *fda = &evlist->core.pollfd;
	int i, ret;

	for (i = 0; i < fda->nr; i++) {
		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
			continue;
		/* On success ret is the new entry's position in the thread pollfd. */
		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
		if (ret < 0) {
			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
			return ret;
		}
		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
			  thread_data, ret, fda->entries[i].fd);
		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
		if (ret < 0) {
			pr_err("Failed to map thread and evlist pollfd indexes\n");
			return ret;
		}
	}
	return 0;
}

/*
 * Allocate and initialize rec->thread_data for rec->nr_threads threads.
 * Returns 0 on success; on failure everything allocated so far is freed and
 * a negative error is returned.
 */
static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
{
	int t, ret;
	struct record_thread *thread_data;

	rec->thread_data =
calloc(rec->nr_threads, sizeof(*(rec->thread_data))); 1226 if (!rec->thread_data) { 1227 pr_err("Failed to allocate thread data\n"); 1228 return -ENOMEM; 1229 } 1230 thread_data = rec->thread_data; 1231 1232 for (t = 0; t < rec->nr_threads; t++) 1233 record__thread_data_init_pipes(&thread_data[t]); 1234 1235 for (t = 0; t < rec->nr_threads; t++) { 1236 thread_data[t].rec = rec; 1237 thread_data[t].mask = &rec->thread_masks[t]; 1238 ret = record__thread_data_init_maps(&thread_data[t], evlist); 1239 if (ret) { 1240 pr_err("Failed to initialize thread[%d] maps\n", t); 1241 goto out_free; 1242 } 1243 ret = record__thread_data_init_pollfd(&thread_data[t], evlist); 1244 if (ret) { 1245 pr_err("Failed to initialize thread[%d] pollfd\n", t); 1246 goto out_free; 1247 } 1248 if (t) { 1249 thread_data[t].tid = -1; 1250 ret = record__thread_data_open_pipes(&thread_data[t]); 1251 if (ret) { 1252 pr_err("Failed to open thread[%d] communication pipes\n", t); 1253 goto out_free; 1254 } 1255 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0], 1256 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable); 1257 if (ret < 0) { 1258 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t); 1259 goto out_free; 1260 } 1261 thread_data[t].ctlfd_pos = ret; 1262 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n", 1263 thread_data, thread_data[t].ctlfd_pos, 1264 thread_data[t].pipes.msg[0]); 1265 } else { 1266 thread_data[t].tid = gettid(); 1267 1268 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]); 1269 if (ret < 0) 1270 goto out_free; 1271 1272 thread_data[t].ctlfd_pos = -1; /* Not used */ 1273 } 1274 } 1275 1276 return 0; 1277 1278 out_free: 1279 record__free_thread_data(rec); 1280 1281 return ret; 1282 } 1283 1284 static int record__mmap_evlist(struct record *rec, 1285 struct evlist *evlist) 1286 { 1287 int i, ret; 1288 struct record_opts *opts = &rec->opts; 1289 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode || 1290 
opts->auxtrace_sample_mode; 1291 1292 if (opts->affinity != PERF_AFFINITY_SYS) 1293 cpu__setup_cpunode_map(); 1294 1295 if (evlist__mmap_ex(evlist, opts->mmap_pages, 1296 opts->auxtrace_mmap_pages, 1297 auxtrace_overwrite, 1298 opts->nr_cblocks, opts->affinity, 1299 opts->mmap_flush, opts->comp_level) < 0) { 1300 if (errno == EPERM) { 1301 pr_err("Permission error mapping pages.\n" 1302 "Consider increasing " 1303 "/proc/sys/kernel/perf_event_mlock_kb,\n" 1304 "or try again with a smaller value of -m/--mmap_pages.\n" 1305 "(current value: %u,%u)\n", 1306 opts->mmap_pages, opts->auxtrace_mmap_pages); 1307 return -errno; 1308 } else { 1309 pr_err("failed to mmap: %m\n"); 1310 if (errno) 1311 return -errno; 1312 else 1313 return -EINVAL; 1314 } 1315 } 1316 1317 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack)) 1318 return -1; 1319 1320 ret = record__alloc_thread_data(rec, evlist); 1321 if (ret) 1322 return ret; 1323 1324 if (record__threads_enabled(rec)) { 1325 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps); 1326 if (ret) { 1327 errno = -ret; 1328 pr_err("Failed to create data directory: %m\n"); 1329 return ret; 1330 } 1331 for (i = 0; i < evlist->core.nr_mmaps; i++) { 1332 if (evlist->mmap) 1333 evlist->mmap[i].file = &rec->data.dir.files[i]; 1334 if (evlist->overwrite_mmap) 1335 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i]; 1336 } 1337 } 1338 1339 return 0; 1340 } 1341 1342 static int record__mmap(struct record *rec) 1343 { 1344 return record__mmap_evlist(rec, rec->evlist); 1345 } 1346 1347 static int record__open(struct record *rec) 1348 { 1349 char msg[BUFSIZ]; 1350 struct evsel *pos; 1351 struct evlist *evlist = rec->evlist; 1352 struct perf_session *session = rec->session; 1353 struct record_opts *opts = &rec->opts; 1354 int rc = 0; 1355 bool skipped = false; 1356 bool removed_tracking = false; 1357 1358 evlist__for_each_entry(evlist, pos) { 1359 if (removed_tracking) { 1360 /* 1361 * Normally the head of the list 
has tracking enabled 1362 * for sideband data like mmaps. If this event is 1363 * removed, make sure to add tracking to the next 1364 * processed event. 1365 */ 1366 if (!pos->tracking) { 1367 pos->tracking = true; 1368 evsel__config(pos, opts, &callchain_param); 1369 } 1370 removed_tracking = false; 1371 } 1372 try_again: 1373 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { 1374 bool report_error = true; 1375 1376 if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) { 1377 if (verbose > 0) 1378 ui__warning("%s\n", msg); 1379 goto try_again; 1380 } 1381 if ((errno == EINVAL || errno == EBADF) && 1382 pos->core.leader != &pos->core && 1383 pos->weak_group) { 1384 pos = evlist__reset_weak_group(evlist, pos, true); 1385 goto try_again; 1386 } 1387 #if defined(__aarch64__) || defined(__arm__) 1388 if (strstr(evsel__name(pos), "cycles")) { 1389 struct evsel *pos2; 1390 /* 1391 * Unfortunately ARM has many events named 1392 * "cycles" on PMUs like the system-level (L3) 1393 * cache which don't support sampling. Only 1394 * display such failures to open when there is 1395 * only 1 cycles event or verbose is enabled. 1396 */ 1397 evlist__for_each_entry(evlist, pos2) { 1398 if (pos2 == pos) 1399 continue; 1400 if (strstr(evsel__name(pos2), "cycles")) { 1401 report_error = false; 1402 break; 1403 } 1404 } 1405 } 1406 #endif 1407 if (report_error || verbose > 0) { 1408 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg)); 1409 ui__error("Failure to open event '%s' on PMU '%s' which will be " 1410 "removed.\n%s\n", 1411 evsel__name(pos), evsel__pmu_name(pos), msg); 1412 } 1413 if (pos->tracking) 1414 removed_tracking = true; 1415 pos->skippable = true; 1416 skipped = true; 1417 } 1418 } 1419 1420 if (skipped) { 1421 struct evsel *tmp; 1422 int idx = 0; 1423 bool evlist_empty = true; 1424 1425 /* Remove evsels that failed to open and update indices. 
*/ 1426 evlist__for_each_entry_safe(evlist, tmp, pos) { 1427 if (pos->skippable) { 1428 evlist__remove(evlist, pos); 1429 continue; 1430 } 1431 1432 /* 1433 * Note, dummy events may be command line parsed or 1434 * added by the tool. We care about supporting `perf 1435 * record -e dummy` which may be used as a permission 1436 * check. Dummy events that are added to the command 1437 * line and opened along with other events that fail, 1438 * will still fail as if the dummy events were tool 1439 * added events for the sake of code simplicity. 1440 */ 1441 if (!evsel__is_dummy_event(pos)) 1442 evlist_empty = false; 1443 } 1444 evlist__for_each_entry(evlist, pos) { 1445 pos->core.idx = idx++; 1446 } 1447 /* If list is empty then fail. */ 1448 if (evlist_empty) { 1449 ui__error("Failure to open any events for recording.\n"); 1450 rc = -1; 1451 goto out; 1452 } 1453 } 1454 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) { 1455 pr_warning( 1456 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n" 1457 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" 1458 "Samples in kernel functions may not be resolved if a suitable vmlinux\n" 1459 "file is not found in the buildid cache or in the vmlinux path.\n\n" 1460 "Samples in kernel modules won't be resolved at all.\n\n" 1461 "If some relocation was applied (e.g. 
kexec) symbols may be misresolved\n" 1462 "even with a suitable vmlinux or kallsyms file.\n\n"); 1463 } 1464 1465 if (evlist__apply_filters(evlist, &pos, &opts->target)) { 1466 pr_err("failed to set filter \"%s\" on event %s: %m\n", 1467 pos->filter ?: "BPF", evsel__name(pos)); 1468 rc = -1; 1469 goto out; 1470 } 1471 1472 rc = record__mmap(rec); 1473 if (rc) 1474 goto out; 1475 1476 session->evlist = evlist; 1477 perf_session__set_id_hdr_size(session); 1478 out: 1479 return rc; 1480 } 1481 1482 static void set_timestamp_boundary(struct record *rec, u64 sample_time) 1483 { 1484 if (rec->evlist->first_sample_time == 0) 1485 rec->evlist->first_sample_time = sample_time; 1486 1487 if (sample_time) 1488 rec->evlist->last_sample_time = sample_time; 1489 } 1490 1491 static int process_sample_event(const struct perf_tool *tool, 1492 union perf_event *event, 1493 struct perf_sample *sample, 1494 struct machine *machine) 1495 { 1496 struct record *rec = container_of(tool, struct record, tool); 1497 1498 set_timestamp_boundary(rec, sample->time); 1499 1500 if (rec->buildid_all) 1501 return 0; 1502 1503 rec->samples++; 1504 return build_id__mark_dso_hit(tool, event, sample, machine); 1505 } 1506 1507 static int process_buildids(struct record *rec) 1508 { 1509 struct perf_session *session = rec->session; 1510 1511 if (perf_data__size(&rec->data) == 0) 1512 return 0; 1513 1514 /* A single DSO is needed and not all inline frames. */ 1515 symbol_conf.inline_name = false; 1516 /* 1517 * During this process, it'll load kernel map and replace the 1518 * dso->long_name to a real pathname it found. In this case 1519 * we prefer the vmlinux path like 1520 * /lib/modules/3.16.4/build/vmlinux 1521 * 1522 * rather than build-id path (in debug directory). 
1523 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551 1524 */ 1525 symbol_conf.ignore_vmlinux_buildid = true; 1526 /* 1527 * If --buildid-all is given, it marks all DSO regardless of hits, 1528 * so no need to process samples. But if timestamp_boundary is enabled, 1529 * it still needs to walk on all samples to get the timestamps of 1530 * first/last samples. 1531 */ 1532 if (rec->buildid_all && !rec->timestamp_boundary) 1533 rec->tool.sample = process_event_sample_stub; 1534 1535 return perf_session__process_events(session); 1536 } 1537 1538 static void perf_event__synthesize_guest_os(struct machine *machine, void *data) 1539 { 1540 int err; 1541 struct perf_tool *tool = data; 1542 /* 1543 *As for guest kernel when processing subcommand record&report, 1544 *we arrange module mmap prior to guest kernel mmap and trigger 1545 *a preload dso because default guest module symbols are loaded 1546 *from guest kallsyms instead of /lib/modules/XXX/XXX. This 1547 *method is used to avoid symbol missing when the first addr is 1548 *in module instead of in guest kernel. 1549 */ 1550 err = perf_event__synthesize_modules(tool, process_synthesized_event, 1551 machine); 1552 if (err < 0) 1553 pr_err("Couldn't record guest kernel [%d]'s reference" 1554 " relocation symbol.\n", machine->pid); 1555 1556 /* 1557 * We use _stext for guest kernel because guest kernel's /proc/kallsyms 1558 * have no _text sometimes. 
1559 */ 1560 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, 1561 machine); 1562 if (err < 0) 1563 pr_err("Couldn't record guest kernel [%d]'s reference" 1564 " relocation symbol.\n", machine->pid); 1565 } 1566 1567 static struct perf_event_header finished_round_event = { 1568 .size = sizeof(struct perf_event_header), 1569 .type = PERF_RECORD_FINISHED_ROUND, 1570 }; 1571 1572 static struct perf_event_header finished_init_event = { 1573 .size = sizeof(struct perf_event_header), 1574 .type = PERF_RECORD_FINISHED_INIT, 1575 }; 1576 1577 static void record__adjust_affinity(struct record *rec, struct mmap *map) 1578 { 1579 if (rec->opts.affinity != PERF_AFFINITY_SYS && 1580 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits, 1581 thread->mask->affinity.nbits)) { 1582 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits); 1583 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits, 1584 map->affinity_mask.bits, thread->mask->affinity.nbits); 1585 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity), 1586 (cpu_set_t *)thread->mask->affinity.bits); 1587 if (verbose == 2) { 1588 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu()); 1589 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity"); 1590 } 1591 } 1592 } 1593 1594 static size_t process_comp_header(void *record, size_t increment) 1595 { 1596 struct perf_record_compressed2 *event = record; 1597 size_t size = sizeof(*event); 1598 1599 if (increment) { 1600 event->header.size += increment; 1601 return increment; 1602 } 1603 1604 event->header.type = PERF_RECORD_COMPRESSED2; 1605 event->header.size = size; 1606 1607 return size; 1608 } 1609 1610 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map, 1611 void *dst, size_t dst_size, void *src, size_t src_size) 1612 { 1613 ssize_t compressed; 1614 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1; 1615 
struct zstd_data *zstd_data = &session->zstd_data; 1616 1617 if (map && map->file) 1618 zstd_data = &map->zstd_data; 1619 1620 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size, 1621 max_record_size, process_comp_header); 1622 if (compressed < 0) 1623 return compressed; 1624 1625 if (map && map->file) { 1626 thread->bytes_transferred += src_size; 1627 thread->bytes_compressed += compressed; 1628 } else { 1629 session->bytes_transferred += src_size; 1630 session->bytes_compressed += compressed; 1631 } 1632 1633 return compressed; 1634 } 1635 1636 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, 1637 bool overwrite, bool synch) 1638 { 1639 u64 bytes_written = rec->bytes_written; 1640 int i; 1641 int rc = 0; 1642 int nr_mmaps; 1643 struct mmap **maps; 1644 int trace_fd = perf_data__fd(&rec->data); 1645 off_t off = 0; 1646 1647 if (!evlist) 1648 return 0; 1649 1650 nr_mmaps = thread->nr_mmaps; 1651 maps = overwrite ? thread->overwrite_maps : thread->maps; 1652 1653 if (!maps) 1654 return 0; 1655 1656 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING) 1657 return 0; 1658 1659 if (record__aio_enabled(rec)) 1660 off = record__aio_get_pos(trace_fd); 1661 1662 for (i = 0; i < nr_mmaps; i++) { 1663 u64 flush = 0; 1664 struct mmap *map = maps[i]; 1665 1666 if (map->core.base) { 1667 record__adjust_affinity(rec, map); 1668 if (synch) { 1669 flush = map->core.flush; 1670 map->core.flush = 1; 1671 } 1672 if (!record__aio_enabled(rec)) { 1673 if (perf_mmap__push(map, rec, record__pushfn) < 0) { 1674 if (synch) 1675 map->core.flush = flush; 1676 rc = -1; 1677 goto out; 1678 } 1679 } else { 1680 if (record__aio_push(rec, map, &off) < 0) { 1681 record__aio_set_pos(trace_fd, off); 1682 if (synch) 1683 map->core.flush = flush; 1684 rc = -1; 1685 goto out; 1686 } 1687 } 1688 if (synch) 1689 map->core.flush = flush; 1690 } 1691 1692 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode && 1693 
!rec->opts.auxtrace_sample_mode && 1694 record__auxtrace_mmap_read(rec, map) != 0) { 1695 rc = -1; 1696 goto out; 1697 } 1698 } 1699 1700 if (record__aio_enabled(rec)) 1701 record__aio_set_pos(trace_fd, off); 1702 1703 /* 1704 * Mark the round finished in case we wrote 1705 * at least one event. 1706 * 1707 * No need for round events in directory mode, 1708 * because per-cpu maps and files have data 1709 * sorted by kernel. 1710 */ 1711 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written) 1712 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event)); 1713 1714 if (overwrite) 1715 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); 1716 out: 1717 return rc; 1718 } 1719 1720 static int record__mmap_read_all(struct record *rec, bool synch) 1721 { 1722 int err; 1723 1724 err = record__mmap_read_evlist(rec, rec->evlist, false, synch); 1725 if (err) 1726 return err; 1727 1728 return record__mmap_read_evlist(rec, rec->evlist, true, synch); 1729 } 1730 1731 static void record__thread_munmap_filtered(struct fdarray *fda, int fd, 1732 void *arg __maybe_unused) 1733 { 1734 struct perf_mmap *map = fda->priv[fd].ptr; 1735 1736 if (map) 1737 perf_mmap__put(map); 1738 } 1739 1740 static void *record__thread(void *arg) 1741 { 1742 enum thread_msg msg = THREAD_MSG__READY; 1743 bool terminate = false; 1744 struct fdarray *pollfd; 1745 int err, ctlfd_pos; 1746 1747 thread = arg; 1748 thread->tid = gettid(); 1749 1750 err = write(thread->pipes.ack[1], &msg, sizeof(msg)); 1751 if (err == -1) 1752 pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid); 1753 1754 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu()); 1755 1756 pollfd = &thread->pollfd; 1757 ctlfd_pos = thread->ctlfd_pos; 1758 1759 for (;;) { 1760 unsigned long long hits = thread->samples; 1761 1762 if (record__mmap_read_all(thread->rec, false) < 0 || terminate) 1763 break; 1764 1765 if (hits == thread->samples) { 1766 1767 err = 
fdarray__poll(pollfd, -1); 1768 /* 1769 * Propagate error, only if there's any. Ignore positive 1770 * number of returned events and interrupt error. 1771 */ 1772 if (err > 0 || (err < 0 && errno == EINTR)) 1773 err = 0; 1774 thread->waking++; 1775 1776 if (fdarray__filter(pollfd, POLLERR | POLLHUP, 1777 record__thread_munmap_filtered, NULL) == 0) 1778 break; 1779 } 1780 1781 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) { 1782 terminate = true; 1783 close(thread->pipes.msg[0]); 1784 thread->pipes.msg[0] = -1; 1785 pollfd->entries[ctlfd_pos].fd = -1; 1786 pollfd->entries[ctlfd_pos].events = 0; 1787 } 1788 1789 pollfd->entries[ctlfd_pos].revents = 0; 1790 } 1791 record__mmap_read_all(thread->rec, true); 1792 1793 err = write(thread->pipes.ack[1], &msg, sizeof(msg)); 1794 if (err == -1) 1795 pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid); 1796 1797 return NULL; 1798 } 1799 1800 static void record__init_features(struct record *rec) 1801 { 1802 struct perf_session *session = rec->session; 1803 int feat; 1804 1805 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++) 1806 perf_header__set_feat(&session->header, feat); 1807 1808 if (rec->no_buildid) 1809 perf_header__clear_feat(&session->header, HEADER_BUILD_ID); 1810 1811 if (!have_tracepoints(&rec->evlist->core.entries)) 1812 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA); 1813 1814 if (!rec->opts.branch_stack) 1815 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK); 1816 1817 if (!rec->opts.full_auxtrace) 1818 perf_header__clear_feat(&session->header, HEADER_AUXTRACE); 1819 1820 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) 1821 perf_header__clear_feat(&session->header, HEADER_CLOCKID); 1822 1823 if (!rec->opts.use_clockid) 1824 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA); 1825 1826 if (!record__threads_enabled(rec)) 1827 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT); 1828 1829 if 
(!record__comp_enabled(rec)) 1830 perf_header__clear_feat(&session->header, HEADER_COMPRESSED); 1831 1832 perf_header__clear_feat(&session->header, HEADER_STAT); 1833 } 1834 1835 static void 1836 record__finish_output(struct record *rec) 1837 { 1838 int i; 1839 struct perf_data *data = &rec->data; 1840 int fd = perf_data__fd(data); 1841 1842 if (data->is_pipe) { 1843 /* Just to display approx. size */ 1844 data->file.size = rec->bytes_written; 1845 return; 1846 } 1847 1848 rec->session->header.data_size += rec->bytes_written; 1849 data->file.size = perf_data__seek(data, 0, SEEK_CUR); 1850 if (record__threads_enabled(rec)) { 1851 for (i = 0; i < data->dir.nr; i++) { 1852 data->dir.files[i].size = 1853 perf_data_file__seek(&data->dir.files[i], 0, SEEK_CUR); 1854 } 1855 } 1856 1857 /* Buildid scanning disabled or build ID in kernel and synthesized map events. */ 1858 if (!rec->no_buildid || !rec->no_buildid_cache) { 1859 process_buildids(rec); 1860 1861 if (rec->buildid_all) 1862 perf_session__dsos_hit_all(rec->session); 1863 } 1864 perf_session__write_header(rec->session, rec->evlist, fd, true); 1865 perf_session__cache_build_ids(rec->session); 1866 } 1867 1868 static int record__synthesize_workload(struct record *rec, bool tail) 1869 { 1870 int err; 1871 struct perf_thread_map *thread_map; 1872 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP; 1873 1874 if (rec->opts.tail_synthesize != tail) 1875 return 0; 1876 1877 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid); 1878 if (thread_map == NULL) 1879 return -1; 1880 1881 err = perf_event__synthesize_thread_map(&rec->tool, thread_map, 1882 process_synthesized_event, 1883 &rec->session->machines.host, 1884 needs_mmap, 1885 rec->opts.record_data_mmap); 1886 perf_thread_map__put(thread_map); 1887 return err; 1888 } 1889 1890 static int write_finished_init(struct record *rec, bool tail) 1891 { 1892 if (rec->opts.tail_synthesize != tail) 1893 return 0; 1894 1895 return record__write(rec, NULL, 
&finished_init_event, sizeof(finished_init_event)); 1896 } 1897 1898 static int record__synthesize(struct record *rec, bool tail); 1899 1900 static int 1901 record__switch_output(struct record *rec, bool at_exit) 1902 { 1903 struct perf_data *data = &rec->data; 1904 char *new_filename = NULL; 1905 int fd, err; 1906 1907 /* Same Size: "2015122520103046"*/ 1908 char timestamp[] = "InvalidTimestamp"; 1909 1910 record__aio_mmap_read_sync(rec); 1911 1912 write_finished_init(rec, true); 1913 1914 record__synthesize(rec, true); 1915 if (target__none(&rec->opts.target)) 1916 record__synthesize_workload(rec, true); 1917 1918 rec->samples = 0; 1919 record__finish_output(rec); 1920 err = fetch_current_timestamp(timestamp, sizeof(timestamp)); 1921 if (err) { 1922 pr_err("Failed to get current timestamp\n"); 1923 return -EINVAL; 1924 } 1925 1926 fd = perf_data__switch(data, timestamp, 1927 rec->session->header.data_offset, 1928 at_exit, &new_filename); 1929 if (fd >= 0 && !at_exit) { 1930 rec->bytes_written = 0; 1931 rec->session->header.data_size = 0; 1932 } 1933 1934 if (!quiet) { 1935 fprintf(stderr, "[ perf record: Dump %s.%s ]\n", 1936 data->path, timestamp); 1937 } 1938 1939 if (rec->switch_output.num_files) { 1940 int n = rec->switch_output.cur_file + 1; 1941 1942 if (n >= rec->switch_output.num_files) 1943 n = 0; 1944 rec->switch_output.cur_file = n; 1945 if (rec->switch_output.filenames[n]) { 1946 remove(rec->switch_output.filenames[n]); 1947 zfree(&rec->switch_output.filenames[n]); 1948 } 1949 rec->switch_output.filenames[n] = new_filename; 1950 } else { 1951 free(new_filename); 1952 } 1953 1954 /* Output tracking events */ 1955 if (!at_exit) { 1956 record__synthesize(rec, false); 1957 1958 /* 1959 * In 'perf record --switch-output' without -a, 1960 * record__synthesize() in record__switch_output() won't 1961 * generate tracking events because there's no thread_map 1962 * in evlist. Which causes newly created perf.data doesn't 1963 * contain map and comm information. 
1964 * Create a fake thread_map and directly call 1965 * perf_event__synthesize_thread_map() for those events. 1966 */ 1967 if (target__none(&rec->opts.target)) 1968 record__synthesize_workload(rec, false); 1969 write_finished_init(rec, false); 1970 } 1971 return fd; 1972 } 1973 1974 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel, 1975 struct perf_record_lost_samples *lost, 1976 int cpu_idx, int thread_idx, u64 lost_count, 1977 u16 misc_flag) 1978 { 1979 struct perf_sample_id *sid; 1980 struct perf_sample sample; 1981 int id_hdr_size; 1982 1983 perf_sample__init(&sample, /*all=*/true); 1984 lost->lost = lost_count; 1985 if (evsel->core.ids) { 1986 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx); 1987 sample.id = sid->id; 1988 } 1989 1990 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1), 1991 evsel->core.attr.sample_type, &sample); 1992 lost->header.size = sizeof(*lost) + id_hdr_size; 1993 lost->header.misc = misc_flag; 1994 record__write(rec, NULL, lost, lost->header.size); 1995 perf_sample__exit(&sample); 1996 } 1997 1998 static void record__read_lost_samples(struct record *rec) 1999 { 2000 struct perf_session *session = rec->session; 2001 struct perf_record_lost_samples_and_ids lost; 2002 struct evsel *evsel; 2003 2004 /* there was an error during record__open */ 2005 if (session->evlist == NULL) 2006 return; 2007 2008 evlist__for_each_entry(session->evlist, evsel) { 2009 struct xyarray *xy = evsel->core.sample_id; 2010 u64 lost_count; 2011 2012 if (xy == NULL || evsel->core.fd == NULL) 2013 continue; 2014 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) || 2015 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) { 2016 pr_debug("Unmatched FD vs. 
sample ID: skip reading LOST count\n"); 2017 continue; 2018 } 2019 2020 for (int x = 0; x < xyarray__max_x(xy); x++) { 2021 for (int y = 0; y < xyarray__max_y(xy); y++) { 2022 struct perf_counts_values count; 2023 2024 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) { 2025 pr_debug("read LOST count failed\n"); 2026 return; 2027 } 2028 2029 if (count.lost) { 2030 memset(&lost, 0, sizeof(lost)); 2031 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES; 2032 __record__save_lost_samples(rec, evsel, &lost.lost, 2033 x, y, count.lost, 0); 2034 } 2035 } 2036 } 2037 2038 lost_count = perf_bpf_filter__lost_count(evsel); 2039 if (lost_count) { 2040 memset(&lost, 0, sizeof(lost)); 2041 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES; 2042 __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count, 2043 PERF_RECORD_MISC_LOST_SAMPLES_BPF); 2044 } 2045 } 2046 } 2047 2048 static volatile sig_atomic_t workload_exec_errno; 2049 2050 /* 2051 * evlist__prepare_workload will send a SIGUSR1 2052 * if the fork fails, since we asked by setting its 2053 * want_signal to true. 
2054 */ 2055 static void workload_exec_failed_signal(int signo __maybe_unused, 2056 siginfo_t *info, 2057 void *ucontext __maybe_unused) 2058 { 2059 workload_exec_errno = info->si_value.sival_int; 2060 done = 1; 2061 child_finished = 1; 2062 } 2063 2064 static void snapshot_sig_handler(int sig); 2065 static void alarm_sig_handler(int sig); 2066 2067 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist) 2068 { 2069 if (evlist) { 2070 if (evlist->mmap && evlist->mmap[0].core.base) 2071 return evlist->mmap[0].core.base; 2072 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base) 2073 return evlist->overwrite_mmap[0].core.base; 2074 } 2075 return NULL; 2076 } 2077 2078 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec) 2079 { 2080 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist); 2081 if (pc) 2082 return pc; 2083 return NULL; 2084 } 2085 2086 static int record__synthesize(struct record *rec, bool tail) 2087 { 2088 struct perf_session *session = rec->session; 2089 struct machine *machine = &session->machines.host; 2090 struct perf_data *data = &rec->data; 2091 struct record_opts *opts = &rec->opts; 2092 struct perf_tool *tool = &rec->tool; 2093 int err = 0; 2094 event_op f = process_synthesized_event; 2095 2096 if (rec->opts.tail_synthesize != tail) 2097 return 0; 2098 2099 if (data->is_pipe) { 2100 err = perf_event__synthesize_for_pipe(tool, session, data, 2101 process_synthesized_event); 2102 if (err < 0) 2103 goto out; 2104 2105 rec->bytes_written += err; 2106 } 2107 2108 err = perf_event__synth_time_conv(record__pick_pc(rec), tool, 2109 process_synthesized_event, machine); 2110 if (err) 2111 goto out; 2112 2113 /* Synthesize id_index before auxtrace_info */ 2114 err = perf_event__synthesize_id_index(tool, 2115 process_synthesized_event, 2116 session->evlist, machine); 2117 if (err) 2118 goto out; 2119 2120 if (rec->opts.full_auxtrace) { 2121 err = 
perf_event__synthesize_auxtrace_info(rec->itr, tool, 2122 session, process_synthesized_event); 2123 if (err) 2124 goto out; 2125 } 2126 2127 if (!evlist__exclude_kernel(rec->evlist)) { 2128 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, 2129 machine); 2130 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" 2131 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 2132 "Check /proc/kallsyms permission or run as root.\n"); 2133 2134 err = perf_event__synthesize_modules(tool, process_synthesized_event, 2135 machine); 2136 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" 2137 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 2138 "Check /proc/modules permission or run as root.\n"); 2139 } 2140 2141 if (perf_guest) { 2142 machines__process_guests(&session->machines, 2143 perf_event__synthesize_guest_os, tool); 2144 } 2145 2146 err = perf_event__synthesize_extra_attr(&rec->tool, 2147 rec->evlist, 2148 process_synthesized_event, 2149 data->is_pipe); 2150 if (err) 2151 goto out; 2152 2153 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads, 2154 process_synthesized_event, 2155 NULL); 2156 if (err < 0) { 2157 pr_err("Couldn't synthesize thread map.\n"); 2158 return err; 2159 } 2160 2161 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus, 2162 process_synthesized_event, NULL); 2163 if (err < 0) { 2164 pr_err("Couldn't synthesize cpu map.\n"); 2165 return err; 2166 } 2167 2168 err = perf_event__synthesize_bpf_events(session, process_synthesized_event, 2169 machine, opts); 2170 if (err < 0) { 2171 pr_warning("Couldn't synthesize bpf events.\n"); 2172 err = 0; 2173 } 2174 2175 if (rec->opts.synth & PERF_SYNTH_CGROUP) { 2176 err = perf_event__synthesize_cgroups(tool, process_synthesized_event, 2177 machine); 2178 if (err < 0) { 2179 pr_warning("Couldn't synthesize cgroup events.\n"); 2180 err = 0; 2181 } 2182 } 2183 
2184 if (rec->opts.nr_threads_synthesize > 1) { 2185 mutex_init(&synth_lock); 2186 perf_set_multithreaded(); 2187 f = process_locked_synthesized_event; 2188 } 2189 2190 if (rec->opts.synth & PERF_SYNTH_TASK) { 2191 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP; 2192 2193 err = __machine__synthesize_threads(machine, tool, &opts->target, 2194 rec->evlist->core.threads, 2195 f, needs_mmap, opts->record_data_mmap, 2196 rec->opts.nr_threads_synthesize); 2197 } 2198 2199 if (rec->opts.nr_threads_synthesize > 1) { 2200 perf_set_singlethreaded(); 2201 mutex_destroy(&synth_lock); 2202 } 2203 2204 out: 2205 return err; 2206 } 2207 2208 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused) 2209 { 2210 #ifdef HAVE_LIBBPF_SUPPORT 2211 perf_event__synthesize_final_bpf_metadata(rec->session, 2212 process_synthesized_event); 2213 #endif 2214 } 2215 2216 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data) 2217 { 2218 struct record *rec = data; 2219 pthread_kill(rec->thread_id, SIGUSR2); 2220 return 0; 2221 } 2222 2223 static int record__setup_sb_evlist(struct record *rec) 2224 { 2225 struct record_opts *opts = &rec->opts; 2226 2227 if (rec->sb_evlist != NULL) { 2228 /* 2229 * We get here if --switch-output-event populated the 2230 * sb_evlist, so associate a callback that will send a SIGUSR2 2231 * to the main thread. 
2232 */ 2233 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec); 2234 rec->thread_id = pthread_self(); 2235 } 2236 #ifdef HAVE_LIBBPF_SUPPORT 2237 if (!opts->no_bpf_event) { 2238 if (rec->sb_evlist == NULL) { 2239 rec->sb_evlist = evlist__new(); 2240 2241 if (rec->sb_evlist == NULL) { 2242 pr_err("Couldn't create side band evlist.\n."); 2243 return -1; 2244 } 2245 } 2246 2247 if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) { 2248 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); 2249 return -1; 2250 } 2251 } 2252 #endif 2253 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) { 2254 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); 2255 opts->no_bpf_event = true; 2256 } 2257 2258 return 0; 2259 } 2260 2261 static int record__init_clock(struct record *rec) 2262 { 2263 struct perf_session *session = rec->session; 2264 struct timespec ref_clockid; 2265 struct timeval ref_tod; 2266 struct perf_env *env = perf_session__env(session); 2267 u64 ref; 2268 2269 if (!rec->opts.use_clockid) 2270 return 0; 2271 2272 if (rec->opts.use_clockid && rec->opts.clockid_res_ns) 2273 env->clock.clockid_res_ns = rec->opts.clockid_res_ns; 2274 2275 env->clock.clockid = rec->opts.clockid; 2276 2277 if (gettimeofday(&ref_tod, NULL) != 0) { 2278 pr_err("gettimeofday failed, cannot set reference time.\n"); 2279 return -1; 2280 } 2281 2282 if (clock_gettime(rec->opts.clockid, &ref_clockid)) { 2283 pr_err("clock_gettime failed, cannot set reference time.\n"); 2284 return -1; 2285 } 2286 2287 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC + 2288 (u64) ref_tod.tv_usec * NSEC_PER_USEC; 2289 2290 env->clock.tod_ns = ref; 2291 2292 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC + 2293 (u64) ref_clockid.tv_nsec; 2294 2295 env->clock.clockid_ns = ref; 2296 return 0; 2297 } 2298 2299 static void hit_auxtrace_snapshot_trigger(struct record *rec) 2300 { 2301 if 
(trigger_is_ready(&auxtrace_snapshot_trigger)) { 2302 trigger_hit(&auxtrace_snapshot_trigger); 2303 auxtrace_record__snapshot_started = 1; 2304 if (auxtrace_record__snapshot_start(rec->itr)) 2305 trigger_error(&auxtrace_snapshot_trigger); 2306 } 2307 } 2308 2309 static int record__terminate_thread(struct record_thread *thread_data) 2310 { 2311 int err; 2312 enum thread_msg ack = THREAD_MSG__UNDEFINED; 2313 pid_t tid = thread_data->tid; 2314 2315 close(thread_data->pipes.msg[1]); 2316 thread_data->pipes.msg[1] = -1; 2317 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack)); 2318 if (err > 0) 2319 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]); 2320 else 2321 pr_warning("threads[%d]: failed to receive termination notification from %d\n", 2322 thread->tid, tid); 2323 2324 return 0; 2325 } 2326 2327 static int record__start_threads(struct record *rec) 2328 { 2329 int t, tt, err, ret = 0, nr_threads = rec->nr_threads; 2330 struct record_thread *thread_data = rec->thread_data; 2331 sigset_t full, mask; 2332 pthread_t handle; 2333 pthread_attr_t attrs; 2334 2335 thread = &thread_data[0]; 2336 2337 if (!record__threads_enabled(rec)) 2338 return 0; 2339 2340 sigfillset(&full); 2341 if (sigprocmask(SIG_SETMASK, &full, &mask)) { 2342 pr_err("Failed to block signals on threads start: %m\n"); 2343 return -1; 2344 } 2345 2346 pthread_attr_init(&attrs); 2347 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); 2348 2349 for (t = 1; t < nr_threads; t++) { 2350 enum thread_msg msg = THREAD_MSG__UNDEFINED; 2351 2352 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP 2353 pthread_attr_setaffinity_np(&attrs, 2354 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)), 2355 (cpu_set_t *)(thread_data[t].mask->affinity.bits)); 2356 #endif 2357 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) { 2358 for (tt = 1; tt < t; tt++) 2359 record__terminate_thread(&thread_data[t]); 2360 pr_err("Failed to start threads: %m\n"); 2361 ret = -1; 2362 goto out_err; 
2363 } 2364 2365 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg)); 2366 if (err > 0) 2367 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid, 2368 thread_msg_tags[msg]); 2369 else 2370 pr_warning("threads[%d]: failed to receive start notification from %d\n", 2371 thread->tid, rec->thread_data[t].tid); 2372 } 2373 2374 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity), 2375 (cpu_set_t *)thread->mask->affinity.bits); 2376 2377 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu()); 2378 2379 out_err: 2380 pthread_attr_destroy(&attrs); 2381 2382 if (sigprocmask(SIG_SETMASK, &mask, NULL)) { 2383 pr_err("Failed to unblock signals on threads start: %m\n"); 2384 ret = -1; 2385 } 2386 2387 return ret; 2388 } 2389 2390 static int record__stop_threads(struct record *rec) 2391 { 2392 int t; 2393 struct record_thread *thread_data = rec->thread_data; 2394 2395 for (t = 1; t < rec->nr_threads; t++) 2396 record__terminate_thread(&thread_data[t]); 2397 2398 for (t = 0; t < rec->nr_threads; t++) { 2399 rec->samples += thread_data[t].samples; 2400 if (!record__threads_enabled(rec)) 2401 continue; 2402 rec->session->bytes_transferred += thread_data[t].bytes_transferred; 2403 rec->session->bytes_compressed += thread_data[t].bytes_compressed; 2404 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid, 2405 thread_data[t].samples, thread_data[t].waking); 2406 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed) 2407 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n", 2408 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed); 2409 else 2410 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written); 2411 } 2412 2413 return 0; 2414 } 2415 2416 static unsigned long record__waking(struct record *rec) 2417 { 2418 int t; 2419 unsigned long waking = 0; 2420 struct record_thread *thread_data = rec->thread_data; 2421 2422 for (t = 0; t < rec->nr_threads; t++) 2423 waking += 
thread_data[t].waking; 2424 2425 return waking; 2426 } 2427 2428 static int __cmd_record(struct record *rec, int argc, const char **argv) 2429 { 2430 int err; 2431 int status = 0; 2432 const bool forks = argc > 0; 2433 struct perf_tool *tool = &rec->tool; 2434 struct record_opts *opts = &rec->opts; 2435 struct perf_data *data = &rec->data; 2436 struct perf_session *session; 2437 bool disabled = false, draining = false; 2438 int fd; 2439 float ratio = 0; 2440 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED; 2441 struct perf_env *env; 2442 2443 atexit(record__sig_exit); 2444 signal(SIGCHLD, sig_handler); 2445 signal(SIGINT, sig_handler); 2446 signal(SIGTERM, sig_handler); 2447 signal(SIGSEGV, sigsegv_handler); 2448 2449 if (rec->opts.record_cgroup) { 2450 #ifndef HAVE_FILE_HANDLE 2451 pr_err("cgroup tracking is not supported\n"); 2452 return -1; 2453 #endif 2454 } 2455 2456 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) { 2457 signal(SIGUSR2, snapshot_sig_handler); 2458 if (rec->opts.auxtrace_snapshot_mode) 2459 trigger_on(&auxtrace_snapshot_trigger); 2460 if (rec->switch_output.enabled) 2461 trigger_on(&switch_output_trigger); 2462 } else { 2463 signal(SIGUSR2, SIG_IGN); 2464 } 2465 2466 perf_tool__init(tool, /*ordered_events=*/true); 2467 tool->sample = process_sample_event; 2468 tool->fork = perf_event__process_fork; 2469 tool->exit = perf_event__process_exit; 2470 tool->comm = perf_event__process_comm; 2471 tool->namespaces = perf_event__process_namespaces; 2472 tool->mmap = build_id__process_mmap; 2473 tool->mmap2 = build_id__process_mmap2; 2474 tool->itrace_start = process_timestamp_boundary; 2475 tool->aux = process_timestamp_boundary; 2476 tool->namespace_events = rec->opts.record_namespaces; 2477 tool->cgroup_events = rec->opts.record_cgroup; 2478 session = perf_session__new(data, tool); 2479 if (IS_ERR(session)) { 2480 pr_err("Perf session creation failed.\n"); 2481 return PTR_ERR(session); 2482 } 2483 env = 
perf_session__env(session); 2484 if (record__threads_enabled(rec)) { 2485 if (perf_data__is_pipe(&rec->data)) { 2486 pr_err("Parallel trace streaming is not available in pipe mode.\n"); 2487 return -1; 2488 } 2489 if (rec->opts.full_auxtrace) { 2490 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n"); 2491 return -1; 2492 } 2493 } 2494 2495 fd = perf_data__fd(data); 2496 rec->session = session; 2497 2498 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) { 2499 pr_err("Compression initialization failed.\n"); 2500 return -1; 2501 } 2502 #ifdef HAVE_EVENTFD_SUPPORT 2503 done_fd = eventfd(0, EFD_NONBLOCK); 2504 if (done_fd < 0) { 2505 pr_err("Failed to create wakeup eventfd, error: %m\n"); 2506 status = -1; 2507 goto out_delete_session; 2508 } 2509 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd); 2510 if (err < 0) { 2511 pr_err("Failed to add wakeup eventfd to poll list\n"); 2512 status = err; 2513 goto out_delete_session; 2514 } 2515 #endif // HAVE_EVENTFD_SUPPORT 2516 2517 env->comp_type = PERF_COMP_ZSTD; 2518 env->comp_level = rec->opts.comp_level; 2519 2520 if (rec->opts.kcore && 2521 !record__kcore_readable(&session->machines.host)) { 2522 pr_err("ERROR: kcore is not readable.\n"); 2523 return -1; 2524 } 2525 2526 if (record__init_clock(rec)) 2527 return -1; 2528 2529 record__init_features(rec); 2530 2531 if (forks) { 2532 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe, 2533 workload_exec_failed_signal); 2534 if (err < 0) { 2535 pr_err("Couldn't run the workload!\n"); 2536 status = err; 2537 goto out_delete_session; 2538 } 2539 } 2540 2541 /* 2542 * If we have just single event and are sending data 2543 * through pipe, we need to force the ids allocation, 2544 * because we synthesize event name through the pipe 2545 * and need the id for that. 
2546 */ 2547 if (data->is_pipe && rec->evlist->core.nr_entries == 1) 2548 rec->opts.sample_id = true; 2549 2550 if (rec->timestamp_filename && perf_data__is_pipe(data)) { 2551 rec->timestamp_filename = false; 2552 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n"); 2553 } 2554 2555 /* 2556 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE 2557 * and hybrid_merge is false. 2558 */ 2559 evlist__uniquify_evsel_names(rec->evlist, &stat_config); 2560 2561 evlist__config(rec->evlist, opts, &callchain_param); 2562 2563 /* Debug message used by test scripts */ 2564 pr_debug3("perf record opening and mmapping events\n"); 2565 if (record__open(rec) != 0) { 2566 err = -1; 2567 goto out_free_threads; 2568 } 2569 /* Debug message used by test scripts */ 2570 pr_debug3("perf record done opening and mmapping events\n"); 2571 env->comp_mmap_len = session->evlist->core.mmap_len; 2572 2573 if (rec->opts.kcore) { 2574 err = record__kcore_copy(&session->machines.host, data); 2575 if (err) { 2576 pr_err("ERROR: Failed to copy kcore\n"); 2577 goto out_free_threads; 2578 } 2579 } 2580 2581 /* 2582 * Normally perf_session__new would do this, but it doesn't have the 2583 * evlist. 2584 */ 2585 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) { 2586 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n"); 2587 rec->tool.ordered_events = false; 2588 } 2589 2590 if (evlist__nr_groups(rec->evlist) == 0) 2591 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC); 2592 2593 if (data->is_pipe) { 2594 err = perf_header__write_pipe(fd); 2595 if (err < 0) 2596 goto out_free_threads; 2597 } else { 2598 err = perf_session__write_header(session, rec->evlist, fd, false); 2599 if (err < 0) 2600 goto out_free_threads; 2601 } 2602 2603 err = -1; 2604 if (!rec->no_buildid 2605 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { 2606 pr_err("Couldn't generate buildids. 
" 2607 "Use --no-buildid to profile anyway.\n"); 2608 goto out_free_threads; 2609 } 2610 2611 if (!evlist__needs_bpf_sb_event(rec->evlist)) 2612 opts->no_bpf_event = true; 2613 2614 err = record__setup_sb_evlist(rec); 2615 if (err) 2616 goto out_free_threads; 2617 2618 err = record__synthesize(rec, false); 2619 if (err < 0) 2620 goto out_free_threads; 2621 2622 if (rec->realtime_prio) { 2623 struct sched_param param; 2624 2625 param.sched_priority = rec->realtime_prio; 2626 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 2627 pr_err("Could not set realtime priority.\n"); 2628 err = -1; 2629 goto out_free_threads; 2630 } 2631 } 2632 2633 if (record__start_threads(rec)) 2634 goto out_free_threads; 2635 2636 /* 2637 * When perf is starting the traced process, all the events 2638 * (apart from group members) have enable_on_exec=1 set, 2639 * so don't spoil it by prematurely enabling them. 2640 */ 2641 if (!target__none(&opts->target) && !opts->target.initial_delay) 2642 evlist__enable(rec->evlist); 2643 2644 /* 2645 * offcpu-time does not call execve, so enable_on_exe wouldn't work 2646 * when recording a workload, do it manually 2647 */ 2648 if (rec->off_cpu) 2649 evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT); 2650 2651 /* 2652 * Let the child rip 2653 */ 2654 if (forks) { 2655 struct machine *machine = &session->machines.host; 2656 union perf_event *event; 2657 pid_t tgid; 2658 2659 event = malloc(sizeof(event->comm) + machine->id_hdr_size); 2660 if (event == NULL) { 2661 err = -ENOMEM; 2662 goto out_child; 2663 } 2664 2665 /* 2666 * Some H/W events are generated before COMM event 2667 * which is emitted during exec(), so perf script 2668 * cannot see a correct process name for those events. 2669 * Synthesize COMM event to prevent it. 
2670 */ 2671 tgid = perf_event__synthesize_comm(tool, event, 2672 rec->evlist->workload.pid, 2673 process_synthesized_event, 2674 machine); 2675 free(event); 2676 2677 if (tgid == -1) 2678 goto out_child; 2679 2680 event = malloc(sizeof(event->namespaces) + 2681 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + 2682 machine->id_hdr_size); 2683 if (event == NULL) { 2684 err = -ENOMEM; 2685 goto out_child; 2686 } 2687 2688 /* 2689 * Synthesize NAMESPACES event for the command specified. 2690 */ 2691 perf_event__synthesize_namespaces(tool, event, 2692 rec->evlist->workload.pid, 2693 tgid, process_synthesized_event, 2694 machine); 2695 free(event); 2696 2697 evlist__start_workload(rec->evlist); 2698 } 2699 2700 if (opts->target.initial_delay) { 2701 pr_info(EVLIST_DISABLED_MSG); 2702 if (opts->target.initial_delay > 0) { 2703 usleep(opts->target.initial_delay * USEC_PER_MSEC); 2704 evlist__enable(rec->evlist); 2705 pr_info(EVLIST_ENABLED_MSG); 2706 } 2707 } 2708 2709 err = event_enable_timer__start(rec->evlist->eet); 2710 if (err) 2711 goto out_child; 2712 2713 /* Debug message used by test scripts */ 2714 pr_debug3("perf record has started\n"); 2715 fflush(stderr); 2716 2717 trigger_ready(&auxtrace_snapshot_trigger); 2718 trigger_ready(&switch_output_trigger); 2719 perf_hooks__invoke_record_start(); 2720 2721 /* 2722 * Must write FINISHED_INIT so it will be seen after all other 2723 * synthesized user events, but before any regular events. 2724 */ 2725 err = write_finished_init(rec, false); 2726 if (err < 0) 2727 goto out_child; 2728 2729 for (;;) { 2730 unsigned long long hits = thread->samples; 2731 2732 /* 2733 * rec->evlist->bkw_mmap_state is possible to be 2734 * BKW_MMAP_EMPTY here: when done == true and 2735 * hits != rec->samples in previous round. 2736 * 2737 * evlist__toggle_bkw_mmap ensure we never 2738 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING. 
2739 */ 2740 if (trigger_is_hit(&switch_output_trigger) || done || draining) 2741 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING); 2742 2743 if (record__mmap_read_all(rec, false) < 0) { 2744 trigger_error(&auxtrace_snapshot_trigger); 2745 trigger_error(&switch_output_trigger); 2746 err = -1; 2747 goto out_child; 2748 } 2749 2750 if (auxtrace_record__snapshot_started) { 2751 auxtrace_record__snapshot_started = 0; 2752 if (!trigger_is_error(&auxtrace_snapshot_trigger)) 2753 record__read_auxtrace_snapshot(rec, false); 2754 if (trigger_is_error(&auxtrace_snapshot_trigger)) { 2755 pr_err("AUX area tracing snapshot failed\n"); 2756 err = -1; 2757 goto out_child; 2758 } 2759 } 2760 2761 if (trigger_is_hit(&switch_output_trigger)) { 2762 /* 2763 * If switch_output_trigger is hit, the data in 2764 * overwritable ring buffer should have been collected, 2765 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY. 2766 * 2767 * If SIGUSR2 raise after or during record__mmap_read_all(), 2768 * record__mmap_read_all() didn't collect data from 2769 * overwritable ring buffer. Read again. 2770 */ 2771 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING) 2772 continue; 2773 trigger_ready(&switch_output_trigger); 2774 2775 /* 2776 * Reenable events in overwrite ring buffer after 2777 * record__mmap_read_all(): we should have collected 2778 * data from it. 
2779 */ 2780 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING); 2781 2782 if (!quiet) 2783 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n", 2784 record__waking(rec)); 2785 thread->waking = 0; 2786 fd = record__switch_output(rec, false); 2787 if (fd < 0) { 2788 pr_err("Failed to switch to new file\n"); 2789 trigger_error(&switch_output_trigger); 2790 err = fd; 2791 goto out_child; 2792 } 2793 2794 /* re-arm the alarm */ 2795 if (rec->switch_output.time) 2796 alarm(rec->switch_output.time); 2797 } 2798 2799 if (hits == thread->samples) { 2800 if (done || draining) 2801 break; 2802 err = fdarray__poll(&thread->pollfd, -1); 2803 /* 2804 * Propagate error, only if there's any. Ignore positive 2805 * number of returned events and interrupt error. 2806 */ 2807 if (err > 0 || (err < 0 && errno == EINTR)) 2808 err = 0; 2809 thread->waking++; 2810 2811 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP, 2812 record__thread_munmap_filtered, NULL) == 0) 2813 draining = true; 2814 2815 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread); 2816 if (err) 2817 goto out_child; 2818 } 2819 2820 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) { 2821 switch (cmd) { 2822 case EVLIST_CTL_CMD_SNAPSHOT: 2823 hit_auxtrace_snapshot_trigger(rec); 2824 evlist__ctlfd_ack(rec->evlist); 2825 break; 2826 case EVLIST_CTL_CMD_STOP: 2827 done = 1; 2828 break; 2829 case EVLIST_CTL_CMD_ACK: 2830 case EVLIST_CTL_CMD_UNSUPPORTED: 2831 case EVLIST_CTL_CMD_ENABLE: 2832 case EVLIST_CTL_CMD_DISABLE: 2833 case EVLIST_CTL_CMD_EVLIST: 2834 case EVLIST_CTL_CMD_PING: 2835 default: 2836 break; 2837 } 2838 } 2839 2840 err = event_enable_timer__process(rec->evlist->eet); 2841 if (err < 0) 2842 goto out_child; 2843 if (err) { 2844 err = 0; 2845 done = 1; 2846 } 2847 2848 /* 2849 * When perf is starting the traced process, at the end events 2850 * die with the process and we wait for that. Thus no need to 2851 * disable events in this case. 
2852 */ 2853 if (done && !disabled && !target__none(&opts->target)) { 2854 trigger_off(&auxtrace_snapshot_trigger); 2855 evlist__disable(rec->evlist); 2856 disabled = true; 2857 } 2858 } 2859 2860 trigger_off(&auxtrace_snapshot_trigger); 2861 trigger_off(&switch_output_trigger); 2862 2863 record__synthesize_final_bpf_metadata(rec); 2864 2865 if (opts->auxtrace_snapshot_on_exit) 2866 record__auxtrace_snapshot_exit(rec); 2867 2868 if (forks && workload_exec_errno) { 2869 char msg[STRERR_BUFSIZE]; 2870 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); 2871 struct strbuf sb = STRBUF_INIT; 2872 2873 evlist__format_evsels(rec->evlist, &sb, 2048); 2874 2875 pr_err("Failed to collect '%s' for the '%s' workload: %s\n", 2876 sb.buf, argv[0], emsg); 2877 strbuf_release(&sb); 2878 err = -1; 2879 goto out_child; 2880 } 2881 2882 if (!quiet) 2883 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", 2884 record__waking(rec)); 2885 2886 write_finished_init(rec, true); 2887 2888 if (target__none(&rec->opts.target)) 2889 record__synthesize_workload(rec, true); 2890 2891 out_child: 2892 record__stop_threads(rec); 2893 record__mmap_read_all(rec, true); 2894 out_free_threads: 2895 record__free_thread_data(rec); 2896 evlist__finalize_ctlfd(rec->evlist); 2897 record__aio_mmap_read_sync(rec); 2898 2899 if (rec->session->bytes_transferred && rec->session->bytes_compressed) { 2900 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed; 2901 env->comp_ratio = ratio + 0.5; 2902 } 2903 2904 if (forks) { 2905 int exit_status; 2906 2907 if (!child_finished) 2908 kill(rec->evlist->workload.pid, SIGTERM); 2909 2910 wait(&exit_status); 2911 2912 if (err < 0) 2913 status = err; 2914 else if (WIFEXITED(exit_status)) 2915 status = WEXITSTATUS(exit_status); 2916 else if (WIFSIGNALED(exit_status)) 2917 signr = WTERMSIG(exit_status); 2918 } else 2919 status = err; 2920 2921 if (rec->off_cpu) 2922 rec->bytes_written += 
off_cpu_write(rec->session); 2923 2924 record__read_lost_samples(rec); 2925 /* this will be recalculated during process_buildids() */ 2926 rec->samples = 0; 2927 2928 if (!err) { 2929 record__synthesize(rec, true); 2930 if (!rec->timestamp_filename) { 2931 record__finish_output(rec); 2932 } else { 2933 fd = record__switch_output(rec, true); 2934 if (fd < 0) { 2935 status = fd; 2936 goto out_delete_session; 2937 } 2938 } 2939 } 2940 2941 perf_hooks__invoke_record_end(); 2942 2943 if (!err && !quiet) { 2944 char samples[128]; 2945 const char *postfix = rec->timestamp_filename ? 2946 ".<timestamp>" : ""; 2947 2948 if (rec->samples && !rec->opts.full_auxtrace) 2949 scnprintf(samples, sizeof(samples), 2950 " (%" PRIu64 " samples)", rec->samples); 2951 else 2952 samples[0] = '\0'; 2953 2954 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s", 2955 perf_data__size(data) / 1024.0 / 1024.0, 2956 data->path, postfix, samples); 2957 if (ratio) { 2958 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)", 2959 rec->session->bytes_transferred / 1024.0 / 1024.0, 2960 ratio); 2961 } 2962 fprintf(stderr, " ]\n"); 2963 } 2964 2965 out_delete_session: 2966 #ifdef HAVE_EVENTFD_SUPPORT 2967 if (done_fd >= 0) { 2968 fd = done_fd; 2969 done_fd = -1; 2970 2971 close(fd); 2972 } 2973 #endif 2974 zstd_fini(&session->zstd_data); 2975 if (!opts->no_bpf_event) 2976 evlist__stop_sb_thread(rec->sb_evlist); 2977 2978 perf_session__delete(session); 2979 return status; 2980 } 2981 2982 static int record_parse_callchain_opt(const struct option *opt, 2983 const char *arg, 2984 int unset) 2985 { 2986 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); 2987 } 2988 2989 static int record_callchain_opt(const struct option *opt, 2990 const char *arg __maybe_unused, 2991 int unset) 2992 { 2993 /* 2994 * The -g option only sets the callchain if not already configured by 2995 * .perfconfig. It does, however, enable it. 
2996 */ 2997 if (callchain_param.record_mode != CALLCHAIN_NONE) { 2998 callchain_param.enabled = true; 2999 return 0; 3000 } 3001 3002 return record_opts__parse_callchain(opt->value, &callchain_param, 3003 EM_HOST != EM_S390 ? "fp" : "dwarf", 3004 unset); 3005 } 3006 3007 3008 static int perf_record_config(const char *var, const char *value, void *cb) 3009 { 3010 struct record *rec = cb; 3011 3012 if (!strcmp(var, "record.build-id")) { 3013 if (!strcmp(value, "cache")) 3014 rec->no_buildid_cache = false; 3015 else if (!strcmp(value, "no-cache")) 3016 rec->no_buildid_cache = true; 3017 else if (!strcmp(value, "skip")) 3018 rec->no_buildid = rec->no_buildid_cache = true; 3019 else if (!strcmp(value, "mmap")) 3020 rec->buildid_mmap = true; 3021 else if (!strcmp(value, "no-mmap")) 3022 rec->buildid_mmap = false; 3023 else 3024 return -1; 3025 return 0; 3026 } 3027 if (!strcmp(var, "record.call-graph")) { 3028 var = "call-graph.record-mode"; 3029 return perf_default_config(var, value, cb); 3030 } 3031 #ifdef HAVE_AIO_SUPPORT 3032 if (!strcmp(var, "record.aio")) { 3033 rec->opts.nr_cblocks = strtol(value, NULL, 0); 3034 if (!rec->opts.nr_cblocks) 3035 rec->opts.nr_cblocks = nr_cblocks_default; 3036 } 3037 #endif 3038 if (!strcmp(var, "record.debuginfod")) { 3039 rec->debuginfod.urls = strdup(value); 3040 if (!rec->debuginfod.urls) 3041 return -ENOMEM; 3042 rec->debuginfod.set = true; 3043 } 3044 3045 return 0; 3046 } 3047 3048 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset) 3049 { 3050 struct record *rec = (struct record *)opt->value; 3051 3052 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset); 3053 } 3054 3055 static int record__parse_affinity(const struct option *opt, const char *str, int unset) 3056 { 3057 struct record_opts *opts = (struct record_opts *)opt->value; 3058 3059 if (unset || !str) 3060 return 0; 3061 3062 if (!strcasecmp(str, "node")) 3063 opts->affinity = PERF_AFFINITY_NODE; 3064 
else if (!strcasecmp(str, "cpu")) 3065 opts->affinity = PERF_AFFINITY_CPU; 3066 3067 return 0; 3068 } 3069 3070 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits) 3071 { 3072 mask->nbits = nr_bits; 3073 mask->bits = bitmap_zalloc(mask->nbits); 3074 if (!mask->bits) 3075 return -ENOMEM; 3076 3077 return 0; 3078 } 3079 3080 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask) 3081 { 3082 bitmap_free(mask->bits); 3083 mask->nbits = 0; 3084 } 3085 3086 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits) 3087 { 3088 int ret; 3089 3090 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits); 3091 if (ret) { 3092 mask->affinity.bits = NULL; 3093 return ret; 3094 } 3095 3096 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits); 3097 if (ret) { 3098 record__mmap_cpu_mask_free(&mask->maps); 3099 mask->maps.bits = NULL; 3100 } 3101 3102 return ret; 3103 } 3104 3105 static void record__thread_mask_free(struct thread_mask *mask) 3106 { 3107 record__mmap_cpu_mask_free(&mask->maps); 3108 record__mmap_cpu_mask_free(&mask->affinity); 3109 } 3110 3111 static int record__parse_threads(const struct option *opt, const char *str, int unset) 3112 { 3113 int s; 3114 struct record_opts *opts = opt->value; 3115 3116 if (unset || !str || !strlen(str)) { 3117 opts->threads_spec = THREAD_SPEC__CPU; 3118 } else { 3119 for (s = 1; s < THREAD_SPEC__MAX; s++) { 3120 if (s == THREAD_SPEC__USER) { 3121 opts->threads_user_spec = strdup(str); 3122 if (!opts->threads_user_spec) 3123 return -ENOMEM; 3124 opts->threads_spec = THREAD_SPEC__USER; 3125 break; 3126 } 3127 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) { 3128 opts->threads_spec = s; 3129 break; 3130 } 3131 } 3132 } 3133 3134 if (opts->threads_spec == THREAD_SPEC__USER) 3135 pr_debug("threads_spec: %s\n", opts->threads_user_spec); 3136 else 3137 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]); 3138 3139 return 0; 3140 } 3141 
3142 static int parse_output_max_size(const struct option *opt, 3143 const char *str, int unset) 3144 { 3145 unsigned long *s = (unsigned long *)opt->value; 3146 static struct parse_tag tags_size[] = { 3147 { .tag = 'B', .mult = 1 }, 3148 { .tag = 'K', .mult = 1 << 10 }, 3149 { .tag = 'M', .mult = 1 << 20 }, 3150 { .tag = 'G', .mult = 1 << 30 }, 3151 { .tag = 0 }, 3152 }; 3153 unsigned long val; 3154 3155 if (unset) { 3156 *s = 0; 3157 return 0; 3158 } 3159 3160 val = parse_tag_value(str, tags_size); 3161 if (val != (unsigned long) -1) { 3162 *s = val; 3163 return 0; 3164 } 3165 3166 return -1; 3167 } 3168 3169 static int record__parse_mmap_pages(const struct option *opt, 3170 const char *str, 3171 int unset __maybe_unused) 3172 { 3173 struct record_opts *opts = opt->value; 3174 char *s, *p; 3175 unsigned int mmap_pages; 3176 int ret; 3177 3178 if (!str) 3179 return -EINVAL; 3180 3181 s = strdup(str); 3182 if (!s) 3183 return -ENOMEM; 3184 3185 p = strchr(s, ','); 3186 if (p) 3187 *p = '\0'; 3188 3189 if (*s) { 3190 ret = __evlist__parse_mmap_pages(&mmap_pages, s); 3191 if (ret) 3192 goto out_free; 3193 opts->mmap_pages = mmap_pages; 3194 } 3195 3196 if (!p) { 3197 ret = 0; 3198 goto out_free; 3199 } 3200 3201 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1); 3202 if (ret) 3203 goto out_free; 3204 3205 opts->auxtrace_mmap_pages = mmap_pages; 3206 3207 out_free: 3208 free(s); 3209 return ret; 3210 } 3211 3212 static int record__parse_off_cpu_thresh(const struct option *opt, 3213 const char *str, 3214 int unset __maybe_unused) 3215 { 3216 struct record_opts *opts = opt->value; 3217 char *endptr; 3218 u64 off_cpu_thresh_ms; 3219 3220 if (!str) 3221 return -EINVAL; 3222 3223 off_cpu_thresh_ms = strtoull(str, &endptr, 10); 3224 3225 /* the threshold isn't string "0", yet strtoull() returns 0, parsing failed */ 3226 if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0"))) 3227 return -EINVAL; 3228 else 3229 opts->off_cpu_thresh_ns = off_cpu_thresh_ms * 
NSEC_PER_MSEC; 3230 3231 return 0; 3232 } 3233 3234 static int parse_control_option(const struct option *opt, 3235 const char *str, 3236 int unset __maybe_unused) 3237 { 3238 struct record_opts *opts = opt->value; 3239 3240 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close); 3241 } 3242 3243 static void switch_output_size_warn(struct record *rec) 3244 { 3245 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages); 3246 struct switch_output *s = &rec->switch_output; 3247 3248 wakeup_size /= 2; 3249 3250 if (s->size < wakeup_size) { 3251 char buf[100]; 3252 3253 unit_number__scnprintf(buf, sizeof(buf), wakeup_size); 3254 pr_warning("WARNING: switch-output data size lower than " 3255 "wakeup kernel buffer size (%s) " 3256 "expect bigger perf.data sizes\n", buf); 3257 } 3258 } 3259 3260 static int switch_output_setup(struct record *rec) 3261 { 3262 struct switch_output *s = &rec->switch_output; 3263 static struct parse_tag tags_size[] = { 3264 { .tag = 'B', .mult = 1 }, 3265 { .tag = 'K', .mult = 1 << 10 }, 3266 { .tag = 'M', .mult = 1 << 20 }, 3267 { .tag = 'G', .mult = 1 << 30 }, 3268 { .tag = 0 }, 3269 }; 3270 static struct parse_tag tags_time[] = { 3271 { .tag = 's', .mult = 1 }, 3272 { .tag = 'm', .mult = 60 }, 3273 { .tag = 'h', .mult = 60*60 }, 3274 { .tag = 'd', .mult = 60*60*24 }, 3275 { .tag = 0 }, 3276 }; 3277 unsigned long val; 3278 3279 /* 3280 * If we're using --switch-output-events, then we imply its 3281 * --switch-output=signal, as we'll send a SIGUSR2 from the side band 3282 * thread to its parent. 
3283 */ 3284 if (rec->switch_output_event_set) { 3285 if (record__threads_enabled(rec)) { 3286 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n"); 3287 return 0; 3288 } 3289 goto do_signal; 3290 } 3291 3292 if (!s->set) 3293 return 0; 3294 3295 if (record__threads_enabled(rec)) { 3296 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n"); 3297 return 0; 3298 } 3299 3300 if (!strcmp(s->str, "signal")) { 3301 do_signal: 3302 s->signal = true; 3303 pr_debug("switch-output with SIGUSR2 signal\n"); 3304 goto enabled; 3305 } 3306 3307 val = parse_tag_value(s->str, tags_size); 3308 if (val != (unsigned long) -1) { 3309 s->size = val; 3310 pr_debug("switch-output with %s size threshold\n", s->str); 3311 goto enabled; 3312 } 3313 3314 val = parse_tag_value(s->str, tags_time); 3315 if (val != (unsigned long) -1) { 3316 s->time = val; 3317 pr_debug("switch-output with %s time threshold (%lu seconds)\n", 3318 s->str, s->time); 3319 goto enabled; 3320 } 3321 3322 return -1; 3323 3324 enabled: 3325 rec->timestamp_filename = true; 3326 s->enabled = true; 3327 3328 if (s->size && !rec->opts.no_buffering) 3329 switch_output_size_warn(rec); 3330 3331 return 0; 3332 } 3333 3334 static const char * const __record_usage[] = { 3335 "perf record [<options>] [<command>]", 3336 "perf record [<options>] -- <command> [<options>]", 3337 NULL 3338 }; 3339 const char * const *record_usage = __record_usage; 3340 3341 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, 3342 struct perf_sample *sample, struct machine *machine) 3343 { 3344 /* 3345 * We already have the kernel maps, put in place via perf_session__create_kernel_maps() 3346 * no need to add them twice. 
3347 */ 3348 if (!(event->header.misc & PERF_RECORD_MISC_USER)) 3349 return 0; 3350 return perf_event__process_mmap(tool, event, sample, machine); 3351 } 3352 3353 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, 3354 struct perf_sample *sample, struct machine *machine) 3355 { 3356 /* 3357 * We already have the kernel maps, put in place via perf_session__create_kernel_maps() 3358 * no need to add them twice. 3359 */ 3360 if (!(event->header.misc & PERF_RECORD_MISC_USER)) 3361 return 0; 3362 3363 return perf_event__process_mmap2(tool, event, sample, machine); 3364 } 3365 3366 static int process_timestamp_boundary(const struct perf_tool *tool, 3367 union perf_event *event __maybe_unused, 3368 struct perf_sample *sample, 3369 struct machine *machine __maybe_unused) 3370 { 3371 struct record *rec = container_of(tool, struct record, tool); 3372 3373 set_timestamp_boundary(rec, sample->time); 3374 return 0; 3375 } 3376 3377 static int parse_record_synth_option(const struct option *opt, 3378 const char *str, 3379 int unset __maybe_unused) 3380 { 3381 struct record_opts *opts = opt->value; 3382 char *p = strdup(str); 3383 3384 if (p == NULL) 3385 return -1; 3386 3387 opts->synth = parse_synth_opt(p); 3388 free(p); 3389 3390 if (opts->synth < 0) { 3391 pr_err("Invalid synth option: %s\n", str); 3392 return -1; 3393 } 3394 return 0; 3395 } 3396 3397 /* 3398 * XXX Ideally would be local to cmd_record() and passed to a record__new 3399 * because we need to have access to it in record__exit, that is called 3400 * after cmd_record() exits, but since record_options need to be accessible to 3401 * builtin-script, leave it here. 3402 * 3403 * At least we don't ouch it in all the other functions here directly. 3404 * 3405 * Just say no to tons of global variables, sigh. 
*/
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
		.off_cpu_thresh_ns   = OFFCPU_THRESH,
	},
	.buildid_mmap = true,
};

/* Call-graph help text with the default mode ("fp") appended. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/* Callback argument of the -e/--event option: events are parsed into record.evlist. */
static struct parse_events_option_args parse_events_option_args = {
	.evlistp = &record.evlist,
};

/*
 * Same kind of callback argument, but targeting the side band evlist
 * (record.sb_evlist).  The option that uses it is not part of this chunk -
 * presumably --switch-output-event; verify against the option table.
 */
static struct parse_events_option_args switch_output_parse_events_option_args = {
	.evlistp = &record.sb_evlist,
};

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN(0, "latency", &record.latency,
		    "Enable data collection for latency profiling.\n"
		    "\t\t\t Use perf report --latency for latency-centric profile."),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts,
"freq or 'max'", 3487 "profile at this frequency", 3488 record__parse_freq), 3489 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]", 3490 "number of mmap data pages and AUX area tracing mmap pages", 3491 record__parse_mmap_pages), 3492 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number", 3493 "Minimal number of bytes that is extracted from mmap data pages (default: 1)", 3494 record__mmap_flush_parse), 3495 OPT_CALLBACK_NOOPT('g', NULL, &record.opts, 3496 NULL, "enables call-graph recording" , 3497 &record_callchain_opt), 3498 OPT_CALLBACK(0, "call-graph", &record.opts, 3499 "record_mode[,record_size]", record_callchain_help, 3500 &record_parse_callchain_opt), 3501 OPT_INCR('v', "verbose", &verbose, 3502 "be more verbose (show counter open errors, etc)"), 3503 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"), 3504 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, 3505 "per thread counts"), 3506 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), 3507 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, 3508 "Record the sample physical addresses"), 3509 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size, 3510 "Record the sampled data address data page size"), 3511 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size, 3512 "Record the sampled code address (ip) page size"), 3513 OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src, 3514 "Record the data source for memory operations"), 3515 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), 3516 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier, 3517 "Record the sample identifier"), 3518 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, 3519 &record.opts.sample_time_set, 3520 "Record the sample timestamps"), 3521 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set, 3522 "Record the sample period"), 3523 OPT_BOOLEAN('n', 
"no-samples", &record.opts.no_samples, 3524 "don't sample"), 3525 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache, 3526 &record.no_buildid_cache_set, 3527 "do not update the buildid cache"), 3528 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid, 3529 &record.no_buildid_set, 3530 "do not collect buildids in perf.data"), 3531 OPT_CALLBACK('G', "cgroup", &record.evlist, "name", 3532 "monitor event in cgroup name only", 3533 parse_cgroups), 3534 OPT_CALLBACK('D', "delay", &record, "ms", 3535 "ms to wait before starting measurement after program start (-1: start with events disabled), " 3536 "or ranges of time to enable events e.g. '-D 10-20,30-40'", 3537 record__parse_event_enable_time), 3538 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"), 3539 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"), 3540 3541 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack, 3542 "branch any", "sample any taken branches", 3543 parse_branch_stack), 3544 3545 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack, 3546 "branch filter mask", "branch stack filter modes", 3547 parse_branch_stack), 3548 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight, 3549 "sample by weight (on special events only)"), 3550 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction, 3551 "sample transaction flags (special events only)"), 3552 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread, 3553 "use per-thread mmaps"), 3554 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register", 3555 "sample selected machine registers on interrupt," 3556 " use '-I?' to list register names", parse_intr_regs), 3557 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register", 3558 "sample selected machine registers in user space," 3559 " use '--user-regs=?' 
to list register names", parse_user_regs), 3560 OPT_BOOLEAN(0, "running-time", &record.opts.running_time, 3561 "Record running/enabled time of read (:S) events"), 3562 OPT_CALLBACK('k', "clockid", &record.opts, 3563 "clockid", "clockid to use for events, see clock_gettime()", 3564 parse_clockid), 3565 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts, 3566 "opts", "AUX area tracing Snapshot Mode", ""), 3567 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts, 3568 "opts", "sample AUX area", ""), 3569 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout, 3570 "per thread proc mmap processing timeout in ms"), 3571 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces, 3572 "Record namespaces events"), 3573 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup, 3574 "Record cgroup events"), 3575 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events, 3576 &record.opts.record_switch_events_set, 3577 "Record context switch events"), 3578 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel, 3579 "Configure all used events to run in kernel space.", 3580 PARSE_OPT_EXCLUSIVE), 3581 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user, 3582 "Configure all used events to run in user space.", 3583 PARSE_OPT_EXCLUSIVE), 3584 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains, 3585 "collect kernel callchains"), 3586 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains, 3587 "collect user callchains"), 3588 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name, 3589 "file", "vmlinux pathname"), 3590 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all, 3591 "Record build-id of all DSOs regardless of hits"), 3592 OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set, 3593 "Record build-id in mmap events and skip build-id processing."), 3594 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, 3595 "append timestamp to output filename"), 3596 
OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary, 3597 "Record timestamp boundary (time of first/last samples)"), 3598 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str, 3599 &record.switch_output.set, "signal or size[BKMG] or time[smhd]", 3600 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", 3601 "signal"), 3602 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args, 3603 &record.switch_output_event_set, "switch output event", 3604 "switch output event selector. use 'perf list' to list available events", 3605 parse_events_option_new_evlist), 3606 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, 3607 "Limit number of switch output generated files"), 3608 OPT_BOOLEAN(0, "dry-run", &dry_run, 3609 "Parse options then exit"), 3610 #ifdef HAVE_AIO_SUPPORT 3611 OPT_CALLBACK_OPTARG(0, "aio", &record.opts, 3612 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)", 3613 record__aio_parse), 3614 #endif 3615 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu", 3616 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer", 3617 record__parse_affinity), 3618 #ifdef HAVE_ZSTD_SUPPORT 3619 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n", 3620 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)", 3621 record__parse_comp_level), 3622 #endif 3623 OPT_CALLBACK(0, "max-size", &record.output_max_size, 3624 "size", "Limit the maximum size of the output file", parse_output_max_size), 3625 OPT_UINTEGER(0, "num-thread-synthesize", 3626 &record.opts.nr_threads_synthesize, 3627 "number of threads to run for event synthesis"), 3628 #ifdef HAVE_LIBPFM 3629 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event", 3630 "libpfm4 event selector. 
use 'perf list' to list available events", 3631 parse_libpfm_events_option), 3632 #endif 3633 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]", 3634 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n" 3635 "\t\t\t 'snapshot': AUX area tracing snapshot).\n" 3636 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" 3637 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", 3638 parse_control_option), 3639 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup", 3640 "Fine-tune event synthesis: default=all", parse_record_synth_option), 3641 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls, 3642 &record.debuginfod.set, "debuginfod urls", 3643 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls", 3644 "system"), 3645 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec", 3646 "write collected trace data into several data files using parallel threads", 3647 record__parse_threads), 3648 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"), 3649 OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin", 3650 "BPF filter action"), 3651 OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms", 3652 "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). 
(Default: 500ms)", 3653 record__parse_off_cpu_thresh), 3654 OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap, 3655 &record.opts.record_data_mmap_set, 3656 "Record mmap events for non-executable mappings"), 3657 OPT_END() 3658 }; 3659 3660 struct option *record_options = __record_options; 3661 3662 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus) 3663 { 3664 struct perf_cpu cpu; 3665 unsigned int idx; 3666 3667 if (cpu_map__is_dummy(cpus)) 3668 return 0; 3669 3670 perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) { 3671 /* Return ENODEV is input cpu is greater than max cpu */ 3672 if ((unsigned long)cpu.cpu > mask->nbits) 3673 return -ENODEV; 3674 __set_bit(cpu.cpu, mask->bits); 3675 } 3676 3677 return 0; 3678 } 3679 3680 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec) 3681 { 3682 struct perf_cpu_map *cpus; 3683 3684 cpus = perf_cpu_map__new(mask_spec); 3685 if (!cpus) 3686 return -ENOMEM; 3687 3688 bitmap_zero(mask->bits, mask->nbits); 3689 if (record__mmap_cpu_mask_init(mask, cpus)) 3690 return -ENODEV; 3691 3692 perf_cpu_map__put(cpus); 3693 3694 return 0; 3695 } 3696 3697 static void record__free_thread_masks(struct record *rec, int nr_threads) 3698 { 3699 int t; 3700 3701 if (rec->thread_masks) 3702 for (t = 0; t < nr_threads; t++) 3703 record__thread_mask_free(&rec->thread_masks[t]); 3704 3705 zfree(&rec->thread_masks); 3706 } 3707 3708 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits) 3709 { 3710 int t, ret; 3711 3712 rec->thread_masks = calloc(nr_threads, sizeof(*(rec->thread_masks))); 3713 if (!rec->thread_masks) { 3714 pr_err("Failed to allocate thread masks\n"); 3715 return -ENOMEM; 3716 } 3717 3718 for (t = 0; t < nr_threads; t++) { 3719 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits); 3720 if (ret) { 3721 pr_err("Failed to allocate thread masks[%d]\n", t); 3722 goto out_free; 3723 } 3724 } 3725 
3726 return 0; 3727 3728 out_free: 3729 record__free_thread_masks(rec, nr_threads); 3730 3731 return ret; 3732 } 3733 3734 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus) 3735 { 3736 int t, ret, nr_cpus = perf_cpu_map__nr(cpus); 3737 3738 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu); 3739 if (ret) 3740 return ret; 3741 3742 rec->nr_threads = nr_cpus; 3743 pr_debug("nr_threads: %d\n", rec->nr_threads); 3744 3745 for (t = 0; t < rec->nr_threads; t++) { 3746 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits); 3747 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits); 3748 if (verbose > 0) { 3749 pr_debug("thread_masks[%d]: ", t); 3750 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps"); 3751 pr_debug("thread_masks[%d]: ", t); 3752 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity"); 3753 } 3754 } 3755 3756 return 0; 3757 } 3758 3759 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus, 3760 const char **maps_spec, const char **affinity_spec, 3761 u32 nr_spec) 3762 { 3763 u32 s; 3764 int ret = 0, t = 0; 3765 struct mmap_cpu_mask cpus_mask; 3766 struct thread_mask thread_mask, full_mask, *thread_masks; 3767 3768 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu); 3769 if (ret) { 3770 pr_err("Failed to allocate CPUs mask\n"); 3771 return ret; 3772 } 3773 3774 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus); 3775 if (ret) { 3776 pr_err("Failed to init cpu mask\n"); 3777 goto out_free_cpu_mask; 3778 } 3779 3780 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu); 3781 if (ret) { 3782 pr_err("Failed to allocate full mask\n"); 3783 goto out_free_cpu_mask; 3784 } 3785 3786 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu); 3787 if (ret) { 3788 pr_err("Failed to allocate thread mask\n"); 3789 goto out_free_full_and_cpu_masks; 3790 } 3791 3792 for (s = 0; s < 
nr_spec; s++) { 3793 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]); 3794 if (ret) { 3795 pr_err("Failed to initialize maps thread mask\n"); 3796 goto out_free; 3797 } 3798 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]); 3799 if (ret) { 3800 pr_err("Failed to initialize affinity thread mask\n"); 3801 goto out_free; 3802 } 3803 3804 /* ignore invalid CPUs but do not allow empty masks */ 3805 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits, 3806 cpus_mask.bits, thread_mask.maps.nbits)) { 3807 pr_err("Empty maps mask: %s\n", maps_spec[s]); 3808 ret = -EINVAL; 3809 goto out_free; 3810 } 3811 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits, 3812 cpus_mask.bits, thread_mask.affinity.nbits)) { 3813 pr_err("Empty affinity mask: %s\n", affinity_spec[s]); 3814 ret = -EINVAL; 3815 goto out_free; 3816 } 3817 3818 /* do not allow intersection with other masks (full_mask) */ 3819 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits, 3820 thread_mask.maps.nbits)) { 3821 pr_err("Intersecting maps mask: %s\n", maps_spec[s]); 3822 ret = -EINVAL; 3823 goto out_free; 3824 } 3825 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits, 3826 thread_mask.affinity.nbits)) { 3827 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]); 3828 ret = -EINVAL; 3829 goto out_free; 3830 } 3831 3832 bitmap_or(full_mask.maps.bits, full_mask.maps.bits, 3833 thread_mask.maps.bits, full_mask.maps.nbits); 3834 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits, 3835 thread_mask.affinity.bits, full_mask.maps.nbits); 3836 3837 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask)); 3838 if (!thread_masks) { 3839 pr_err("Failed to reallocate thread masks\n"); 3840 ret = -ENOMEM; 3841 goto out_free; 3842 } 3843 rec->thread_masks = thread_masks; 3844 rec->thread_masks[t] = thread_mask; 3845 if (verbose > 0) { 3846 pr_debug("thread_masks[%d]: ", t); 3847 
mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps"); 3848 pr_debug("thread_masks[%d]: ", t); 3849 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity"); 3850 } 3851 t++; 3852 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu); 3853 if (ret) { 3854 pr_err("Failed to allocate thread mask\n"); 3855 goto out_free_full_and_cpu_masks; 3856 } 3857 } 3858 rec->nr_threads = t; 3859 pr_debug("nr_threads: %d\n", rec->nr_threads); 3860 if (!rec->nr_threads) 3861 ret = -EINVAL; 3862 3863 out_free: 3864 record__thread_mask_free(&thread_mask); 3865 out_free_full_and_cpu_masks: 3866 record__thread_mask_free(&full_mask); 3867 out_free_cpu_mask: 3868 record__mmap_cpu_mask_free(&cpus_mask); 3869 3870 return ret; 3871 } 3872 3873 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus) 3874 { 3875 int ret; 3876 struct cpu_topology *topo; 3877 3878 topo = cpu_topology__new(); 3879 if (!topo) { 3880 pr_err("Failed to allocate CPU topology\n"); 3881 return -ENOMEM; 3882 } 3883 3884 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list, 3885 topo->core_cpus_list, topo->core_cpus_lists); 3886 cpu_topology__delete(topo); 3887 3888 return ret; 3889 } 3890 3891 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus) 3892 { 3893 int ret; 3894 struct cpu_topology *topo; 3895 3896 topo = cpu_topology__new(); 3897 if (!topo) { 3898 pr_err("Failed to allocate CPU topology\n"); 3899 return -ENOMEM; 3900 } 3901 3902 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list, 3903 topo->package_cpus_list, topo->package_cpus_lists); 3904 cpu_topology__delete(topo); 3905 3906 return ret; 3907 } 3908 3909 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus) 3910 { 3911 u32 s; 3912 int ret; 3913 const char **spec; 3914 struct numa_topology *topo; 3915 3916 topo = numa_topology__new(); 3917 if (!topo) { 3918 pr_err("Failed to allocate 
NUMA topology\n"); 3919 return -ENOMEM; 3920 } 3921 3922 spec = calloc(topo->nr, sizeof(char *)); 3923 if (!spec) { 3924 pr_err("Failed to allocate NUMA spec\n"); 3925 ret = -ENOMEM; 3926 goto out_delete_topo; 3927 } 3928 for (s = 0; s < topo->nr; s++) 3929 spec[s] = topo->nodes[s].cpus; 3930 3931 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr); 3932 3933 zfree(&spec); 3934 3935 out_delete_topo: 3936 numa_topology__delete(topo); 3937 3938 return ret; 3939 } 3940 3941 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus) 3942 { 3943 int t, ret; 3944 u32 s, nr_spec = 0; 3945 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec; 3946 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL; 3947 3948 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) { 3949 spec = strtok_r(user_spec, ":", &spec_ptr); 3950 if (spec == NULL) 3951 break; 3952 pr_debug2("threads_spec[%d]: %s\n", t, spec); 3953 mask = strtok_r(spec, "/", &mask_ptr); 3954 if (mask == NULL) 3955 break; 3956 pr_debug2(" maps mask: %s\n", mask); 3957 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *)); 3958 if (!tmp_spec) { 3959 pr_err("Failed to reallocate maps spec\n"); 3960 ret = -ENOMEM; 3961 goto out_free; 3962 } 3963 maps_spec = tmp_spec; 3964 maps_spec[nr_spec] = dup_mask = strdup(mask); 3965 if (!maps_spec[nr_spec]) { 3966 pr_err("Failed to allocate maps spec[%d]\n", nr_spec); 3967 ret = -ENOMEM; 3968 goto out_free; 3969 } 3970 mask = strtok_r(NULL, "/", &mask_ptr); 3971 if (mask == NULL) { 3972 pr_err("Invalid thread maps or affinity specs\n"); 3973 ret = -EINVAL; 3974 goto out_free; 3975 } 3976 pr_debug2(" affinity mask: %s\n", mask); 3977 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *)); 3978 if (!tmp_spec) { 3979 pr_err("Failed to reallocate affinity spec\n"); 3980 ret = -ENOMEM; 3981 goto out_free; 3982 } 3983 affinity_spec = tmp_spec; 3984 
affinity_spec[nr_spec] = strdup(mask); 3985 if (!affinity_spec[nr_spec]) { 3986 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec); 3987 ret = -ENOMEM; 3988 goto out_free; 3989 } 3990 dup_mask = NULL; 3991 nr_spec++; 3992 } 3993 3994 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec, 3995 (const char **)affinity_spec, nr_spec); 3996 3997 out_free: 3998 free(dup_mask); 3999 for (s = 0; s < nr_spec; s++) { 4000 if (maps_spec) 4001 free(maps_spec[s]); 4002 if (affinity_spec) 4003 free(affinity_spec[s]); 4004 } 4005 free(affinity_spec); 4006 free(maps_spec); 4007 4008 return ret; 4009 } 4010 4011 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus) 4012 { 4013 int ret; 4014 4015 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu); 4016 if (ret) 4017 return ret; 4018 4019 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus)) 4020 return -ENODEV; 4021 4022 rec->nr_threads = 1; 4023 4024 return 0; 4025 } 4026 4027 static int record__init_thread_masks(struct record *rec) 4028 { 4029 int ret = 0; 4030 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus; 4031 4032 if (!record__threads_enabled(rec)) 4033 return record__init_thread_default_masks(rec, cpus); 4034 4035 if (evlist__per_thread(rec->evlist)) { 4036 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n"); 4037 return -EINVAL; 4038 } 4039 4040 switch (rec->opts.threads_spec) { 4041 case THREAD_SPEC__CPU: 4042 ret = record__init_thread_cpu_masks(rec, cpus); 4043 break; 4044 case THREAD_SPEC__CORE: 4045 ret = record__init_thread_core_masks(rec, cpus); 4046 break; 4047 case THREAD_SPEC__PACKAGE: 4048 ret = record__init_thread_package_masks(rec, cpus); 4049 break; 4050 case THREAD_SPEC__NUMA: 4051 ret = record__init_thread_numa_masks(rec, cpus); 4052 break; 4053 case THREAD_SPEC__USER: 4054 ret = record__init_thread_user_masks(rec, cpus); 4055 break; 4056 default: 4057 break; 4058 } 4059 4060 return ret; 
4061 } 4062 4063 int cmd_record(int argc, const char **argv) 4064 { 4065 int err; 4066 struct record *rec = &record; 4067 char errbuf[BUFSIZ]; 4068 4069 setlocale(LC_ALL, ""); 4070 4071 #ifndef HAVE_BPF_SKEL 4072 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c) 4073 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true); 4074 # undef set_nobuild 4075 #endif 4076 4077 /* Disable eager loading of kernel symbols that adds overhead to perf record. */ 4078 symbol_conf.lazy_load_kernel_maps = true; 4079 rec->opts.affinity = PERF_AFFINITY_SYS; 4080 4081 rec->evlist = evlist__new(); 4082 if (rec->evlist == NULL) 4083 return -ENOMEM; 4084 4085 err = perf_config(perf_record_config, rec); 4086 if (err) 4087 return err; 4088 4089 argc = parse_options(argc, argv, record_options, record_usage, 4090 PARSE_OPT_STOP_AT_NON_OPTION); 4091 if (quiet) 4092 perf_quiet_option(); 4093 4094 err = symbol__validate_sym_arguments(); 4095 if (err) 4096 return err; 4097 4098 perf_debuginfod_setup(&record.debuginfod); 4099 4100 /* 4101 * Use system wide (-a) for the default target (ie. when no 4102 * workload). User ID filtering also implies system-wide. 4103 */ 4104 if ((!argc && target__none(&rec->opts.target)) || rec->uid_str) 4105 rec->opts.target.system_wide = true; 4106 4107 if (nr_cgroups && !rec->opts.target.system_wide) { 4108 usage_with_options_msg(record_usage, record_options, 4109 "cgroup monitoring only available in system-wide mode"); 4110 4111 } 4112 4113 if (record.latency) { 4114 /* 4115 * There is no fundamental reason why latency profiling 4116 * can't work for system-wide mode, but exact semantics 4117 * and details are to be defined. 
4118 * See the following thread for details: 4119 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/ 4120 */ 4121 if (record.opts.target.system_wide) { 4122 pr_err("Failed: latency profiling is not supported with system-wide collection.\n"); 4123 err = -EINVAL; 4124 goto out_opts; 4125 } 4126 record.opts.record_switch_events = true; 4127 } 4128 4129 if (rec->buildid_mmap && !perf_can_record_build_id()) { 4130 pr_warning("Missing support for build id in kernel mmap events.\n" 4131 "Disable this warning with --no-buildid-mmap\n"); 4132 rec->buildid_mmap = false; 4133 } 4134 4135 if (rec->buildid_mmap) { 4136 /* Enable perf_event_attr::build_id bit. */ 4137 rec->opts.build_id = true; 4138 /* Disable build-ID table in the header. */ 4139 rec->no_buildid = true; 4140 } else { 4141 pr_debug("Disabling build id in synthesized mmap2 events.\n"); 4142 symbol_conf.no_buildid_mmap2 = true; 4143 } 4144 4145 if (rec->no_buildid_set && rec->no_buildid) { 4146 /* -B implies -N for historic reasons. 
*/ 4147 rec->no_buildid_cache = true; 4148 } 4149 4150 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) { 4151 pr_err("Kernel has no cgroup sampling support.\n"); 4152 err = -EINVAL; 4153 goto out_opts; 4154 } 4155 4156 if (rec->opts.kcore) 4157 rec->opts.text_poke = true; 4158 4159 if (rec->opts.kcore || record__threads_enabled(rec)) 4160 rec->data.is_dir = true; 4161 4162 if (record__threads_enabled(rec)) { 4163 if (rec->opts.affinity != PERF_AFFINITY_SYS) { 4164 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n"); 4165 goto out_opts; 4166 } 4167 if (record__aio_enabled(rec)) { 4168 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n"); 4169 goto out_opts; 4170 } 4171 } 4172 4173 if (rec->opts.comp_level != 0) { 4174 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n"); 4175 rec->no_buildid = true; 4176 } 4177 4178 if (rec->opts.record_switch_events && 4179 !perf_can_record_switch_events()) { 4180 ui__error("kernel does not support recording context switch events\n"); 4181 parse_options_usage(record_usage, record_options, "switch-events", 0); 4182 err = -EINVAL; 4183 goto out_opts; 4184 } 4185 4186 if (switch_output_setup(rec)) { 4187 parse_options_usage(record_usage, record_options, "switch-output", 0); 4188 err = -EINVAL; 4189 goto out_opts; 4190 } 4191 4192 if (rec->switch_output.time) { 4193 signal(SIGALRM, alarm_sig_handler); 4194 alarm(rec->switch_output.time); 4195 } 4196 4197 if (rec->switch_output.num_files) { 4198 rec->switch_output.filenames = calloc(rec->switch_output.num_files, 4199 sizeof(char *)); 4200 if (!rec->switch_output.filenames) { 4201 err = -EINVAL; 4202 goto out_opts; 4203 } 4204 } 4205 4206 if (rec->timestamp_filename && record__threads_enabled(rec)) { 4207 rec->timestamp_filename = false; 4208 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n"); 4209 } 4210 4211 if 
(rec->filter_action) { 4212 if (!strcmp(rec->filter_action, "pin")) 4213 err = perf_bpf_filter__pin(); 4214 else if (!strcmp(rec->filter_action, "unpin")) 4215 err = perf_bpf_filter__unpin(); 4216 else { 4217 pr_warning("Unknown BPF filter action: %s\n", rec->filter_action); 4218 err = -EINVAL; 4219 } 4220 goto out_opts; 4221 } 4222 4223 /* For backward compatibility, -d implies --mem-info and --data-mmap */ 4224 if (rec->opts.sample_address) { 4225 rec->opts.sample_data_src = true; 4226 if (!rec->opts.record_data_mmap_set) 4227 rec->opts.record_data_mmap = true; 4228 } 4229 4230 /* 4231 * Allow aliases to facilitate the lookup of symbols for address 4232 * filters. Refer to auxtrace_parse_filters(). 4233 */ 4234 symbol_conf.allow_aliases = true; 4235 4236 symbol__init(NULL); 4237 4238 err = record__auxtrace_init(rec); 4239 if (err) 4240 goto out; 4241 4242 if (dry_run) 4243 goto out; 4244 4245 err = -ENOMEM; 4246 4247 if (rec->no_buildid_cache) { 4248 disable_buildid_cache(); 4249 } else if (rec->switch_output.enabled) { 4250 /* 4251 * In 'perf record --switch-output', disable buildid 4252 * generation by default to reduce data file switching 4253 * overhead. 
Still generate buildid if they are required 4254 * explicitly using 4255 * 4256 * perf record --switch-output --no-no-buildid \ 4257 * --no-no-buildid-cache 4258 * 4259 * Following code equals to: 4260 * 4261 * if ((rec->no_buildid || !rec->no_buildid_set) && 4262 * (rec->no_buildid_cache || !rec->no_buildid_cache_set)) 4263 * disable_buildid_cache(); 4264 */ 4265 bool disable = true; 4266 4267 if (rec->no_buildid_set && !rec->no_buildid) 4268 disable = false; 4269 if (rec->no_buildid_cache_set && !rec->no_buildid_cache) 4270 disable = false; 4271 if (disable) { 4272 rec->no_buildid = true; 4273 rec->no_buildid_cache = true; 4274 disable_buildid_cache(); 4275 } 4276 } 4277 4278 if (record.opts.overwrite) 4279 record.opts.tail_synthesize = true; 4280 4281 if (rec->evlist->core.nr_entries == 0) { 4282 struct evlist *def_evlist = evlist__new_default(&rec->opts.target, 4283 callchain_param.enabled); 4284 4285 if (!def_evlist) 4286 goto out; 4287 4288 evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries); 4289 evlist__delete(def_evlist); 4290 } 4291 4292 if (rec->opts.target.tid && !rec->opts.no_inherit_set) 4293 rec->opts.no_inherit = true; 4294 4295 err = target__validate(&rec->opts.target); 4296 if (err) { 4297 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ); 4298 ui__warning("%s\n", errbuf); 4299 } 4300 4301 if (rec->uid_str) { 4302 uid_t uid = parse_uid(rec->uid_str); 4303 4304 if (uid == UINT_MAX) { 4305 ui__error("Invalid User: %s", rec->uid_str); 4306 err = -EINVAL; 4307 goto out; 4308 } 4309 err = parse_uid_filter(rec->evlist, uid); 4310 if (err) 4311 goto out; 4312 } 4313 4314 /* Enable ignoring missing threads when -p option is defined. 
*/ 4315 rec->opts.ignore_missing_thread = rec->opts.target.pid; 4316 4317 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list); 4318 4319 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP) { 4320 if (EM_HOST == EM_AARCH64) 4321 add_leaf_frame_caller_opts_aarch64(&rec->opts); 4322 } 4323 4324 err = -ENOMEM; 4325 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) { 4326 if (rec->opts.target.pid != NULL) { 4327 pr_err("Couldn't create thread/CPU maps: %s\n", 4328 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf))); 4329 goto out; 4330 } 4331 else 4332 usage_with_options(record_usage, record_options); 4333 } 4334 4335 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts); 4336 if (err) 4337 goto out; 4338 4339 /* 4340 * We take all buildids when the file contains 4341 * AUX area tracing data because we do not decode the 4342 * trace because it would take too long. 4343 */ 4344 if (rec->opts.full_auxtrace) 4345 rec->buildid_all = true; 4346 4347 if (rec->opts.text_poke) { 4348 err = record__config_text_poke(rec->evlist); 4349 if (err) { 4350 pr_err("record__config_text_poke failed, error %d\n", err); 4351 goto out; 4352 } 4353 } 4354 4355 if (rec->off_cpu) { 4356 err = record__config_off_cpu(rec); 4357 if (err) { 4358 pr_err("record__config_off_cpu failed, error %d\n", err); 4359 goto out; 4360 } 4361 } 4362 4363 if (record_opts__config(&rec->opts)) { 4364 err = -EINVAL; 4365 goto out; 4366 } 4367 4368 err = record__config_tracking_events(rec); 4369 if (err) { 4370 pr_err("record__config_tracking_events failed, error %d\n", err); 4371 goto out; 4372 } 4373 4374 err = record__init_thread_masks(rec); 4375 if (err) { 4376 pr_err("Failed to initialize parallel data streaming masks\n"); 4377 goto out; 4378 } 4379 4380 if (rec->opts.nr_cblocks > nr_cblocks_max) 4381 rec->opts.nr_cblocks = nr_cblocks_max; 4382 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks); 4383 4384 
pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]); 4385 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush); 4386 4387 if (rec->opts.comp_level > comp_level_max) 4388 rec->opts.comp_level = comp_level_max; 4389 pr_debug("comp level: %d\n", rec->opts.comp_level); 4390 4391 err = __cmd_record(&record, argc, argv); 4392 out: 4393 record__free_thread_masks(rec, rec->nr_threads); 4394 rec->nr_threads = 0; 4395 symbol__exit(); 4396 auxtrace_record__free(rec->itr); 4397 out_opts: 4398 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close); 4399 evlist__delete(rec->evlist); 4400 return err; 4401 } 4402 4403 static void snapshot_sig_handler(int sig __maybe_unused) 4404 { 4405 struct record *rec = &record; 4406 4407 hit_auxtrace_snapshot_trigger(rec); 4408 4409 if (switch_output_signal(rec)) 4410 trigger_hit(&switch_output_trigger); 4411 } 4412 4413 static void alarm_sig_handler(int sig __maybe_unused) 4414 { 4415 struct record *rec = &record; 4416 4417 if (switch_output_time(rec)) 4418 trigger_hit(&switch_output_trigger); 4419 } 4420