1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * builtin-record.c 4 * 5 * Builtin record command: Record the profile of a workload 6 * (or a CPU, or a PID) into the perf.data output file - for 7 * later analysis via perf report. 8 */ 9 #include "builtin.h" 10 11 #include "util/build-id.h" 12 #include <subcmd/parse-options.h> 13 #include <internal/xyarray.h> 14 #include "util/parse-events.h" 15 #include "util/config.h" 16 17 #include "util/callchain.h" 18 #include "util/cgroup.h" 19 #include "util/header.h" 20 #include "util/event.h" 21 #include "util/evlist.h" 22 #include "util/evsel.h" 23 #include "util/debug.h" 24 #include "util/mmap.h" 25 #include "util/mutex.h" 26 #include "util/target.h" 27 #include "util/session.h" 28 #include "util/tool.h" 29 #include "util/stat.h" 30 #include "util/symbol.h" 31 #include "util/record.h" 32 #include "util/cpumap.h" 33 #include "util/thread_map.h" 34 #include "util/data.h" 35 #include "util/perf_regs.h" 36 #include "util/auxtrace.h" 37 #include "util/tsc.h" 38 #include "util/parse-branch-options.h" 39 #include "util/parse-regs-options.h" 40 #include "util/perf_api_probe.h" 41 #include "util/trigger.h" 42 #include "util/perf-hooks.h" 43 #include "util/cpu-set-sched.h" 44 #include "util/synthetic-events.h" 45 #include "util/time-utils.h" 46 #include "util/units.h" 47 #include "util/bpf-event.h" 48 #include "util/util.h" 49 #include "util/pfm.h" 50 #include "util/pmu.h" 51 #include "util/pmus.h" 52 #include "util/clockid.h" 53 #include "util/off_cpu.h" 54 #include "util/bpf-filter.h" 55 #include "util/strbuf.h" 56 #include "asm/bug.h" 57 #include "perf.h" 58 #include "cputopo.h" 59 60 #include <errno.h> 61 #include <inttypes.h> 62 #include <locale.h> 63 #include <poll.h> 64 #include <pthread.h> 65 #include <unistd.h> 66 #ifndef HAVE_GETTID 67 #include <syscall.h> 68 #endif 69 #include <sched.h> 70 #include <signal.h> 71 #ifdef HAVE_EVENTFD_SUPPORT 72 #include <sys/eventfd.h> 73 #endif 74 #include <sys/mman.h> 75 #include <sys/wait.h> 76 #include <sys/types.h> 77 #include <sys/stat.h> 78 #include <fcntl.h> 79 #include <linux/err.h> 80 #include <linux/string.h> 81 #include <linux/time64.h> 82 #include <linux/zalloc.h> 83 #include <linux/bitmap.h> 84 #include <sys/time.h> 85 86 struct switch_output { 87 bool enabled; 88 bool signal; 89 unsigned long size; 90 unsigned long time; 91 const char *str; 92 bool set; 93 char **filenames; 94 int num_files; 95 int cur_file; 96 }; 97 98 struct thread_mask { 99 struct mmap_cpu_mask maps; 100 struct mmap_cpu_mask affinity; 101 }; 102 103 struct record_thread { 104 pid_t tid; 105 struct thread_mask *mask; 106 struct { 107 int msg[2]; 108 int ack[2]; 109 } pipes; 110 struct fdarray pollfd; 111 int ctlfd_pos; 112 int nr_mmaps; 113 struct mmap **maps; 114 struct mmap **overwrite_maps; 115 struct record *rec; 116 unsigned long long samples; 117 unsigned long waking; 118 u64 bytes_written; 119 u64 bytes_transferred; 120 u64 bytes_compressed; 121 }; 122 123 static __thread struct record_thread *thread; 124 125 enum thread_msg { 126 THREAD_MSG__UNDEFINED = 0, 127 THREAD_MSG__READY, 128 THREAD_MSG__MAX, 129 }; 130 131 static const char *thread_msg_tags[THREAD_MSG__MAX] = { 132 "UNDEFINED", "READY" 133 }; 134 135 enum thread_spec { 136 THREAD_SPEC__UNDEFINED = 0, 137 THREAD_SPEC__CPU, 138 THREAD_SPEC__CORE, 139 THREAD_SPEC__PACKAGE, 140 THREAD_SPEC__NUMA, 141 THREAD_SPEC__USER, 142 THREAD_SPEC__MAX, 143 }; 144 145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = { 146 "undefined", "cpu", "core", "package", "numa", "user" 147 }; 148 149 struct pollfd_index_map { 150 int evlist_pollfd_index; 151 int thread_pollfd_index; 152 }; 153 154 struct record { 155 struct perf_tool tool; 156 struct record_opts opts; 157 u64 bytes_written; 158 u64 thread_bytes_written; 159 struct perf_data data; 160 struct auxtrace_record *itr; 161 struct evlist *evlist; 162 struct perf_session *session; 163 struct evlist *sb_evlist; 164 pthread_t thread_id; 165 int realtime_prio; 166 bool latency; 167 bool switch_output_event_set; 168 bool no_buildid; 169 bool no_buildid_set; 170 bool no_buildid_cache; 171 bool no_buildid_cache_set; 172 bool buildid_all; 173 bool buildid_mmap; 174 bool buildid_mmap_set; 175 bool timestamp_filename; 176 bool timestamp_boundary; 177 bool off_cpu; 178 const char *filter_action; 179 const char *uid_str; 180 struct switch_output switch_output; 181 unsigned long long samples; 182 unsigned long output_max_size; /* = 0: unlimited */ 183 struct perf_debuginfod debuginfod; 184 int nr_threads; 185 struct thread_mask *thread_masks; 186 struct record_thread *thread_data; 187 struct pollfd_index_map *index_map; 188 size_t index_map_sz; 189 size_t index_map_cnt; 190 }; 191 192 static volatile int done; 193 194 static volatile int auxtrace_record__snapshot_started; 195 static DEFINE_TRIGGER(auxtrace_snapshot_trigger); 196 static DEFINE_TRIGGER(switch_output_trigger); 197 198 static const char *affinity_tags[PERF_AFFINITY_MAX] = { 199 "SYS", "NODE", "CPU" 200 }; 201 202 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, 203 struct perf_sample *sample, struct machine *machine); 204 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, 205 struct perf_sample *sample, struct machine *machine); 206 static int process_timestamp_boundary(const struct perf_tool *tool, 207 union perf_event *event, 208 struct perf_sample *sample, 209 struct machine *machine); 210 211 #ifndef HAVE_GETTID 212 static inline pid_t gettid(void) 213 { 214 return (pid_t)syscall(__NR_gettid); 215 } 216 #endif 217 218 static int record__threads_enabled(struct record *rec) 219 { 220 return rec->opts.threads_spec; 221 } 222 223 static bool switch_output_signal(struct record *rec) 224 { 225 return rec->switch_output.signal && 226 trigger_is_ready(&switch_output_trigger); 227 } 228 229 static bool switch_output_size(struct record *rec) 230 { 231 return rec->switch_output.size && 232 trigger_is_ready(&switch_output_trigger) && 233 (rec->bytes_written >= rec->switch_output.size); 234 } 235 236 static bool switch_output_time(struct record *rec) 237 { 238 return rec->switch_output.time && 239 trigger_is_ready(&switch_output_trigger); 240 } 241 242 static u64 record__bytes_written(struct record *rec) 243 { 244 return rec->bytes_written + rec->thread_bytes_written; 245 } 246 247 static bool record__output_max_size_exceeded(struct record *rec) 248 { 249 return rec->output_max_size && 250 (record__bytes_written(rec) >= rec->output_max_size); 251 } 252 253 static int record__write(struct record *rec, struct mmap *map __maybe_unused, 254 void *bf, size_t size) 255 { 256 struct perf_data_file *file = &rec->session->data->file; 257 258 if (map && map->file) 259 file = map->file; 260 261 if (perf_data_file__write(file, bf, size) < 0) { 262 pr_err("failed to write perf data, error: %m\n"); 263 return -1; 264 } 265 266 if (map && map->file) { 267 thread->bytes_written += size; 268 rec->thread_bytes_written += size; 269 } else { 270 rec->bytes_written += size; 271 } 272 273 if (record__output_max_size_exceeded(rec) && !done) { 274 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB)," 275 " stopping session ]\n", 276 record__bytes_written(rec) >> 10); 277 done = 1; 278 } 279 280 if (switch_output_size(rec)) 281 trigger_hit(&switch_output_trigger); 282 283 return 0; 284 } 285 286 static int record__aio_enabled(struct record *rec); 287 static int record__comp_enabled(struct record *rec); 288 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map, 289 void *dst, size_t dst_size, void *src, size_t src_size); 290 291 #ifdef HAVE_AIO_SUPPORT 292 static int record__aio_write(struct aiocb *cblock, int trace_fd, 293 void *buf, size_t size, off_t off) 294 { 295 int rc; 296 297 cblock->aio_fildes = trace_fd; 298 cblock->aio_buf = buf; 299 cblock->aio_nbytes = size; 300 cblock->aio_offset = off; 301 cblock->aio_sigevent.sigev_notify = SIGEV_NONE; 302 303 do { 304 rc = aio_write(cblock); 305 if (rc == 0) { 306 break; 307 } else if (errno != EAGAIN) { 308 cblock->aio_fildes = -1; 309 pr_err("failed to queue perf data, error: %m\n"); 310 break; 311 } 312 } while (1); 313 314 return rc; 315 } 316 317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock) 318 { 319 void *rem_buf; 320 off_t rem_off; 321 size_t rem_size; 322 int rc, aio_errno; 323 ssize_t aio_ret, written; 324 325 aio_errno = aio_error(cblock); 326 if (aio_errno == EINPROGRESS) 327 return 0; 328 329 written = aio_ret = aio_return(cblock); 330 if (aio_ret < 0) { 331 if (aio_errno != EINTR) 332 pr_err("failed to write perf data, error: %m\n"); 333 written = 0; 334 } 335 336 rem_size = cblock->aio_nbytes - written; 337 338 if (rem_size == 0) { 339 cblock->aio_fildes = -1; 340 /* 341 * md->refcount is incremented in record__aio_pushfn() for 342 * every aio write request started in record__aio_push() so 343 * decrement it because the request is now complete. 344 */ 345 perf_mmap__put(&md->core); 346 rc = 1; 347 } else { 348 /* 349 * aio write request may require restart with the 350 * remainder if the kernel didn't write whole 351 * chunk at once. 352 */ 353 rem_off = cblock->aio_offset + written; 354 rem_buf = (void *)(cblock->aio_buf + written); 355 record__aio_write(cblock, cblock->aio_fildes, 356 rem_buf, rem_size, rem_off); 357 rc = 0; 358 } 359 360 return rc; 361 } 362 363 static int record__aio_sync(struct mmap *md, bool sync_all) 364 { 365 struct aiocb **aiocb = md->aio.aiocb; 366 struct aiocb *cblocks = md->aio.cblocks; 367 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */ 368 int i, do_suspend; 369 370 do { 371 do_suspend = 0; 372 for (i = 0; i < md->aio.nr_cblocks; ++i) { 373 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) { 374 if (sync_all) 375 aiocb[i] = NULL; 376 else 377 return i; 378 } else { 379 /* 380 * Started aio write is not complete yet 381 * so it has to be waited before the 382 * next allocation. 383 */ 384 aiocb[i] = &cblocks[i]; 385 do_suspend = 1; 386 } 387 } 388 if (!do_suspend) 389 return -1; 390 391 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) { 392 if (!(errno == EAGAIN || errno == EINTR)) 393 pr_err("failed to sync perf data, error: %m\n"); 394 } 395 } while (1); 396 } 397 398 struct record_aio { 399 struct record *rec; 400 void *data; 401 size_t size; 402 }; 403 404 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size) 405 { 406 struct record_aio *aio = to; 407 408 /* 409 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer 410 * to release space in the kernel buffer as fast as possible, calling 411 * perf_mmap__consume() from perf_mmap__push() function. 412 * 413 * That lets the kernel to proceed with storing more profiling data into 414 * the kernel buffer earlier than other per-cpu kernel buffers are handled. 415 * 416 * Coping can be done in two steps in case the chunk of profiling data 417 * crosses the upper bound of the kernel buffer. In this case we first move 418 * part of data from map->start till the upper bound and then the remainder 419 * from the beginning of the kernel buffer till the end of the data chunk. 420 */ 421 422 if (record__comp_enabled(aio->rec)) { 423 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size, 424 mmap__mmap_len(map) - aio->size, 425 buf, size); 426 if (compressed < 0) 427 return (int)compressed; 428 429 size = compressed; 430 } else { 431 memcpy(aio->data + aio->size, buf, size); 432 } 433 434 if (!aio->size) { 435 /* 436 * Increment map->refcount to guard map->aio.data[] buffer 437 * from premature deallocation because map object can be 438 * released earlier than aio write request started on 439 * map->aio.data[] buffer is complete. 440 * 441 * perf_mmap__put() is done at record__aio_complete() 442 * after started aio request completion or at record__aio_push() 443 * if the request failed to start. 444 */ 445 perf_mmap__get(&map->core); 446 } 447 448 aio->size += size; 449 450 return size; 451 } 452 453 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off) 454 { 455 int ret, idx; 456 int trace_fd = rec->session->data->file.fd; 457 struct record_aio aio = { .rec = rec, .size = 0 }; 458 459 /* 460 * Call record__aio_sync() to wait till map->aio.data[] buffer 461 * becomes available after previous aio write operation. 462 */ 463 464 idx = record__aio_sync(map, false); 465 aio.data = map->aio.data[idx]; 466 ret = perf_mmap__push(map, &aio, record__aio_pushfn); 467 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */ 468 return ret; 469 470 rec->samples++; 471 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off); 472 if (!ret) { 473 *off += aio.size; 474 rec->bytes_written += aio.size; 475 if (switch_output_size(rec)) 476 trigger_hit(&switch_output_trigger); 477 } else { 478 /* 479 * Decrement map->refcount incremented in record__aio_pushfn() 480 * back if record__aio_write() operation failed to start, otherwise 481 * map->refcount is decremented in record__aio_complete() after 482 * aio write operation finishes successfully. 483 */ 484 perf_mmap__put(&map->core); 485 } 486 487 return ret; 488 } 489 490 static off_t record__aio_get_pos(int trace_fd) 491 { 492 return lseek(trace_fd, 0, SEEK_CUR); 493 } 494 495 static void record__aio_set_pos(int trace_fd, off_t pos) 496 { 497 lseek(trace_fd, pos, SEEK_SET); 498 } 499 500 static void record__aio_mmap_read_sync(struct record *rec) 501 { 502 int i; 503 struct evlist *evlist = rec->evlist; 504 struct mmap *maps = evlist->mmap; 505 506 if (!record__aio_enabled(rec)) 507 return; 508 509 for (i = 0; i < evlist->core.nr_mmaps; i++) { 510 struct mmap *map = &maps[i]; 511 512 if (map->core.base) 513 record__aio_sync(map, true); 514 } 515 } 516 517 static int nr_cblocks_default = 1; 518 static int nr_cblocks_max = 4; 519 520 static int record__aio_parse(const struct option *opt, 521 const char *str, 522 int unset) 523 { 524 struct record_opts *opts = (struct record_opts *)opt->value; 525 526 if (unset) { 527 opts->nr_cblocks = 0; 528 } else { 529 if (str) 530 opts->nr_cblocks = strtol(str, NULL, 0); 531 if (!opts->nr_cblocks) 532 opts->nr_cblocks = nr_cblocks_default; 533 } 534 535 return 0; 536 } 537 #else /* HAVE_AIO_SUPPORT */ 538 static int nr_cblocks_max = 0; 539 540 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused, 541 off_t *off __maybe_unused) 542 { 543 return -1; 544 } 545 546 static off_t record__aio_get_pos(int trace_fd __maybe_unused) 547 { 548 return -1; 549 } 550 551 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused) 552 { 553 } 554 555 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused) 556 { 557 } 558 #endif 559 560 static int record__aio_enabled(struct record *rec) 561 { 562 return rec->opts.nr_cblocks > 0; 563 } 564 565 #define MMAP_FLUSH_DEFAULT 1 566 static int record__mmap_flush_parse(const struct option *opt, 567 const char *str, 568 int unset) 569 { 570 int flush_max; 571 struct record_opts *opts = (struct record_opts *)opt->value; 572 static struct parse_tag tags[] = { 573 { .tag = 'B', .mult = 1 }, 574 { .tag = 'K', .mult = 1 << 10 }, 575 { .tag = 'M', .mult = 1 << 20 }, 576 { .tag = 'G', .mult = 1 << 30 }, 577 { .tag = 0 }, 578 }; 579 580 if (unset) 581 return 0; 582 583 if (str) { 584 opts->mmap_flush = parse_tag_value(str, tags); 585 if (opts->mmap_flush == (int)-1) 586 opts->mmap_flush = strtol(str, NULL, 0); 587 } 588 589 if (!opts->mmap_flush) 590 opts->mmap_flush = MMAP_FLUSH_DEFAULT; 591 592 flush_max = evlist__mmap_size(opts->mmap_pages); 593 flush_max /= 4; 594 if (opts->mmap_flush > flush_max) 595 opts->mmap_flush = flush_max; 596 597 return 0; 598 } 599 600 #ifdef HAVE_ZSTD_SUPPORT 601 static unsigned int comp_level_default = 1; 602 603 static int record__parse_comp_level(const struct option *opt, const char *str, int unset) 604 { 605 struct record_opts *opts = opt->value; 606 607 if (unset) { 608 opts->comp_level = 0; 609 } else { 610 if (str) 611 opts->comp_level = strtol(str, NULL, 0); 612 if (!opts->comp_level) 613 opts->comp_level = comp_level_default; 614 } 615 616 return 0; 617 } 618 #endif 619 static unsigned int comp_level_max = 22; 620 621 static int record__comp_enabled(struct record *rec) 622 { 623 return rec->opts.comp_level > 0; 624 } 625 626 static int process_synthesized_event(const struct perf_tool *tool, 627 union perf_event *event, 628 struct perf_sample *sample __maybe_unused, 629 struct machine *machine __maybe_unused) 630 { 631 struct record *rec = container_of(tool, struct record, tool); 632 return record__write(rec, NULL, event, event->header.size); 633 } 634 635 static struct mutex synth_lock; 636 637 static int process_locked_synthesized_event(const struct perf_tool *tool, 638 union perf_event *event, 639 struct perf_sample *sample __maybe_unused, 640 struct machine *machine __maybe_unused) 641 { 642 int ret; 643 644 mutex_lock(&synth_lock); 645 ret = process_synthesized_event(tool, event, sample, machine); 646 mutex_unlock(&synth_lock); 647 return ret; 648 } 649 650 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) 651 { 652 struct record *rec = to; 653 654 if (record__comp_enabled(rec)) { 655 struct perf_record_compressed2 *event = map->data; 656 size_t padding = 0; 657 u8 pad[8] = {0}; 658 ssize_t compressed = zstd_compress(rec->session, map, map->data, 659 mmap__mmap_len(map), bf, size); 660 661 if (compressed < 0) 662 return (int)compressed; 663 664 bf = event; 665 thread->samples++; 666 667 /* 668 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan 669 * error. We make it aligned here. 670 */ 671 event->data_size = compressed - sizeof(struct perf_record_compressed2); 672 event->header.size = PERF_ALIGN(compressed, sizeof(u64)); 673 padding = event->header.size - compressed; 674 return record__write(rec, map, bf, compressed) || 675 record__write(rec, map, &pad, padding); 676 } 677 678 thread->samples++; 679 return record__write(rec, map, bf, size); 680 } 681 682 static volatile sig_atomic_t signr = -1; 683 static volatile sig_atomic_t child_finished; 684 #ifdef HAVE_EVENTFD_SUPPORT 685 static volatile sig_atomic_t done_fd = -1; 686 #endif 687 688 static void sig_handler(int sig) 689 { 690 if (sig == SIGCHLD) 691 child_finished = 1; 692 else 693 signr = sig; 694 695 done = 1; 696 #ifdef HAVE_EVENTFD_SUPPORT 697 if (done_fd >= 0) { 698 u64 tmp = 1; 699 int orig_errno = errno; 700 701 /* 702 * It is possible for this signal handler to run after done is 703 * checked in the main loop, but before the perf counter fds are 704 * polled. If this happens, the poll() will continue to wait 705 * even though done is set, and will only break out if either 706 * another signal is received, or the counters are ready for 707 * read. To ensure the poll() doesn't sleep when done is set, 708 * use an eventfd (done_fd) to wake up the poll(). 709 */ 710 if (write(done_fd, &tmp, sizeof(tmp)) < 0) 711 pr_err("failed to signal wakeup fd, error: %m\n"); 712 713 errno = orig_errno; 714 } 715 #endif // HAVE_EVENTFD_SUPPORT 716 } 717 718 static void sigsegv_handler(int sig) 719 { 720 perf_hooks__recover(); 721 sighandler_dump_stack(sig); 722 } 723 724 static void record__sig_exit(void) 725 { 726 if (signr == -1) 727 return; 728 729 signal(signr, SIG_DFL); 730 raise(signr); 731 } 732 733 #ifdef HAVE_AUXTRACE_SUPPORT 734 735 static int record__process_auxtrace(const struct perf_tool *tool, 736 struct mmap *map, 737 union perf_event *event, void *data1, 738 size_t len1, void *data2, size_t len2) 739 { 740 struct record *rec = container_of(tool, struct record, tool); 741 struct perf_data *data = &rec->data; 742 size_t padding; 743 u8 pad[8] = {0}; 744 745 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) { 746 off_t file_offset; 747 int fd = perf_data__fd(data); 748 int err; 749 750 file_offset = lseek(fd, 0, SEEK_CUR); 751 if (file_offset == -1) 752 return -1; 753 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index, 754 event, file_offset); 755 if (err) 756 return err; 757 } 758 759 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */ 760 padding = (len1 + len2) & 7; 761 if (padding) 762 padding = 8 - padding; 763 764 record__write(rec, map, event, event->header.size); 765 record__write(rec, map, data1, len1); 766 if (len2) 767 record__write(rec, map, data2, len2); 768 record__write(rec, map, &pad, padding); 769 770 return 0; 771 } 772 773 static int record__auxtrace_mmap_read(struct record *rec, 774 struct mmap *map) 775 { 776 int ret; 777 778 ret = auxtrace_mmap__read(map, rec->itr, 779 perf_session__env(rec->session), 780 &rec->tool, 781 record__process_auxtrace); 782 if (ret < 0) 783 return ret; 784 785 if (ret) 786 rec->samples++; 787 788 return 0; 789 } 790 791 static int record__auxtrace_mmap_read_snapshot(struct record *rec, 792 struct mmap *map) 793 { 794 int ret; 795 796 ret = auxtrace_mmap__read_snapshot(map, rec->itr, 797 perf_session__env(rec->session), 798 &rec->tool, 799 record__process_auxtrace, 800 rec->opts.auxtrace_snapshot_size); 801 if (ret < 0) 802 return ret; 803 804 if (ret) 805 rec->samples++; 806 807 return 0; 808 } 809 810 static int record__auxtrace_read_snapshot_all(struct record *rec) 811 { 812 int i; 813 int rc = 0; 814 815 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) { 816 struct mmap *map = &rec->evlist->mmap[i]; 817 818 if (!map->auxtrace_mmap.base) 819 continue; 820 821 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) { 822 rc = -1; 823 goto out; 824 } 825 } 826 out: 827 return rc; 828 } 829 830 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit) 831 { 832 pr_debug("Recording AUX area tracing snapshot\n"); 833 if (record__auxtrace_read_snapshot_all(rec) < 0) { 834 trigger_error(&auxtrace_snapshot_trigger); 835 } else { 836 if (auxtrace_record__snapshot_finish(rec->itr, on_exit)) 837 trigger_error(&auxtrace_snapshot_trigger); 838 else 839 trigger_ready(&auxtrace_snapshot_trigger); 840 } 841 } 842 843 static int record__auxtrace_snapshot_exit(struct record *rec) 844 { 845 if (trigger_is_error(&auxtrace_snapshot_trigger)) 846 return 0; 847 848 if (!auxtrace_record__snapshot_started && 849 auxtrace_record__snapshot_start(rec->itr)) 850 return -1; 851 852 record__read_auxtrace_snapshot(rec, true); 853 if (trigger_is_error(&auxtrace_snapshot_trigger)) 854 return -1; 855 856 return 0; 857 } 858 859 static int record__auxtrace_init(struct record *rec) 860 { 861 int err; 862 863 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts) 864 && record__threads_enabled(rec)) { 865 pr_err("AUX area tracing options are not available in parallel streaming mode.\n"); 866 return -EINVAL; 867 } 868 869 if (!rec->itr) { 870 rec->itr = auxtrace_record__init(rec->evlist, &err); 871 if (err) 872 return err; 873 } 874 875 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts, 876 rec->opts.auxtrace_snapshot_opts); 877 if (err) 878 return err; 879 880 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts, 881 rec->opts.auxtrace_sample_opts); 882 if (err) 883 return err; 884 885 err = auxtrace_parse_aux_action(rec->evlist); 886 if (err) 887 return err; 888 889 return auxtrace_parse_filters(rec->evlist); 890 } 891 892 #else 893 894 static inline 895 int record__auxtrace_mmap_read(struct record *rec __maybe_unused, 896 struct mmap *map __maybe_unused) 897 { 898 return 0; 899 } 900 901 static inline 902 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused, 903 bool on_exit __maybe_unused) 904 { 905 } 906 907 static inline 908 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused) 909 { 910 return 0; 911 } 912 913 static inline 914 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused) 915 { 916 return 0; 917 } 918 919 static int record__auxtrace_init(struct record *rec __maybe_unused) 920 { 921 return 0; 922 } 923 924 #endif 925 926 static int record__config_text_poke(struct evlist *evlist) 927 { 928 struct evsel *evsel; 929 930 /* Nothing to do if text poke is already configured */ 931 evlist__for_each_entry(evlist, evsel) { 932 if (evsel->core.attr.text_poke) 933 return 0; 934 } 935 936 evsel = evlist__add_dummy_on_all_cpus(evlist); 937 if (!evsel) 938 return -ENOMEM; 939 940 evsel->core.attr.text_poke = 1; 941 evsel->core.attr.ksymbol = 1; 942 evsel->immediate = true; 943 evsel__set_sample_bit(evsel, TIME); 944 945 return 0; 946 } 947 948 static int record__config_off_cpu(struct record *rec) 949 { 950 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts); 951 } 952 953 static bool record__tracking_system_wide(struct record *rec) 954 { 955 struct evlist *evlist = rec->evlist; 956 struct evsel *evsel; 957 958 /* 959 * If non-dummy evsel exists, system_wide sideband is need to 960 * help parse sample information. 961 * For example, PERF_EVENT_MMAP event to help parse symbol, 962 * and PERF_EVENT_COMM event to help parse task executable name. 963 */ 964 evlist__for_each_entry(evlist, evsel) { 965 if (!evsel__is_dummy_event(evsel)) 966 return true; 967 } 968 969 return false; 970 } 971 972 static int record__config_tracking_events(struct record *rec) 973 { 974 struct record_opts *opts = &rec->opts; 975 struct evlist *evlist = rec->evlist; 976 bool system_wide = false; 977 struct evsel *evsel; 978 979 /* 980 * For initial_delay, system wide or a hybrid system, we need to add 981 * tracking event so that we can track PERF_RECORD_MMAP to cover the 982 * delay of waiting or event synthesis. 983 */ 984 if (opts->target.initial_delay || target__has_cpu(&opts->target) || 985 perf_pmus__num_core_pmus() > 1) { 986 /* 987 * User space tasks can migrate between CPUs, so when tracing 988 * selected CPUs, sideband for all CPUs is still needed. 989 */ 990 if (!!opts->target.cpu_list && record__tracking_system_wide(rec)) 991 system_wide = true; 992 993 evsel = evlist__findnew_tracking_event(evlist, system_wide); 994 if (!evsel) 995 return -ENOMEM; 996 997 /* 998 * Enable the tracking event when the process is forked for 999 * initial_delay, immediately for system wide. 1000 */ 1001 if (opts->target.initial_delay && !evsel->immediate && 1002 !target__has_cpu(&opts->target)) 1003 evsel->core.attr.enable_on_exec = 1; 1004 else 1005 evsel->immediate = 1; 1006 } 1007 1008 return 0; 1009 } 1010 1011 static bool record__kcore_readable(struct machine *machine) 1012 { 1013 char kcore[PATH_MAX]; 1014 int fd; 1015 1016 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir); 1017 1018 fd = open(kcore, O_RDONLY); 1019 if (fd < 0) 1020 return false; 1021 1022 close(fd); 1023 1024 return true; 1025 } 1026 1027 static int record__kcore_copy(struct machine *machine, struct perf_data *data) 1028 { 1029 char from_dir[PATH_MAX]; 1030 char kcore_dir[PATH_MAX]; 1031 int ret; 1032 1033 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir); 1034 1035 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir)); 1036 if (ret) 1037 return ret; 1038 1039 return kcore_copy(from_dir, kcore_dir); 1040 } 1041 1042 static void record__thread_data_init_pipes(struct record_thread *thread_data) 1043 { 1044 thread_data->pipes.msg[0] = -1; 1045 thread_data->pipes.msg[1] = -1; 1046 thread_data->pipes.ack[0] = -1; 1047 thread_data->pipes.ack[1] = -1; 1048 } 1049 1050 static int record__thread_data_open_pipes(struct record_thread *thread_data) 1051 { 1052 if (pipe(thread_data->pipes.msg)) 1053 return -EINVAL; 1054 1055 if (pipe(thread_data->pipes.ack)) { 1056 close(thread_data->pipes.msg[0]); 1057 thread_data->pipes.msg[0] = -1; 1058 close(thread_data->pipes.msg[1]); 1059 thread_data->pipes.msg[1] = -1; 1060 return -EINVAL; 1061 } 1062 1063 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data, 1064 thread_data->pipes.msg[0], thread_data->pipes.msg[1], 1065 thread_data->pipes.ack[0], thread_data->pipes.ack[1]); 1066 1067 return 0; 1068 } 1069 1070 static void record__thread_data_close_pipes(struct record_thread *thread_data) 1071 { 1072 if (thread_data->pipes.msg[0] != -1) { 1073 close(thread_data->pipes.msg[0]); 1074 thread_data->pipes.msg[0] = -1; 1075 } 1076 if (thread_data->pipes.msg[1] != -1) { 1077 close(thread_data->pipes.msg[1]); 1078 thread_data->pipes.msg[1] = -1; 1079 } 1080 if (thread_data->pipes.ack[0] != -1) { 1081 close(thread_data->pipes.ack[0]); 1082 thread_data->pipes.ack[0] = -1; 1083 } 1084 if (thread_data->pipes.ack[1] != -1) { 1085 close(thread_data->pipes.ack[1]); 1086 thread_data->pipes.ack[1] = -1; 1087 } 1088 } 1089 1090 static bool evlist__per_thread(struct evlist *evlist) 1091 { 1092 return cpu_map__is_dummy(evlist->core.user_requested_cpus); 1093 } 1094 1095 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist) 1096 { 1097 int m, tm, nr_mmaps = evlist->core.nr_mmaps; 1098 struct mmap *mmap = evlist->mmap; 1099 struct mmap *overwrite_mmap = evlist->overwrite_mmap; 1100 struct perf_cpu_map *cpus = evlist->core.all_cpus; 1101 bool per_thread = evlist__per_thread(evlist); 1102 1103 if (per_thread) 1104 thread_data->nr_mmaps = nr_mmaps; 1105 else 1106 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits, 1107 thread_data->mask->maps.nbits); 1108 if (mmap) { 1109 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *)); 1110 if (!thread_data->maps) 1111 return -ENOMEM; 1112 } 1113 if (overwrite_mmap) { 1114 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *)); 1115 if (!thread_data->overwrite_maps) { 1116 zfree(&thread_data->maps); 1117 return -ENOMEM; 1118 } 1119 } 1120 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data, 1121 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps); 1122 1123 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) { 1124 if (per_thread || 1125 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) { 1126 if (thread_data->maps) { 1127 thread_data->maps[tm] = &mmap[m]; 1128 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n", 1129 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m); 1130 } 1131 if (thread_data->overwrite_maps) { 1132 thread_data->overwrite_maps[tm] = &overwrite_mmap[m]; 1133 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n", 1134 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m); 1135 } 1136 tm++; 1137 } 1138 } 1139 1140 return 0; 1141 } 1142 1143 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist) 1144 { 1145 int f, tm, pos; 1146 struct mmap *map, *overwrite_map; 1147 1148 fdarray__init(&thread_data->pollfd, 64); 1149 1150 for (tm = 0; tm < thread_data->nr_mmaps; tm++) { 1151 map = thread_data->maps ? thread_data->maps[tm] : NULL; 1152 overwrite_map = thread_data->overwrite_maps ? 1153 thread_data->overwrite_maps[tm] : NULL; 1154 1155 for (f = 0; f < evlist->core.pollfd.nr; f++) { 1156 void *ptr = evlist->core.pollfd.priv[f].ptr; 1157 1158 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) { 1159 pos = fdarray__dup_entry_from(&thread_data->pollfd, f, 1160 &evlist->core.pollfd); 1161 if (pos < 0) 1162 return pos; 1163 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n", 1164 thread_data, pos, evlist->core.pollfd.entries[f].fd); 1165 } 1166 } 1167 } 1168 1169 return 0; 1170 } 1171 1172 static void record__free_thread_data(struct record *rec) 1173 { 1174 int t; 1175 struct record_thread *thread_data = rec->thread_data; 1176 1177 if (thread_data == NULL) 1178 return; 1179 1180 for (t = 0; t < rec->nr_threads; t++) { 1181 record__thread_data_close_pipes(&thread_data[t]); 1182 zfree(&thread_data[t].maps); 1183 zfree(&thread_data[t].overwrite_maps); 1184 fdarray__exit(&thread_data[t].pollfd); 1185 } 1186 1187 zfree(&rec->thread_data); 1188 } 1189 1190 static int record__map_thread_evlist_pollfd_indexes(struct record *rec, 1191 int evlist_pollfd_index, 1192 int thread_pollfd_index) 1193 { 1194 size_t x = rec->index_map_cnt; 1195 1196 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL)) 1197 return -ENOMEM; 1198 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index; 1199 rec->index_map[x].thread_pollfd_index = thread_pollfd_index; 1200 rec->index_map_cnt += 1; 1201 return 0; 1202 } 1203 1204 static int record__update_evlist_pollfd_from_thread(struct record *rec, 1205 struct evlist *evlist, 1206 struct record_thread *thread_data) 1207 { 1208 struct pollfd *e_entries = evlist->core.pollfd.entries; 1209 struct pollfd *t_entries = thread_data->pollfd.entries; 1210 int err = 0; 1211 size_t i; 1212 1213 for (i = 0; i < rec->index_map_cnt; i++) { 1214 int e_pos = rec->index_map[i].evlist_pollfd_index; 1215 int t_pos = rec->index_map[i].thread_pollfd_index; 1216 1217 if (e_entries[e_pos].fd != t_entries[t_pos].fd || 1218 e_entries[e_pos].events != t_entries[t_pos].events) { 1219 pr_err("Thread and evlist pollfd index mismatch\n"); 1220 err = -EINVAL; 1221 continue; 1222 } 1223 e_entries[e_pos].revents = t_entries[t_pos].revents; 1224 } 1225 return err; 1226 } 1227 1228 static int record__dup_non_perf_events(struct record *rec, 1229 struct evlist *evlist, 1230 struct record_thread *thread_data) 1231 { 1232 struct fdarray *fda = &evlist->core.pollfd; 1233 int i, ret; 1234 1235 for (i = 0; i < fda->nr; i++) { 1236 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event)) 1237 continue; 1238 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda); 1239 if (ret < 0) { 1240 pr_err("Failed to duplicate descriptor in main thread pollfd\n"); 1241 return ret; 1242 } 1243 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n", 1244 thread_data, ret, fda->entries[i].fd); 1245 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret); 1246 if (ret < 0) { 1247 pr_err("Failed to map thread and evlist pollfd indexes\n"); 1248 return ret; 1249 } 1250 } 1251 return 0; 1252 } 1253 1254 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist) 1255 { 1256 int t, ret; 1257 struct record_thread *thread_data; 1258 1259 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data))); 1260 if (!rec->thread_data) { 1261 pr_err("Failed to allocate thread data\n"); 1262 return -ENOMEM; 1263 } 1264 thread_data = rec->thread_data; 1265 1266 for (t = 0; t < rec->nr_threads; t++) 1267 record__thread_data_init_pipes(&thread_data[t]); 1268 1269 for (t = 0; t < rec->nr_threads; t++) { 1270 thread_data[t].rec = rec; 1271 thread_data[t].mask = &rec->thread_masks[t]; 1272 ret = record__thread_data_init_maps(&thread_data[t], evlist); 1273 if (ret) { 1274 pr_err("Failed to initialize thread[%d] maps\n", t); 1275 goto out_free; 1276 } 1277 ret = record__thread_data_init_pollfd(&thread_data[t], evlist); 1278 if (ret) { 1279 pr_err("Failed to initialize thread[%d] pollfd\n", t); 1280 goto out_free; 1281 } 1282 if (t) { 1283 thread_data[t].tid = -1; 1284 ret = record__thread_data_open_pipes(&thread_data[t]); 1285 if (ret) { 1286 pr_err("Failed to open thread[%d] communication pipes\n", t); 1287 goto out_free; 1288 } 1289 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0], 1290 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable); 1291 if (ret < 0) { 1292 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t); 1293 goto out_free; 1294 } 1295 thread_data[t].ctlfd_pos = ret; 1296 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n", 1297 thread_data, thread_data[t].ctlfd_pos, 1298 thread_data[t].pipes.msg[0]); 1299 } else { 1300 thread_data[t].tid = gettid(); 1301 1302 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]); 1303 if (ret < 0) 1304 goto out_free; 1305 1306 thread_data[t].ctlfd_pos = -1; /* Not used */ 1307 } 1308 } 1309 1310 return 0; 1311 1312 out_free: 1313 record__free_thread_data(rec); 1314 1315 return ret; 1316 } 1317 1318 static int record__mmap_evlist(struct record *rec, 1319 struct evlist *evlist) 1320 { 1321 int i, ret; 1322 struct record_opts *opts = &rec->opts; 1323 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode || 1324 opts->auxtrace_sample_mode; 1325 char msg[512]; 1326 1327 if (opts->affinity != PERF_AFFINITY_SYS) 1328 cpu__setup_cpunode_map(); 1329 1330 if (evlist__mmap_ex(evlist, opts->mmap_pages, 1331 opts->auxtrace_mmap_pages, 1332 auxtrace_overwrite, 1333 opts->nr_cblocks, opts->affinity, 1334 opts->mmap_flush, opts->comp_level) < 0) { 1335 if (errno == EPERM) { 1336 pr_err("Permission error mapping pages.\n" 1337 "Consider increasing " 1338 "/proc/sys/kernel/perf_event_mlock_kb,\n" 1339 "or try again with a smaller value of -m/--mmap_pages.\n" 1340 "(current value: %u,%u)\n", 1341 opts->mmap_pages, opts->auxtrace_mmap_pages); 1342 return -errno; 1343 } else { 1344 pr_err("failed to mmap with %d (%s)\n", errno, 1345 str_error_r(errno, msg, sizeof(msg))); 1346 if (errno) 1347 return -errno; 1348 else 1349 return -EINVAL; 1350 } 1351 } 1352 1353 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack)) 1354 return -1; 1355 1356 ret = record__alloc_thread_data(rec, evlist); 1357 if (ret) 1358 return ret; 1359 1360 if (record__threads_enabled(rec)) { 1361 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps); 1362 if (ret) { 1363 pr_err("Failed to create data directory: %s\n", strerror(-ret)); 1364 return ret; 1365 } 1366 for (i = 0; i < evlist->core.nr_mmaps; i++) { 1367 if (evlist->mmap) 1368 evlist->mmap[i].file = &rec->data.dir.files[i]; 1369 if (evlist->overwrite_mmap) 1370 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i]; 1371 } 1372 } 1373 1374 return 0; 1375 } 1376 1377 static int record__mmap(struct record *rec) 1378 { 1379 return record__mmap_evlist(rec, rec->evlist); 1380 } 1381 1382 static int record__open(struct record *rec) 1383 { 1384 char msg[BUFSIZ]; 1385 struct evsel *pos; 1386 struct evlist *evlist = rec->evlist; 1387 struct perf_session *session = rec->session; 1388 struct record_opts *opts = &rec->opts; 1389 int rc = 0; 1390 bool skipped = false; 1391 bool removed_tracking = false; 1392 1393 evlist__for_each_entry(evlist, pos) { 1394 if (removed_tracking) { 1395 /* 1396 * Normally the head of the list has tracking enabled 1397 * for sideband data like mmaps. If this event is 1398 * removed, make sure to add tracking to the next 1399 * processed event. 1400 */ 1401 if (!pos->tracking) { 1402 pos->tracking = true; 1403 evsel__config(pos, opts, &callchain_param); 1404 } 1405 removed_tracking = false; 1406 } 1407 try_again: 1408 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { 1409 bool report_error = true; 1410 1411 if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) { 1412 if (verbose > 0) 1413 ui__warning("%s\n", msg); 1414 goto try_again; 1415 } 1416 if ((errno == EINVAL || errno == EBADF) && 1417 pos->core.leader != &pos->core && 1418 pos->weak_group) { 1419 pos = evlist__reset_weak_group(evlist, pos, true); 1420 goto try_again; 1421 } 1422 #if defined(__aarch64__) || defined(__arm__) 1423 if (strstr(evsel__name(pos), "cycles")) { 1424 struct evsel *pos2; 1425 /* 1426 * Unfortunately ARM has many events named 1427 * "cycles" on PMUs like the system-level (L3) 1428 * cache which don't support sampling. Only 1429 * display such failures to open when there is 1430 * only 1 cycles event or verbose is enabled. 1431 */ 1432 evlist__for_each_entry(evlist, pos2) { 1433 if (pos2 == pos) 1434 continue; 1435 if (strstr(evsel__name(pos2), "cycles")) { 1436 report_error = false; 1437 break; 1438 } 1439 } 1440 } 1441 #endif 1442 if (report_error || verbose > 0) { 1443 ui__error("Failure to open event '%s' on PMU '%s' which will be " 1444 "removed.\n%s\n", 1445 evsel__name(pos), evsel__pmu_name(pos), msg); 1446 } 1447 if (pos->tracking) 1448 removed_tracking = true; 1449 pos->skippable = true; 1450 skipped = true; 1451 } 1452 } 1453 1454 if (skipped) { 1455 struct evsel *tmp; 1456 int idx = 0; 1457 bool evlist_empty = true; 1458 1459 /* Remove evsels that failed to open and update indices. */ 1460 evlist__for_each_entry_safe(evlist, tmp, pos) { 1461 if (pos->skippable) { 1462 evlist__remove(evlist, pos); 1463 continue; 1464 } 1465 1466 /* 1467 * Note, dummy events may be command line parsed or 1468 * added by the tool. We care about supporting `perf 1469 * record -e dummy` which may be used as a permission 1470 * check. Dummy events that are added to the command 1471 * line and opened along with other events that fail, 1472 * will still fail as if the dummy events were tool 1473 * added events for the sake of code simplicity. 1474 */ 1475 if (!evsel__is_dummy_event(pos)) 1476 evlist_empty = false; 1477 } 1478 evlist__for_each_entry(evlist, pos) { 1479 pos->core.idx = idx++; 1480 } 1481 /* If list is empty then fail. */ 1482 if (evlist_empty) { 1483 ui__error("Failure to open any events for recording.\n"); 1484 rc = -1; 1485 goto out; 1486 } 1487 } 1488 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) { 1489 pr_warning( 1490 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n" 1491 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" 1492 "Samples in kernel functions may not be resolved if a suitable vmlinux\n" 1493 "file is not found in the buildid cache or in the vmlinux path.\n\n" 1494 "Samples in kernel modules won't be resolved at all.\n\n" 1495 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n" 1496 "even with a suitable vmlinux or kallsyms file.\n\n"); 1497 } 1498 1499 if (evlist__apply_filters(evlist, &pos, &opts->target)) { 1500 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", 1501 pos->filter ?: "BPF", evsel__name(pos), errno, 1502 str_error_r(errno, msg, sizeof(msg))); 1503 rc = -1; 1504 goto out; 1505 } 1506 1507 rc = record__mmap(rec); 1508 if (rc) 1509 goto out; 1510 1511 session->evlist = evlist; 1512 perf_session__set_id_hdr_size(session); 1513 out: 1514 return rc; 1515 } 1516 1517 static void set_timestamp_boundary(struct record *rec, u64 sample_time) 1518 { 1519 if (rec->evlist->first_sample_time == 0) 1520 rec->evlist->first_sample_time = sample_time; 1521 1522 if (sample_time) 1523 rec->evlist->last_sample_time = sample_time; 1524 } 1525 1526 static int process_sample_event(const struct perf_tool *tool, 1527 union perf_event *event, 1528 struct perf_sample *sample, 1529 struct evsel *evsel, 1530 struct machine *machine) 1531 { 1532 struct record *rec = container_of(tool, struct record, tool); 1533 1534 set_timestamp_boundary(rec, sample->time); 1535 1536 if (rec->buildid_all) 1537 return 0; 1538 1539 rec->samples++; 1540 return build_id__mark_dso_hit(tool, event, sample, evsel, machine); 1541 } 1542 1543 static int process_buildids(struct record *rec) 1544 { 1545 struct perf_session *session = rec->session; 1546 1547 if (perf_data__size(&rec->data) == 0) 1548 return 0; 1549 1550 /* 1551 * During this process, it'll load kernel map and replace the 1552 * dso->long_name to a real pathname it found. In this case 1553 * we prefer the vmlinux path like 1554 * /lib/modules/3.16.4/build/vmlinux 1555 * 1556 * rather than build-id path (in debug directory). 1557 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551 1558 */ 1559 symbol_conf.ignore_vmlinux_buildid = true; 1560 1561 /* 1562 * If --buildid-all is given, it marks all DSO regardless of hits, 1563 * so no need to process samples. But if timestamp_boundary is enabled, 1564 * it still needs to walk on all samples to get the timestamps of 1565 * first/last samples. 1566 */ 1567 if (rec->buildid_all && !rec->timestamp_boundary) 1568 rec->tool.sample = process_event_sample_stub; 1569 1570 return perf_session__process_events(session); 1571 } 1572 1573 static void perf_event__synthesize_guest_os(struct machine *machine, void *data) 1574 { 1575 int err; 1576 struct perf_tool *tool = data; 1577 /* 1578 *As for guest kernel when processing subcommand record&report, 1579 *we arrange module mmap prior to guest kernel mmap and trigger 1580 *a preload dso because default guest module symbols are loaded 1581 *from guest kallsyms instead of /lib/modules/XXX/XXX. This 1582 *method is used to avoid symbol missing when the first addr is 1583 *in module instead of in guest kernel. 1584 */ 1585 err = perf_event__synthesize_modules(tool, process_synthesized_event, 1586 machine); 1587 if (err < 0) 1588 pr_err("Couldn't record guest kernel [%d]'s reference" 1589 " relocation symbol.\n", machine->pid); 1590 1591 /* 1592 * We use _stext for guest kernel because guest kernel's /proc/kallsyms 1593 * have no _text sometimes. 1594 */ 1595 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, 1596 machine); 1597 if (err < 0) 1598 pr_err("Couldn't record guest kernel [%d]'s reference" 1599 " relocation symbol.\n", machine->pid); 1600 } 1601 1602 static struct perf_event_header finished_round_event = { 1603 .size = sizeof(struct perf_event_header), 1604 .type = PERF_RECORD_FINISHED_ROUND, 1605 }; 1606 1607 static struct perf_event_header finished_init_event = { 1608 .size = sizeof(struct perf_event_header), 1609 .type = PERF_RECORD_FINISHED_INIT, 1610 }; 1611 1612 static void record__adjust_affinity(struct record *rec, struct mmap *map) 1613 { 1614 if (rec->opts.affinity != PERF_AFFINITY_SYS && 1615 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits, 1616 thread->mask->affinity.nbits)) { 1617 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits); 1618 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits, 1619 map->affinity_mask.bits, thread->mask->affinity.nbits); 1620 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity), 1621 (cpu_set_t *)thread->mask->affinity.bits); 1622 if (verbose == 2) { 1623 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu()); 1624 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity"); 1625 } 1626 } 1627 } 1628 1629 static size_t process_comp_header(void *record, size_t increment) 1630 { 1631 struct perf_record_compressed2 *event = record; 1632 size_t size = sizeof(*event); 1633 1634 if (increment) { 1635 event->header.size += increment; 1636 return increment; 1637 } 1638 1639 event->header.type = PERF_RECORD_COMPRESSED2; 1640 event->header.size = size; 1641 1642 return size; 1643 } 1644 1645 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map, 1646 void *dst, size_t dst_size, void *src, size_t src_size) 1647 { 1648 ssize_t compressed; 1649 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1; 1650 struct zstd_data *zstd_data = &session->zstd_data; 1651 1652 if (map && map->file) 1653 zstd_data = &map->zstd_data; 1654 1655 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size, 1656 max_record_size, process_comp_header); 1657 if (compressed < 0) 1658 return compressed; 1659 1660 if (map && map->file) { 1661 thread->bytes_transferred += src_size; 1662 thread->bytes_compressed += compressed; 1663 } else { 1664 session->bytes_transferred += src_size; 1665 session->bytes_compressed += compressed; 1666 } 1667 1668 return compressed; 1669 } 1670 1671 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, 1672 bool overwrite, bool synch) 1673 { 1674 u64 bytes_written = rec->bytes_written; 1675 int i; 1676 int rc = 0; 1677 int nr_mmaps; 1678 struct mmap **maps; 1679 int trace_fd = rec->data.file.fd; 1680 off_t off = 0; 1681 1682 if (!evlist) 1683 return 0; 1684 1685 nr_mmaps = thread->nr_mmaps; 1686 maps = overwrite ? thread->overwrite_maps : thread->maps; 1687 1688 if (!maps) 1689 return 0; 1690 1691 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING) 1692 return 0; 1693 1694 if (record__aio_enabled(rec)) 1695 off = record__aio_get_pos(trace_fd); 1696 1697 for (i = 0; i < nr_mmaps; i++) { 1698 u64 flush = 0; 1699 struct mmap *map = maps[i]; 1700 1701 if (map->core.base) { 1702 record__adjust_affinity(rec, map); 1703 if (synch) { 1704 flush = map->core.flush; 1705 map->core.flush = 1; 1706 } 1707 if (!record__aio_enabled(rec)) { 1708 if (perf_mmap__push(map, rec, record__pushfn) < 0) { 1709 if (synch) 1710 map->core.flush = flush; 1711 rc = -1; 1712 goto out; 1713 } 1714 } else { 1715 if (record__aio_push(rec, map, &off) < 0) { 1716 record__aio_set_pos(trace_fd, off); 1717 if (synch) 1718 map->core.flush = flush; 1719 rc = -1; 1720 goto out; 1721 } 1722 } 1723 if (synch) 1724 map->core.flush = flush; 1725 } 1726 1727 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode && 1728 !rec->opts.auxtrace_sample_mode && 1729 record__auxtrace_mmap_read(rec, map) != 0) { 1730 rc = -1; 1731 goto out; 1732 } 1733 } 1734 1735 if (record__aio_enabled(rec)) 1736 record__aio_set_pos(trace_fd, off); 1737 1738 /* 1739 * Mark the round finished in case we wrote 1740 * at least one event. 1741 * 1742 * No need for round events in directory mode, 1743 * because per-cpu maps and files have data 1744 * sorted by kernel. 1745 */ 1746 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written) 1747 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event)); 1748 1749 if (overwrite) 1750 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); 1751 out: 1752 return rc; 1753 } 1754 1755 static int record__mmap_read_all(struct record *rec, bool synch) 1756 { 1757 int err; 1758 1759 err = record__mmap_read_evlist(rec, rec->evlist, false, synch); 1760 if (err) 1761 return err; 1762 1763 return record__mmap_read_evlist(rec, rec->evlist, true, synch); 1764 } 1765 1766 static void record__thread_munmap_filtered(struct fdarray *fda, int fd, 1767 void *arg __maybe_unused) 1768 { 1769 struct perf_mmap *map = fda->priv[fd].ptr; 1770 1771 if (map) 1772 perf_mmap__put(map); 1773 } 1774 1775 static void *record__thread(void *arg) 1776 { 1777 enum thread_msg msg = THREAD_MSG__READY; 1778 bool terminate = false; 1779 struct fdarray *pollfd; 1780 int err, ctlfd_pos; 1781 1782 thread = arg; 1783 thread->tid = gettid(); 1784 1785 err = write(thread->pipes.ack[1], &msg, sizeof(msg)); 1786 if (err == -1) 1787 pr_warning("threads[%d]: failed to notify on start: %s\n", 1788 thread->tid, strerror(errno)); 1789 1790 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu()); 1791 1792 pollfd = &thread->pollfd; 1793 ctlfd_pos = thread->ctlfd_pos; 1794 1795 for (;;) { 1796 unsigned long long hits = thread->samples; 1797 1798 if (record__mmap_read_all(thread->rec, false) < 0 || terminate) 1799 break; 1800 1801 if (hits == thread->samples) { 1802 1803 err = fdarray__poll(pollfd, -1); 1804 /* 1805 * Propagate error, only if there's any. Ignore positive 1806 * number of returned events and interrupt error. 1807 */ 1808 if (err > 0 || (err < 0 && errno == EINTR)) 1809 err = 0; 1810 thread->waking++; 1811 1812 if (fdarray__filter(pollfd, POLLERR | POLLHUP, 1813 record__thread_munmap_filtered, NULL) == 0) 1814 break; 1815 } 1816 1817 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) { 1818 terminate = true; 1819 close(thread->pipes.msg[0]); 1820 thread->pipes.msg[0] = -1; 1821 pollfd->entries[ctlfd_pos].fd = -1; 1822 pollfd->entries[ctlfd_pos].events = 0; 1823 } 1824 1825 pollfd->entries[ctlfd_pos].revents = 0; 1826 } 1827 record__mmap_read_all(thread->rec, true); 1828 1829 err = write(thread->pipes.ack[1], &msg, sizeof(msg)); 1830 if (err == -1) 1831 pr_warning("threads[%d]: failed to notify on termination: %s\n", 1832 thread->tid, strerror(errno)); 1833 1834 return NULL; 1835 } 1836 1837 static void record__init_features(struct record *rec) 1838 { 1839 struct perf_session *session = rec->session; 1840 int feat; 1841 1842 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++) 1843 perf_header__set_feat(&session->header, feat); 1844 1845 if (rec->no_buildid) 1846 perf_header__clear_feat(&session->header, HEADER_BUILD_ID); 1847 1848 if (!have_tracepoints(&rec->evlist->core.entries)) 1849 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA); 1850 1851 if (!rec->opts.branch_stack) 1852 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK); 1853 1854 if (!rec->opts.full_auxtrace) 1855 perf_header__clear_feat(&session->header, HEADER_AUXTRACE); 1856 1857 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) 1858 perf_header__clear_feat(&session->header, HEADER_CLOCKID); 1859 1860 if (!rec->opts.use_clockid) 1861 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA); 1862 1863 if (!record__threads_enabled(rec)) 1864 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT); 1865 1866 if (!record__comp_enabled(rec)) 1867 perf_header__clear_feat(&session->header, HEADER_COMPRESSED); 1868 1869 perf_header__clear_feat(&session->header, HEADER_STAT); 1870 } 1871 1872 static void 1873 record__finish_output(struct record *rec) 1874 { 1875 int i; 1876 struct perf_data *data = &rec->data; 1877 int fd = perf_data__fd(data); 1878 1879 if (data->is_pipe) { 1880 /* Just to display approx. size */ 1881 data->file.size = rec->bytes_written; 1882 return; 1883 } 1884 1885 rec->session->header.data_size += rec->bytes_written; 1886 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR); 1887 if (record__threads_enabled(rec)) { 1888 for (i = 0; i < data->dir.nr; i++) 1889 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR); 1890 } 1891 1892 /* Buildid scanning disabled or build ID in kernel and synthesized map events. */ 1893 if (!rec->no_buildid || !rec->no_buildid_cache) { 1894 process_buildids(rec); 1895 1896 if (rec->buildid_all) 1897 perf_session__dsos_hit_all(rec->session); 1898 } 1899 perf_session__write_header(rec->session, rec->evlist, fd, true); 1900 perf_session__cache_build_ids(rec->session); 1901 } 1902 1903 static int record__synthesize_workload(struct record *rec, bool tail) 1904 { 1905 int err; 1906 struct perf_thread_map *thread_map; 1907 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP; 1908 1909 if (rec->opts.tail_synthesize != tail) 1910 return 0; 1911 1912 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid); 1913 if (thread_map == NULL) 1914 return -1; 1915 1916 err = perf_event__synthesize_thread_map(&rec->tool, thread_map, 1917 process_synthesized_event, 1918 &rec->session->machines.host, 1919 needs_mmap, 1920 rec->opts.sample_address); 1921 perf_thread_map__put(thread_map); 1922 return err; 1923 } 1924 1925 static int write_finished_init(struct record *rec, bool tail) 1926 { 1927 if (rec->opts.tail_synthesize != tail) 1928 return 0; 1929 1930 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event)); 1931 } 1932 1933 static int record__synthesize(struct record *rec, bool tail); 1934 1935 static int 1936 record__switch_output(struct record *rec, bool at_exit) 1937 { 1938 struct perf_data *data = &rec->data; 1939 char *new_filename = NULL; 1940 int fd, err; 1941 1942 /* Same Size: "2015122520103046"*/ 1943 char timestamp[] = "InvalidTimestamp"; 1944 1945 record__aio_mmap_read_sync(rec); 1946 1947 write_finished_init(rec, true); 1948 1949 record__synthesize(rec, true); 1950 if (target__none(&rec->opts.target)) 1951 record__synthesize_workload(rec, true); 1952 1953 rec->samples = 0; 1954 record__finish_output(rec); 1955 err = fetch_current_timestamp(timestamp, sizeof(timestamp)); 1956 if (err) { 1957 pr_err("Failed to get current timestamp\n"); 1958 return -EINVAL; 1959 } 1960 1961 fd = perf_data__switch(data, timestamp, 1962 rec->session->header.data_offset, 1963 at_exit, &new_filename); 1964 if (fd >= 0 && !at_exit) { 1965 rec->bytes_written = 0; 1966 rec->session->header.data_size = 0; 1967 } 1968 1969 if (!quiet) { 1970 fprintf(stderr, "[ perf record: Dump %s.%s ]\n", 1971 data->path, timestamp); 1972 } 1973 1974 if (rec->switch_output.num_files) { 1975 int n = rec->switch_output.cur_file + 1; 1976 1977 if (n >= rec->switch_output.num_files) 1978 n = 0; 1979 rec->switch_output.cur_file = n; 1980 if (rec->switch_output.filenames[n]) { 1981 remove(rec->switch_output.filenames[n]); 1982 zfree(&rec->switch_output.filenames[n]); 1983 } 1984 rec->switch_output.filenames[n] = new_filename; 1985 } else { 1986 free(new_filename); 1987 } 1988 1989 /* Output tracking events */ 1990 if (!at_exit) { 1991 record__synthesize(rec, false); 1992 1993 /* 1994 * In 'perf record --switch-output' without -a, 1995 * record__synthesize() in record__switch_output() won't 1996 * generate tracking events because there's no thread_map 1997 * in evlist. Which causes newly created perf.data doesn't 1998 * contain map and comm information. 1999 * Create a fake thread_map and directly call 2000 * perf_event__synthesize_thread_map() for those events. 2001 */ 2002 if (target__none(&rec->opts.target)) 2003 record__synthesize_workload(rec, false); 2004 write_finished_init(rec, false); 2005 } 2006 return fd; 2007 } 2008 2009 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel, 2010 struct perf_record_lost_samples *lost, 2011 int cpu_idx, int thread_idx, u64 lost_count, 2012 u16 misc_flag) 2013 { 2014 struct perf_sample_id *sid; 2015 struct perf_sample sample; 2016 int id_hdr_size; 2017 2018 perf_sample__init(&sample, /*all=*/true); 2019 lost->lost = lost_count; 2020 if (evsel->core.ids) { 2021 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx); 2022 sample.id = sid->id; 2023 } 2024 2025 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1), 2026 evsel->core.attr.sample_type, &sample); 2027 lost->header.size = sizeof(*lost) + id_hdr_size; 2028 lost->header.misc = misc_flag; 2029 record__write(rec, NULL, lost, lost->header.size); 2030 perf_sample__exit(&sample); 2031 } 2032 2033 static void record__read_lost_samples(struct record *rec) 2034 { 2035 struct perf_session *session = rec->session; 2036 struct perf_record_lost_samples_and_ids lost; 2037 struct evsel *evsel; 2038 2039 /* there was an error during record__open */ 2040 if (session->evlist == NULL) 2041 return; 2042 2043 evlist__for_each_entry(session->evlist, evsel) { 2044 struct xyarray *xy = evsel->core.sample_id; 2045 u64 lost_count; 2046 2047 if (xy == NULL || evsel->core.fd == NULL) 2048 continue; 2049 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) || 2050 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) { 2051 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n"); 2052 continue; 2053 } 2054 2055 for (int x = 0; x < xyarray__max_x(xy); x++) { 2056 for (int y = 0; y < xyarray__max_y(xy); y++) { 2057 struct perf_counts_values count; 2058 2059 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) { 2060 pr_debug("read LOST count failed\n"); 2061 return; 2062 } 2063 2064 if (count.lost) { 2065 memset(&lost, 0, sizeof(lost)); 2066 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES; 2067 __record__save_lost_samples(rec, evsel, &lost.lost, 2068 x, y, count.lost, 0); 2069 } 2070 } 2071 } 2072 2073 lost_count = perf_bpf_filter__lost_count(evsel); 2074 if (lost_count) { 2075 memset(&lost, 0, sizeof(lost)); 2076 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES; 2077 __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count, 2078 PERF_RECORD_MISC_LOST_SAMPLES_BPF); 2079 } 2080 } 2081 } 2082 2083 static volatile sig_atomic_t workload_exec_errno; 2084 2085 /* 2086 * evlist__prepare_workload will send a SIGUSR1 2087 * if the fork fails, since we asked by setting its 2088 * want_signal to true. 2089 */ 2090 static void workload_exec_failed_signal(int signo __maybe_unused, 2091 siginfo_t *info, 2092 void *ucontext __maybe_unused) 2093 { 2094 workload_exec_errno = info->si_value.sival_int; 2095 done = 1; 2096 child_finished = 1; 2097 } 2098 2099 static void snapshot_sig_handler(int sig); 2100 static void alarm_sig_handler(int sig); 2101 2102 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist) 2103 { 2104 if (evlist) { 2105 if (evlist->mmap && evlist->mmap[0].core.base) 2106 return evlist->mmap[0].core.base; 2107 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base) 2108 return evlist->overwrite_mmap[0].core.base; 2109 } 2110 return NULL; 2111 } 2112 2113 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec) 2114 { 2115 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist); 2116 if (pc) 2117 return pc; 2118 return NULL; 2119 } 2120 2121 static int record__synthesize(struct record *rec, bool tail) 2122 { 2123 struct perf_session *session = rec->session; 2124 struct machine *machine = &session->machines.host; 2125 struct perf_data *data = &rec->data; 2126 struct record_opts *opts = &rec->opts; 2127 struct perf_tool *tool = &rec->tool; 2128 int err = 0; 2129 event_op f = process_synthesized_event; 2130 2131 if (rec->opts.tail_synthesize != tail) 2132 return 0; 2133 2134 if (data->is_pipe) { 2135 err = perf_event__synthesize_for_pipe(tool, session, data, 2136 process_synthesized_event); 2137 if (err < 0) 2138 goto out; 2139 2140 rec->bytes_written += err; 2141 } 2142 2143 err = perf_event__synth_time_conv(record__pick_pc(rec), tool, 2144 process_synthesized_event, machine); 2145 if (err) 2146 goto out; 2147 2148 /* Synthesize id_index before auxtrace_info */ 2149 err = perf_event__synthesize_id_index(tool, 2150 process_synthesized_event, 2151 session->evlist, machine); 2152 if (err) 2153 goto out; 2154 2155 if (rec->opts.full_auxtrace) { 2156 err = perf_event__synthesize_auxtrace_info(rec->itr, tool, 2157 session, process_synthesized_event); 2158 if (err) 2159 goto out; 2160 } 2161 2162 if (!evlist__exclude_kernel(rec->evlist)) { 2163 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, 2164 machine); 2165 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" 2166 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 2167 "Check /proc/kallsyms permission or run as root.\n"); 2168 2169 err = perf_event__synthesize_modules(tool, process_synthesized_event, 2170 machine); 2171 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" 2172 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 2173 "Check /proc/modules permission or run as root.\n"); 2174 } 2175 2176 if (perf_guest) { 2177 machines__process_guests(&session->machines, 2178 perf_event__synthesize_guest_os, tool); 2179 } 2180 2181 err = perf_event__synthesize_extra_attr(&rec->tool, 2182 rec->evlist, 2183 process_synthesized_event, 2184 data->is_pipe); 2185 if (err) 2186 goto out; 2187 2188 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads, 2189 process_synthesized_event, 2190 NULL); 2191 if (err < 0) { 2192 pr_err("Couldn't synthesize thread map.\n"); 2193 return err; 2194 } 2195 2196 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus, 2197 process_synthesized_event, NULL); 2198 if (err < 0) { 2199 pr_err("Couldn't synthesize cpu map.\n"); 2200 return err; 2201 } 2202 2203 err = perf_event__synthesize_bpf_events(session, process_synthesized_event, 2204 machine, opts); 2205 if (err < 0) { 2206 pr_warning("Couldn't synthesize bpf events.\n"); 2207 err = 0; 2208 } 2209 2210 if (rec->opts.synth & PERF_SYNTH_CGROUP) { 2211 err = perf_event__synthesize_cgroups(tool, process_synthesized_event, 2212 machine); 2213 if (err < 0) { 2214 pr_warning("Couldn't synthesize cgroup events.\n"); 2215 err = 0; 2216 } 2217 } 2218 2219 if (rec->opts.nr_threads_synthesize > 1) { 2220 mutex_init(&synth_lock); 2221 perf_set_multithreaded(); 2222 f = process_locked_synthesized_event; 2223 } 2224 2225 if (rec->opts.synth & PERF_SYNTH_TASK) { 2226 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP; 2227 2228 err = __machine__synthesize_threads(machine, tool, &opts->target, 2229 rec->evlist->core.threads, 2230 f, needs_mmap, opts->sample_address, 2231 rec->opts.nr_threads_synthesize); 2232 } 2233 2234 if (rec->opts.nr_threads_synthesize > 1) { 2235 perf_set_singlethreaded(); 2236 mutex_destroy(&synth_lock); 2237 } 2238 2239 out: 2240 return err; 2241 } 2242 2243 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused) 2244 { 2245 #ifdef HAVE_LIBBPF_SUPPORT 2246 perf_event__synthesize_final_bpf_metadata(rec->session, 2247 process_synthesized_event); 2248 #endif 2249 } 2250 2251 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data) 2252 { 2253 struct record *rec = data; 2254 pthread_kill(rec->thread_id, SIGUSR2); 2255 return 0; 2256 } 2257 2258 static int record__setup_sb_evlist(struct record *rec) 2259 { 2260 struct record_opts *opts = &rec->opts; 2261 2262 if (rec->sb_evlist != NULL) { 2263 /* 2264 * We get here if --switch-output-event populated the 2265 * sb_evlist, so associate a callback that will send a SIGUSR2 2266 * to the main thread. 2267 */ 2268 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec); 2269 rec->thread_id = pthread_self(); 2270 } 2271 #ifdef HAVE_LIBBPF_SUPPORT 2272 if (!opts->no_bpf_event) { 2273 if (rec->sb_evlist == NULL) { 2274 rec->sb_evlist = evlist__new(); 2275 2276 if (rec->sb_evlist == NULL) { 2277 pr_err("Couldn't create side band evlist.\n."); 2278 return -1; 2279 } 2280 } 2281 2282 if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) { 2283 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); 2284 return -1; 2285 } 2286 } 2287 #endif 2288 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) { 2289 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); 2290 opts->no_bpf_event = true; 2291 } 2292 2293 return 0; 2294 } 2295 2296 static int record__init_clock(struct record *rec) 2297 { 2298 struct perf_session *session = rec->session; 2299 struct timespec ref_clockid; 2300 struct timeval ref_tod; 2301 struct perf_env *env = perf_session__env(session); 2302 u64 ref; 2303 2304 if (!rec->opts.use_clockid) 2305 return 0; 2306 2307 if (rec->opts.use_clockid && rec->opts.clockid_res_ns) 2308 env->clock.clockid_res_ns = rec->opts.clockid_res_ns; 2309 2310 env->clock.clockid = rec->opts.clockid; 2311 2312 if (gettimeofday(&ref_tod, NULL) != 0) { 2313 pr_err("gettimeofday failed, cannot set reference time.\n"); 2314 return -1; 2315 } 2316 2317 if (clock_gettime(rec->opts.clockid, &ref_clockid)) { 2318 pr_err("clock_gettime failed, cannot set reference time.\n"); 2319 return -1; 2320 } 2321 2322 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC + 2323 (u64) ref_tod.tv_usec * NSEC_PER_USEC; 2324 2325 env->clock.tod_ns = ref; 2326 2327 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC + 2328 (u64) ref_clockid.tv_nsec; 2329 2330 env->clock.clockid_ns = ref; 2331 return 0; 2332 } 2333 2334 static void hit_auxtrace_snapshot_trigger(struct record *rec) 2335 { 2336 if (trigger_is_ready(&auxtrace_snapshot_trigger)) { 2337 trigger_hit(&auxtrace_snapshot_trigger); 2338 auxtrace_record__snapshot_started = 1; 2339 if (auxtrace_record__snapshot_start(rec->itr)) 2340 trigger_error(&auxtrace_snapshot_trigger); 2341 } 2342 } 2343 2344 static int record__terminate_thread(struct record_thread *thread_data) 2345 { 2346 int err; 2347 enum thread_msg ack = THREAD_MSG__UNDEFINED; 2348 pid_t tid = thread_data->tid; 2349 2350 close(thread_data->pipes.msg[1]); 2351 thread_data->pipes.msg[1] = -1; 2352 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack)); 2353 if (err > 0) 2354 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]); 2355 else 2356 pr_warning("threads[%d]: failed to receive termination notification from %d\n", 2357 thread->tid, tid); 2358 2359 return 0; 2360 } 2361 2362 static int record__start_threads(struct record *rec) 2363 { 2364 int t, tt, err, ret = 0, nr_threads = rec->nr_threads; 2365 struct record_thread *thread_data = rec->thread_data; 2366 sigset_t full, mask; 2367 pthread_t handle; 2368 pthread_attr_t attrs; 2369 2370 thread = &thread_data[0]; 2371 2372 if (!record__threads_enabled(rec)) 2373 return 0; 2374 2375 sigfillset(&full); 2376 if (sigprocmask(SIG_SETMASK, &full, &mask)) { 2377 pr_err("Failed to block signals on threads start: %s\n", strerror(errno)); 2378 return -1; 2379 } 2380 2381 pthread_attr_init(&attrs); 2382 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); 2383 2384 for (t = 1; t < nr_threads; t++) { 2385 enum thread_msg msg = THREAD_MSG__UNDEFINED; 2386 2387 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP 2388 pthread_attr_setaffinity_np(&attrs, 2389 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)), 2390 (cpu_set_t *)(thread_data[t].mask->affinity.bits)); 2391 #endif 2392 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) { 2393 for (tt = 1; tt < t; tt++) 2394 record__terminate_thread(&thread_data[t]); 2395 pr_err("Failed to start threads: %s\n", strerror(errno)); 2396 ret = -1; 2397 goto out_err; 2398 } 2399 2400 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg)); 2401 if (err > 0) 2402 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid, 2403 thread_msg_tags[msg]); 2404 else 2405 pr_warning("threads[%d]: failed to receive start notification from %d\n", 2406 thread->tid, rec->thread_data[t].tid); 2407 } 2408 2409 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity), 2410 (cpu_set_t *)thread->mask->affinity.bits); 2411 2412 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu()); 2413 2414 out_err: 2415 pthread_attr_destroy(&attrs); 2416 2417 if (sigprocmask(SIG_SETMASK, &mask, NULL)) { 2418 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno)); 2419 ret = -1; 2420 } 2421 2422 return ret; 2423 } 2424 2425 static int record__stop_threads(struct record *rec) 2426 { 2427 int t; 2428 struct record_thread *thread_data = rec->thread_data; 2429 2430 for (t = 1; t < rec->nr_threads; t++) 2431 record__terminate_thread(&thread_data[t]); 2432 2433 for (t = 0; t < rec->nr_threads; t++) { 2434 rec->samples += thread_data[t].samples; 2435 if (!record__threads_enabled(rec)) 2436 continue; 2437 rec->session->bytes_transferred += thread_data[t].bytes_transferred; 2438 rec->session->bytes_compressed += thread_data[t].bytes_compressed; 2439 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid, 2440 thread_data[t].samples, thread_data[t].waking); 2441 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed) 2442 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n", 2443 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed); 2444 else 2445 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written); 2446 } 2447 2448 return 0; 2449 } 2450 2451 static unsigned long record__waking(struct record *rec) 2452 { 2453 int t; 2454 unsigned long waking = 0; 2455 struct record_thread *thread_data = rec->thread_data; 2456 2457 for (t = 0; t < rec->nr_threads; t++) 2458 waking += thread_data[t].waking; 2459 2460 return waking; 2461 } 2462 2463 static int __cmd_record(struct record *rec, int argc, const char **argv) 2464 { 2465 int err; 2466 int status = 0; 2467 const bool forks = argc > 0; 2468 struct perf_tool *tool = &rec->tool; 2469 struct record_opts *opts = &rec->opts; 2470 struct perf_data *data = &rec->data; 2471 struct perf_session *session; 2472 bool disabled = false, draining = false; 2473 int fd; 2474 float ratio = 0; 2475 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED; 2476 struct perf_env *env; 2477 2478 atexit(record__sig_exit); 2479 signal(SIGCHLD, sig_handler); 2480 signal(SIGINT, sig_handler); 2481 signal(SIGTERM, sig_handler); 2482 signal(SIGSEGV, sigsegv_handler); 2483 2484 if (rec->opts.record_cgroup) { 2485 #ifndef HAVE_FILE_HANDLE 2486 pr_err("cgroup tracking is not supported\n"); 2487 return -1; 2488 #endif 2489 } 2490 2491 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) { 2492 signal(SIGUSR2, snapshot_sig_handler); 2493 if (rec->opts.auxtrace_snapshot_mode) 2494 trigger_on(&auxtrace_snapshot_trigger); 2495 if (rec->switch_output.enabled) 2496 trigger_on(&switch_output_trigger); 2497 } else { 2498 signal(SIGUSR2, SIG_IGN); 2499 } 2500 2501 perf_tool__init(tool, /*ordered_events=*/true); 2502 tool->sample = process_sample_event; 2503 tool->fork = perf_event__process_fork; 2504 tool->exit = perf_event__process_exit; 2505 tool->comm = perf_event__process_comm; 2506 tool->namespaces = perf_event__process_namespaces; 2507 tool->mmap = build_id__process_mmap; 2508 tool->mmap2 = build_id__process_mmap2; 2509 tool->itrace_start = process_timestamp_boundary; 2510 tool->aux = process_timestamp_boundary; 2511 tool->namespace_events = rec->opts.record_namespaces; 2512 tool->cgroup_events = rec->opts.record_cgroup; 2513 session = perf_session__new(data, tool); 2514 if (IS_ERR(session)) { 2515 pr_err("Perf session creation failed.\n"); 2516 return PTR_ERR(session); 2517 } 2518 env = perf_session__env(session); 2519 if (record__threads_enabled(rec)) { 2520 if (perf_data__is_pipe(&rec->data)) { 2521 pr_err("Parallel trace streaming is not available in pipe mode.\n"); 2522 return -1; 2523 } 2524 if (rec->opts.full_auxtrace) { 2525 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n"); 2526 return -1; 2527 } 2528 } 2529 2530 fd = perf_data__fd(data); 2531 rec->session = session; 2532 2533 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) { 2534 pr_err("Compression initialization failed.\n"); 2535 return -1; 2536 } 2537 #ifdef HAVE_EVENTFD_SUPPORT 2538 done_fd = eventfd(0, EFD_NONBLOCK); 2539 if (done_fd < 0) { 2540 pr_err("Failed to create wakeup eventfd, error: %m\n"); 2541 status = -1; 2542 goto out_delete_session; 2543 } 2544 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd); 2545 if (err < 0) { 2546 pr_err("Failed to add wakeup eventfd to poll list\n"); 2547 status = err; 2548 goto out_delete_session; 2549 } 2550 #endif // HAVE_EVENTFD_SUPPORT 2551 2552 env->comp_type = PERF_COMP_ZSTD; 2553 env->comp_level = rec->opts.comp_level; 2554 2555 if (rec->opts.kcore && 2556 !record__kcore_readable(&session->machines.host)) { 2557 pr_err("ERROR: kcore is not readable.\n"); 2558 return -1; 2559 } 2560 2561 if (record__init_clock(rec)) 2562 return -1; 2563 2564 record__init_features(rec); 2565 2566 if (forks) { 2567 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe, 2568 workload_exec_failed_signal); 2569 if (err < 0) { 2570 pr_err("Couldn't run the workload!\n"); 2571 status = err; 2572 goto out_delete_session; 2573 } 2574 } 2575 2576 /* 2577 * If we have just single event and are sending data 2578 * through pipe, we need to force the ids allocation, 2579 * because we synthesize event name through the pipe 2580 * and need the id for that. 2581 */ 2582 if (data->is_pipe && rec->evlist->core.nr_entries == 1) 2583 rec->opts.sample_id = true; 2584 2585 if (rec->timestamp_filename && perf_data__is_pipe(data)) { 2586 rec->timestamp_filename = false; 2587 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n"); 2588 } 2589 2590 /* 2591 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE 2592 * and hybrid_merge is false. 2593 */ 2594 evlist__uniquify_evsel_names(rec->evlist, &stat_config); 2595 2596 evlist__config(rec->evlist, opts, &callchain_param); 2597 2598 /* Debug message used by test scripts */ 2599 pr_debug3("perf record opening and mmapping events\n"); 2600 if (record__open(rec) != 0) { 2601 err = -1; 2602 goto out_free_threads; 2603 } 2604 /* Debug message used by test scripts */ 2605 pr_debug3("perf record done opening and mmapping events\n"); 2606 env->comp_mmap_len = session->evlist->core.mmap_len; 2607 2608 if (rec->opts.kcore) { 2609 err = record__kcore_copy(&session->machines.host, data); 2610 if (err) { 2611 pr_err("ERROR: Failed to copy kcore\n"); 2612 goto out_free_threads; 2613 } 2614 } 2615 2616 /* 2617 * Normally perf_session__new would do this, but it doesn't have the 2618 * evlist. 2619 */ 2620 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) { 2621 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n"); 2622 rec->tool.ordered_events = false; 2623 } 2624 2625 if (evlist__nr_groups(rec->evlist) == 0) 2626 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC); 2627 2628 if (data->is_pipe) { 2629 err = perf_header__write_pipe(fd); 2630 if (err < 0) 2631 goto out_free_threads; 2632 } else { 2633 err = perf_session__write_header(session, rec->evlist, fd, false); 2634 if (err < 0) 2635 goto out_free_threads; 2636 } 2637 2638 err = -1; 2639 if (!rec->no_buildid 2640 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { 2641 pr_err("Couldn't generate buildids. " 2642 "Use --no-buildid to profile anyway.\n"); 2643 goto out_free_threads; 2644 } 2645 2646 if (!evlist__needs_bpf_sb_event(rec->evlist)) 2647 opts->no_bpf_event = true; 2648 2649 err = record__setup_sb_evlist(rec); 2650 if (err) 2651 goto out_free_threads; 2652 2653 err = record__synthesize(rec, false); 2654 if (err < 0) 2655 goto out_free_threads; 2656 2657 if (rec->realtime_prio) { 2658 struct sched_param param; 2659 2660 param.sched_priority = rec->realtime_prio; 2661 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { 2662 pr_err("Could not set realtime priority.\n"); 2663 err = -1; 2664 goto out_free_threads; 2665 } 2666 } 2667 2668 if (record__start_threads(rec)) 2669 goto out_free_threads; 2670 2671 /* 2672 * When perf is starting the traced process, all the events 2673 * (apart from group members) have enable_on_exec=1 set, 2674 * so don't spoil it by prematurely enabling them. 2675 */ 2676 if (!target__none(&opts->target) && !opts->target.initial_delay) 2677 evlist__enable(rec->evlist); 2678 2679 /* 2680 * offcpu-time does not call execve, so enable_on_exe wouldn't work 2681 * when recording a workload, do it manually 2682 */ 2683 if (rec->off_cpu) 2684 evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT); 2685 2686 /* 2687 * Let the child rip 2688 */ 2689 if (forks) { 2690 struct machine *machine = &session->machines.host; 2691 union perf_event *event; 2692 pid_t tgid; 2693 2694 event = malloc(sizeof(event->comm) + machine->id_hdr_size); 2695 if (event == NULL) { 2696 err = -ENOMEM; 2697 goto out_child; 2698 } 2699 2700 /* 2701 * Some H/W events are generated before COMM event 2702 * which is emitted during exec(), so perf script 2703 * cannot see a correct process name for those events. 2704 * Synthesize COMM event to prevent it. 2705 */ 2706 tgid = perf_event__synthesize_comm(tool, event, 2707 rec->evlist->workload.pid, 2708 process_synthesized_event, 2709 machine); 2710 free(event); 2711 2712 if (tgid == -1) 2713 goto out_child; 2714 2715 event = malloc(sizeof(event->namespaces) + 2716 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + 2717 machine->id_hdr_size); 2718 if (event == NULL) { 2719 err = -ENOMEM; 2720 goto out_child; 2721 } 2722 2723 /* 2724 * Synthesize NAMESPACES event for the command specified. 2725 */ 2726 perf_event__synthesize_namespaces(tool, event, 2727 rec->evlist->workload.pid, 2728 tgid, process_synthesized_event, 2729 machine); 2730 free(event); 2731 2732 evlist__start_workload(rec->evlist); 2733 } 2734 2735 if (opts->target.initial_delay) { 2736 pr_info(EVLIST_DISABLED_MSG); 2737 if (opts->target.initial_delay > 0) { 2738 usleep(opts->target.initial_delay * USEC_PER_MSEC); 2739 evlist__enable(rec->evlist); 2740 pr_info(EVLIST_ENABLED_MSG); 2741 } 2742 } 2743 2744 err = event_enable_timer__start(rec->evlist->eet); 2745 if (err) 2746 goto out_child; 2747 2748 /* Debug message used by test scripts */ 2749 pr_debug3("perf record has started\n"); 2750 fflush(stderr); 2751 2752 trigger_ready(&auxtrace_snapshot_trigger); 2753 trigger_ready(&switch_output_trigger); 2754 perf_hooks__invoke_record_start(); 2755 2756 /* 2757 * Must write FINISHED_INIT so it will be seen after all other 2758 * synthesized user events, but before any regular events. 2759 */ 2760 err = write_finished_init(rec, false); 2761 if (err < 0) 2762 goto out_child; 2763 2764 for (;;) { 2765 unsigned long long hits = thread->samples; 2766 2767 /* 2768 * rec->evlist->bkw_mmap_state is possible to be 2769 * BKW_MMAP_EMPTY here: when done == true and 2770 * hits != rec->samples in previous round. 2771 * 2772 * evlist__toggle_bkw_mmap ensure we never 2773 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING. 2774 */ 2775 if (trigger_is_hit(&switch_output_trigger) || done || draining) 2776 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING); 2777 2778 if (record__mmap_read_all(rec, false) < 0) { 2779 trigger_error(&auxtrace_snapshot_trigger); 2780 trigger_error(&switch_output_trigger); 2781 err = -1; 2782 goto out_child; 2783 } 2784 2785 if (auxtrace_record__snapshot_started) { 2786 auxtrace_record__snapshot_started = 0; 2787 if (!trigger_is_error(&auxtrace_snapshot_trigger)) 2788 record__read_auxtrace_snapshot(rec, false); 2789 if (trigger_is_error(&auxtrace_snapshot_trigger)) { 2790 pr_err("AUX area tracing snapshot failed\n"); 2791 err = -1; 2792 goto out_child; 2793 } 2794 } 2795 2796 if (trigger_is_hit(&switch_output_trigger)) { 2797 /* 2798 * If switch_output_trigger is hit, the data in 2799 * overwritable ring buffer should have been collected, 2800 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY. 2801 * 2802 * If SIGUSR2 raise after or during record__mmap_read_all(), 2803 * record__mmap_read_all() didn't collect data from 2804 * overwritable ring buffer. Read again. 2805 */ 2806 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING) 2807 continue; 2808 trigger_ready(&switch_output_trigger); 2809 2810 /* 2811 * Reenable events in overwrite ring buffer after 2812 * record__mmap_read_all(): we should have collected 2813 * data from it. 2814 */ 2815 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING); 2816 2817 if (!quiet) 2818 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n", 2819 record__waking(rec)); 2820 thread->waking = 0; 2821 fd = record__switch_output(rec, false); 2822 if (fd < 0) { 2823 pr_err("Failed to switch to new file\n"); 2824 trigger_error(&switch_output_trigger); 2825 err = fd; 2826 goto out_child; 2827 } 2828 2829 /* re-arm the alarm */ 2830 if (rec->switch_output.time) 2831 alarm(rec->switch_output.time); 2832 } 2833 2834 if (hits == thread->samples) { 2835 if (done || draining) 2836 break; 2837 err = fdarray__poll(&thread->pollfd, -1); 2838 /* 2839 * Propagate error, only if there's any. Ignore positive 2840 * number of returned events and interrupt error. 2841 */ 2842 if (err > 0 || (err < 0 && errno == EINTR)) 2843 err = 0; 2844 thread->waking++; 2845 2846 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP, 2847 record__thread_munmap_filtered, NULL) == 0) 2848 draining = true; 2849 2850 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread); 2851 if (err) 2852 goto out_child; 2853 } 2854 2855 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) { 2856 switch (cmd) { 2857 case EVLIST_CTL_CMD_SNAPSHOT: 2858 hit_auxtrace_snapshot_trigger(rec); 2859 evlist__ctlfd_ack(rec->evlist); 2860 break; 2861 case EVLIST_CTL_CMD_STOP: 2862 done = 1; 2863 break; 2864 case EVLIST_CTL_CMD_ACK: 2865 case EVLIST_CTL_CMD_UNSUPPORTED: 2866 case EVLIST_CTL_CMD_ENABLE: 2867 case EVLIST_CTL_CMD_DISABLE: 2868 case EVLIST_CTL_CMD_EVLIST: 2869 case EVLIST_CTL_CMD_PING: 2870 default: 2871 break; 2872 } 2873 } 2874 2875 err = event_enable_timer__process(rec->evlist->eet); 2876 if (err < 0) 2877 goto out_child; 2878 if (err) { 2879 err = 0; 2880 done = 1; 2881 } 2882 2883 /* 2884 * When perf is starting the traced process, at the end events 2885 * die with the process and we wait for that. Thus no need to 2886 * disable events in this case. 2887 */ 2888 if (done && !disabled && !target__none(&opts->target)) { 2889 trigger_off(&auxtrace_snapshot_trigger); 2890 evlist__disable(rec->evlist); 2891 disabled = true; 2892 } 2893 } 2894 2895 trigger_off(&auxtrace_snapshot_trigger); 2896 trigger_off(&switch_output_trigger); 2897 2898 record__synthesize_final_bpf_metadata(rec); 2899 2900 if (opts->auxtrace_snapshot_on_exit) 2901 record__auxtrace_snapshot_exit(rec); 2902 2903 if (forks && workload_exec_errno) { 2904 char msg[STRERR_BUFSIZE]; 2905 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); 2906 struct strbuf sb = STRBUF_INIT; 2907 2908 evlist__format_evsels(rec->evlist, &sb, 2048); 2909 2910 pr_err("Failed to collect '%s' for the '%s' workload: %s\n", 2911 sb.buf, argv[0], emsg); 2912 strbuf_release(&sb); 2913 err = -1; 2914 goto out_child; 2915 } 2916 2917 if (!quiet) 2918 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", 2919 record__waking(rec)); 2920 2921 write_finished_init(rec, true); 2922 2923 if (target__none(&rec->opts.target)) 2924 record__synthesize_workload(rec, true); 2925 2926 out_child: 2927 record__stop_threads(rec); 2928 record__mmap_read_all(rec, true); 2929 out_free_threads: 2930 record__free_thread_data(rec); 2931 evlist__finalize_ctlfd(rec->evlist); 2932 record__aio_mmap_read_sync(rec); 2933 2934 if (rec->session->bytes_transferred && rec->session->bytes_compressed) { 2935 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed; 2936 env->comp_ratio = ratio + 0.5; 2937 } 2938 2939 if (forks) { 2940 int exit_status; 2941 2942 if (!child_finished) 2943 kill(rec->evlist->workload.pid, SIGTERM); 2944 2945 wait(&exit_status); 2946 2947 if (err < 0) 2948 status = err; 2949 else if (WIFEXITED(exit_status)) 2950 status = WEXITSTATUS(exit_status); 2951 else if (WIFSIGNALED(exit_status)) 2952 signr = WTERMSIG(exit_status); 2953 } else 2954 status = err; 2955 2956 if (rec->off_cpu) 2957 rec->bytes_written += off_cpu_write(rec->session); 2958 2959 record__read_lost_samples(rec); 2960 /* this will be recalculated during process_buildids() */ 2961 rec->samples = 0; 2962 2963 if (!err) { 2964 record__synthesize(rec, true); 2965 if (!rec->timestamp_filename) { 2966 record__finish_output(rec); 2967 } else { 2968 fd = record__switch_output(rec, true); 2969 if (fd < 0) { 2970 status = fd; 2971 goto out_delete_session; 2972 } 2973 } 2974 } 2975 2976 perf_hooks__invoke_record_end(); 2977 2978 if (!err && !quiet) { 2979 char samples[128]; 2980 const char *postfix = rec->timestamp_filename ? 2981 ".<timestamp>" : ""; 2982 2983 if (rec->samples && !rec->opts.full_auxtrace) 2984 scnprintf(samples, sizeof(samples), 2985 " (%" PRIu64 " samples)", rec->samples); 2986 else 2987 samples[0] = '\0'; 2988 2989 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s", 2990 perf_data__size(data) / 1024.0 / 1024.0, 2991 data->path, postfix, samples); 2992 if (ratio) { 2993 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)", 2994 rec->session->bytes_transferred / 1024.0 / 1024.0, 2995 ratio); 2996 } 2997 fprintf(stderr, " ]\n"); 2998 } 2999 3000 out_delete_session: 3001 #ifdef HAVE_EVENTFD_SUPPORT 3002 if (done_fd >= 0) { 3003 fd = done_fd; 3004 done_fd = -1; 3005 3006 close(fd); 3007 } 3008 #endif 3009 zstd_fini(&session->zstd_data); 3010 if (!opts->no_bpf_event) 3011 evlist__stop_sb_thread(rec->sb_evlist); 3012 3013 perf_session__delete(session); 3014 return status; 3015 } 3016 3017 static void callchain_debug(struct callchain_param *callchain) 3018 { 3019 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; 3020 3021 pr_debug("callchain: type %s\n", str[callchain->record_mode]); 3022 3023 if (callchain->record_mode == CALLCHAIN_DWARF) 3024 pr_debug("callchain: stack dump size %d\n", 3025 callchain->dump_size); 3026 } 3027 3028 int record_opts__parse_callchain(struct record_opts *record, 3029 struct callchain_param *callchain, 3030 const char *arg, bool unset) 3031 { 3032 int ret; 3033 callchain->enabled = !unset; 3034 3035 /* --no-call-graph */ 3036 if (unset) { 3037 callchain->record_mode = CALLCHAIN_NONE; 3038 pr_debug("callchain: disabled\n"); 3039 return 0; 3040 } 3041 3042 ret = parse_callchain_record_opt(arg, callchain); 3043 if (!ret) { 3044 /* Enable data address sampling for DWARF unwind. */ 3045 if (callchain->record_mode == CALLCHAIN_DWARF) 3046 record->sample_address = true; 3047 callchain_debug(callchain); 3048 } 3049 3050 return ret; 3051 } 3052 3053 int record_parse_callchain_opt(const struct option *opt, 3054 const char *arg, 3055 int unset) 3056 { 3057 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); 3058 } 3059 3060 int record_callchain_opt(const struct option *opt, 3061 const char *arg __maybe_unused, 3062 int unset __maybe_unused) 3063 { 3064 struct callchain_param *callchain = opt->value; 3065 3066 callchain->enabled = true; 3067 3068 if (callchain->record_mode == CALLCHAIN_NONE) 3069 callchain->record_mode = CALLCHAIN_FP; 3070 3071 callchain_debug(callchain); 3072 return 0; 3073 } 3074 3075 static int perf_record_config(const char *var, const char *value, void *cb) 3076 { 3077 struct record *rec = cb; 3078 3079 if (!strcmp(var, "record.build-id")) { 3080 if (!strcmp(value, "cache")) 3081 rec->no_buildid_cache = false; 3082 else if (!strcmp(value, "no-cache")) 3083 rec->no_buildid_cache = true; 3084 else if (!strcmp(value, "skip")) 3085 rec->no_buildid = rec->no_buildid_cache = true; 3086 else if (!strcmp(value, "mmap")) 3087 rec->buildid_mmap = true; 3088 else if (!strcmp(value, "no-mmap")) 3089 rec->buildid_mmap = false; 3090 else 3091 return -1; 3092 return 0; 3093 } 3094 if (!strcmp(var, "record.call-graph")) { 3095 var = "call-graph.record-mode"; 3096 return perf_default_config(var, value, cb); 3097 } 3098 #ifdef HAVE_AIO_SUPPORT 3099 if (!strcmp(var, "record.aio")) { 3100 rec->opts.nr_cblocks = strtol(value, NULL, 0); 3101 if (!rec->opts.nr_cblocks) 3102 rec->opts.nr_cblocks = nr_cblocks_default; 3103 } 3104 #endif 3105 if (!strcmp(var, "record.debuginfod")) { 3106 rec->debuginfod.urls = strdup(value); 3107 if (!rec->debuginfod.urls) 3108 return -ENOMEM; 3109 rec->debuginfod.set = true; 3110 } 3111 3112 return 0; 3113 } 3114 3115 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset) 3116 { 3117 struct record *rec = (struct record *)opt->value; 3118 3119 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset); 3120 } 3121 3122 static int record__parse_affinity(const struct option *opt, const char *str, int unset) 3123 { 3124 struct record_opts *opts = (struct record_opts *)opt->value; 3125 3126 if (unset || !str) 3127 return 0; 3128 3129 if (!strcasecmp(str, "node")) 3130 opts->affinity = PERF_AFFINITY_NODE; 3131 else if (!strcasecmp(str, "cpu")) 3132 opts->affinity = PERF_AFFINITY_CPU; 3133 3134 return 0; 3135 } 3136 3137 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits) 3138 { 3139 mask->nbits = nr_bits; 3140 mask->bits = bitmap_zalloc(mask->nbits); 3141 if (!mask->bits) 3142 return -ENOMEM; 3143 3144 return 0; 3145 } 3146 3147 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask) 3148 { 3149 bitmap_free(mask->bits); 3150 mask->nbits = 0; 3151 } 3152 3153 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits) 3154 { 3155 int ret; 3156 3157 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits); 3158 if (ret) { 3159 mask->affinity.bits = NULL; 3160 return ret; 3161 } 3162 3163 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits); 3164 if (ret) { 3165 record__mmap_cpu_mask_free(&mask->maps); 3166 mask->maps.bits = NULL; 3167 } 3168 3169 return ret; 3170 } 3171 3172 static void record__thread_mask_free(struct thread_mask *mask) 3173 { 3174 record__mmap_cpu_mask_free(&mask->maps); 3175 record__mmap_cpu_mask_free(&mask->affinity); 3176 } 3177 3178 static int record__parse_threads(const struct option *opt, const char *str, int unset) 3179 { 3180 int s; 3181 struct record_opts *opts = opt->value; 3182 3183 if (unset || !str || !strlen(str)) { 3184 opts->threads_spec = THREAD_SPEC__CPU; 3185 } else { 3186 for (s = 1; s < THREAD_SPEC__MAX; s++) { 3187 if (s == THREAD_SPEC__USER) { 3188 opts->threads_user_spec = strdup(str); 3189 if (!opts->threads_user_spec) 3190 return -ENOMEM; 3191 opts->threads_spec = THREAD_SPEC__USER; 3192 break; 3193 } 3194 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) { 3195 opts->threads_spec = s; 3196 break; 3197 } 3198 } 3199 } 3200 3201 if (opts->threads_spec == THREAD_SPEC__USER) 3202 pr_debug("threads_spec: %s\n", opts->threads_user_spec); 3203 else 3204 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]); 3205 3206 return 0; 3207 } 3208 3209 static int parse_output_max_size(const struct option *opt, 3210 const char *str, int unset) 3211 { 3212 unsigned long *s = (unsigned long *)opt->value; 3213 static struct parse_tag tags_size[] = { 3214 { .tag = 'B', .mult = 1 }, 3215 { .tag = 'K', .mult = 1 << 10 }, 3216 { .tag = 'M', .mult = 1 << 20 }, 3217 { .tag = 'G', .mult = 1 << 30 }, 3218 { .tag = 0 }, 3219 }; 3220 unsigned long val; 3221 3222 if (unset) { 3223 *s = 0; 3224 return 0; 3225 } 3226 3227 val = parse_tag_value(str, tags_size); 3228 if (val != (unsigned long) -1) { 3229 *s = val; 3230 return 0; 3231 } 3232 3233 return -1; 3234 } 3235 3236 static int record__parse_mmap_pages(const struct option *opt, 3237 const char *str, 3238 int unset __maybe_unused) 3239 { 3240 struct record_opts *opts = opt->value; 3241 char *s, *p; 3242 unsigned int mmap_pages; 3243 int ret; 3244 3245 if (!str) 3246 return -EINVAL; 3247 3248 s = strdup(str); 3249 if (!s) 3250 return -ENOMEM; 3251 3252 p = strchr(s, ','); 3253 if (p) 3254 *p = '\0'; 3255 3256 if (*s) { 3257 ret = __evlist__parse_mmap_pages(&mmap_pages, s); 3258 if (ret) 3259 goto out_free; 3260 opts->mmap_pages = mmap_pages; 3261 } 3262 3263 if (!p) { 3264 ret = 0; 3265 goto out_free; 3266 } 3267 3268 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1); 3269 if (ret) 3270 goto out_free; 3271 3272 opts->auxtrace_mmap_pages = mmap_pages; 3273 3274 out_free: 3275 free(s); 3276 return ret; 3277 } 3278 3279 static int record__parse_off_cpu_thresh(const struct option *opt, 3280 const char *str, 3281 int unset __maybe_unused) 3282 { 3283 struct record_opts *opts = opt->value; 3284 char *endptr; 3285 u64 off_cpu_thresh_ms; 3286 3287 if (!str) 3288 return -EINVAL; 3289 3290 off_cpu_thresh_ms = strtoull(str, &endptr, 10); 3291 3292 /* the threshold isn't string "0", yet strtoull() returns 0, parsing failed */ 3293 if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0"))) 3294 return -EINVAL; 3295 else 3296 opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC; 3297 3298 return 0; 3299 } 3300 3301 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused) 3302 { 3303 } 3304 3305 static int parse_control_option(const struct option *opt, 3306 const char *str, 3307 int unset __maybe_unused) 3308 { 3309 struct record_opts *opts = opt->value; 3310 3311 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close); 3312 } 3313 3314 static void switch_output_size_warn(struct record *rec) 3315 { 3316 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages); 3317 struct switch_output *s = &rec->switch_output; 3318 3319 wakeup_size /= 2; 3320 3321 if (s->size < wakeup_size) { 3322 char buf[100]; 3323 3324 unit_number__scnprintf(buf, sizeof(buf), wakeup_size); 3325 pr_warning("WARNING: switch-output data size lower than " 3326 "wakeup kernel buffer size (%s) " 3327 "expect bigger perf.data sizes\n", buf); 3328 } 3329 } 3330 3331 static int switch_output_setup(struct record *rec) 3332 { 3333 struct switch_output *s = &rec->switch_output; 3334 static struct parse_tag tags_size[] = { 3335 { .tag = 'B', .mult = 1 }, 3336 { .tag = 'K', .mult = 1 << 10 }, 3337 { .tag = 'M', .mult = 1 << 20 }, 3338 { .tag = 'G', .mult = 1 << 30 }, 3339 { .tag = 0 }, 3340 }; 3341 static struct parse_tag tags_time[] = { 3342 { .tag = 's', .mult = 1 }, 3343 { .tag = 'm', .mult = 60 }, 3344 { .tag = 'h', .mult = 60*60 }, 3345 { .tag = 'd', .mult = 60*60*24 }, 3346 { .tag = 0 }, 3347 }; 3348 unsigned long val; 3349 3350 /* 3351 * If we're using --switch-output-events, then we imply its 3352 * --switch-output=signal, as we'll send a SIGUSR2 from the side band 3353 * thread to its parent. 3354 */ 3355 if (rec->switch_output_event_set) { 3356 if (record__threads_enabled(rec)) { 3357 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n"); 3358 return 0; 3359 } 3360 goto do_signal; 3361 } 3362 3363 if (!s->set) 3364 return 0; 3365 3366 if (record__threads_enabled(rec)) { 3367 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n"); 3368 return 0; 3369 } 3370 3371 if (!strcmp(s->str, "signal")) { 3372 do_signal: 3373 s->signal = true; 3374 pr_debug("switch-output with SIGUSR2 signal\n"); 3375 goto enabled; 3376 } 3377 3378 val = parse_tag_value(s->str, tags_size); 3379 if (val != (unsigned long) -1) { 3380 s->size = val; 3381 pr_debug("switch-output with %s size threshold\n", s->str); 3382 goto enabled; 3383 } 3384 3385 val = parse_tag_value(s->str, tags_time); 3386 if (val != (unsigned long) -1) { 3387 s->time = val; 3388 pr_debug("switch-output with %s time threshold (%lu seconds)\n", 3389 s->str, s->time); 3390 goto enabled; 3391 } 3392 3393 return -1; 3394 3395 enabled: 3396 rec->timestamp_filename = true; 3397 s->enabled = true; 3398 3399 if (s->size && !rec->opts.no_buffering) 3400 switch_output_size_warn(rec); 3401 3402 return 0; 3403 } 3404 3405 static const char * const __record_usage[] = { 3406 "perf record [<options>] [<command>]", 3407 "perf record [<options>] -- <command> [<options>]", 3408 NULL 3409 }; 3410 const char * const *record_usage = __record_usage; 3411 3412 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, 3413 struct perf_sample *sample, struct machine *machine) 3414 { 3415 /* 3416 * We already have the kernel maps, put in place via perf_session__create_kernel_maps() 3417 * no need to add them twice. 3418 */ 3419 if (!(event->header.misc & PERF_RECORD_MISC_USER)) 3420 return 0; 3421 return perf_event__process_mmap(tool, event, sample, machine); 3422 } 3423 3424 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, 3425 struct perf_sample *sample, struct machine *machine) 3426 { 3427 /* 3428 * We already have the kernel maps, put in place via perf_session__create_kernel_maps() 3429 * no need to add them twice. 3430 */ 3431 if (!(event->header.misc & PERF_RECORD_MISC_USER)) 3432 return 0; 3433 3434 return perf_event__process_mmap2(tool, event, sample, machine); 3435 } 3436 3437 static int process_timestamp_boundary(const struct perf_tool *tool, 3438 union perf_event *event __maybe_unused, 3439 struct perf_sample *sample, 3440 struct machine *machine __maybe_unused) 3441 { 3442 struct record *rec = container_of(tool, struct record, tool); 3443 3444 set_timestamp_boundary(rec, sample->time); 3445 return 0; 3446 } 3447 3448 static int parse_record_synth_option(const struct option *opt, 3449 const char *str, 3450 int unset __maybe_unused) 3451 { 3452 struct record_opts *opts = opt->value; 3453 char *p = strdup(str); 3454 3455 if (p == NULL) 3456 return -1; 3457 3458 opts->synth = parse_synth_opt(p); 3459 free(p); 3460 3461 if (opts->synth < 0) { 3462 pr_err("Invalid synth option: %s\n", str); 3463 return -1; 3464 } 3465 return 0; 3466 } 3467 3468 /* 3469 * XXX Ideally would be local to cmd_record() and passed to a record__new 3470 * because we need to have access to it in record__exit, that is called 3471 * after cmd_record() exits, but since record_options need to be accessible to 3472 * builtin-script, leave it here. 3473 * 3474 * At least we don't ouch it in all the other functions here directly. 3475 * 3476 * Just say no to tons of global variables, sigh. 3477 */ 3478 static struct record record = { 3479 .opts = { 3480 .sample_time = true, 3481 .mmap_pages = UINT_MAX, 3482 .user_freq = UINT_MAX, 3483 .user_interval = ULLONG_MAX, 3484 .freq = 4000, 3485 .target = { 3486 .uses_mmap = true, 3487 .default_per_cpu = true, 3488 }, 3489 .mmap_flush = MMAP_FLUSH_DEFAULT, 3490 .nr_threads_synthesize = 1, 3491 .ctl_fd = -1, 3492 .ctl_fd_ack = -1, 3493 .synth = PERF_SYNTH_ALL, 3494 .off_cpu_thresh_ns = OFFCPU_THRESH, 3495 }, 3496 .buildid_mmap = true, 3497 }; 3498 3499 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP 3500 "\n\t\t\t\tDefault: fp"; 3501 3502 static bool dry_run; 3503 3504 static struct parse_events_option_args parse_events_option_args = { 3505 .evlistp = &record.evlist, 3506 }; 3507 3508 static struct parse_events_option_args switch_output_parse_events_option_args = { 3509 .evlistp = &record.sb_evlist, 3510 }; 3511 3512 /* 3513 * XXX Will stay a global variable till we fix builtin-script.c to stop messing 3514 * with it and switch to use the library functions in perf_evlist that came 3515 * from builtin-record.c, i.e. use record_opts, 3516 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record', 3517 * using pipes, etc. 3518 */ 3519 static struct option __record_options[] = { 3520 OPT_CALLBACK('e', "event", &parse_events_option_args, "event", 3521 "event selector. use 'perf list' to list available events", 3522 parse_events_option), 3523 OPT_CALLBACK(0, "filter", &record.evlist, "filter", 3524 "event filter", parse_filter), 3525 OPT_BOOLEAN(0, "latency", &record.latency, 3526 "Enable data collection for latency profiling.\n" 3527 "\t\t\t Use perf report --latency for latency-centric profile."), 3528 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist, 3529 NULL, "don't record events from perf itself", 3530 exclude_perf), 3531 OPT_STRING('p', "pid", &record.opts.target.pid, "pid", 3532 "record events on existing process id"), 3533 OPT_STRING('t', "tid", &record.opts.target.tid, "tid", 3534 "record events on existing thread id"), 3535 OPT_INTEGER('r', "realtime", &record.realtime_prio, 3536 "collect data with this RT SCHED_FIFO priority"), 3537 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering, 3538 "collect data without buffering"), 3539 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples, 3540 "collect raw sample records from all opened counters"), 3541 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide, 3542 "system-wide collection from all CPUs"), 3543 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu", 3544 "list of cpus to monitor"), 3545 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), 3546 OPT_STRING('o', "output", &record.data.path, "file", 3547 "output file name"), 3548 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, 3549 &record.opts.no_inherit_set, 3550 "child tasks do not inherit counters"), 3551 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize, 3552 "synthesize non-sample events at the end of output"), 3553 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"), 3554 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"), 3555 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq, 3556 "Fail if the specified frequency can't be used"), 3557 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'", 3558 "profile at this frequency", 3559 record__parse_freq), 3560 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]", 3561 "number of mmap data pages and AUX area tracing mmap pages", 3562 record__parse_mmap_pages), 3563 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number", 3564 "Minimal number of bytes that is extracted from mmap data pages (default: 1)", 3565 record__mmap_flush_parse), 3566 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, 3567 NULL, "enables call-graph recording" , 3568 &record_callchain_opt), 3569 OPT_CALLBACK(0, "call-graph", &record.opts, 3570 "record_mode[,record_size]", record_callchain_help, 3571 &record_parse_callchain_opt), 3572 OPT_INCR('v', "verbose", &verbose, 3573 "be more verbose (show counter open errors, etc)"), 3574 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"), 3575 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, 3576 "per thread counts"), 3577 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), 3578 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, 3579 "Record the sample physical addresses"), 3580 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size, 3581 "Record the sampled data address data page size"), 3582 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size, 3583 "Record the sampled code address (ip) page size"), 3584 OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src, 3585 "Record the data source for memory operations"), 3586 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), 3587 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier, 3588 "Record the sample identifier"), 3589 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, 3590 &record.opts.sample_time_set, 3591 "Record the sample timestamps"), 3592 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set, 3593 "Record the sample period"), 3594 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples, 3595 "don't sample"), 3596 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache, 3597 &record.no_buildid_cache_set, 3598 "do not update the buildid cache"), 3599 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid, 3600 &record.no_buildid_set, 3601 "do not collect buildids in perf.data"), 3602 OPT_CALLBACK('G', "cgroup", &record.evlist, "name", 3603 "monitor event in cgroup name only", 3604 parse_cgroups), 3605 OPT_CALLBACK('D', "delay", &record, "ms", 3606 "ms to wait before starting measurement after program start (-1: start with events disabled), " 3607 "or ranges of time to enable events e.g. '-D 10-20,30-40'", 3608 record__parse_event_enable_time), 3609 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"), 3610 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"), 3611 3612 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack, 3613 "branch any", "sample any taken branches", 3614 parse_branch_stack), 3615 3616 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack, 3617 "branch filter mask", "branch stack filter modes", 3618 parse_branch_stack), 3619 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight, 3620 "sample by weight (on special events only)"), 3621 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction, 3622 "sample transaction flags (special events only)"), 3623 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread, 3624 "use per-thread mmaps"), 3625 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register", 3626 "sample selected machine registers on interrupt," 3627 " use '-I?' to list register names", parse_intr_regs), 3628 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register", 3629 "sample selected machine registers in user space," 3630 " use '--user-regs=?' to list register names", parse_user_regs), 3631 OPT_BOOLEAN(0, "running-time", &record.opts.running_time, 3632 "Record running/enabled time of read (:S) events"), 3633 OPT_CALLBACK('k', "clockid", &record.opts, 3634 "clockid", "clockid to use for events, see clock_gettime()", 3635 parse_clockid), 3636 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts, 3637 "opts", "AUX area tracing Snapshot Mode", ""), 3638 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts, 3639 "opts", "sample AUX area", ""), 3640 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout, 3641 "per thread proc mmap processing timeout in ms"), 3642 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces, 3643 "Record namespaces events"), 3644 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup, 3645 "Record cgroup events"), 3646 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events, 3647 &record.opts.record_switch_events_set, 3648 "Record context switch events"), 3649 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel, 3650 "Configure all used events to run in kernel space.", 3651 PARSE_OPT_EXCLUSIVE), 3652 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user, 3653 "Configure all used events to run in user space.", 3654 PARSE_OPT_EXCLUSIVE), 3655 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains, 3656 "collect kernel callchains"), 3657 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains, 3658 "collect user callchains"), 3659 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name, 3660 "file", "vmlinux pathname"), 3661 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all, 3662 "Record build-id of all DSOs regardless of hits"), 3663 OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set, 3664 "Record build-id in mmap events and skip build-id processing."), 3665 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, 3666 "append timestamp to output filename"), 3667 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary, 3668 "Record timestamp boundary (time of first/last samples)"), 3669 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str, 3670 &record.switch_output.set, "signal or size[BKMG] or time[smhd]", 3671 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", 3672 "signal"), 3673 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args, 3674 &record.switch_output_event_set, "switch output event", 3675 "switch output event selector. use 'perf list' to list available events", 3676 parse_events_option_new_evlist), 3677 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, 3678 "Limit number of switch output generated files"), 3679 OPT_BOOLEAN(0, "dry-run", &dry_run, 3680 "Parse options then exit"), 3681 #ifdef HAVE_AIO_SUPPORT 3682 OPT_CALLBACK_OPTARG(0, "aio", &record.opts, 3683 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)", 3684 record__aio_parse), 3685 #endif 3686 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu", 3687 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer", 3688 record__parse_affinity), 3689 #ifdef HAVE_ZSTD_SUPPORT 3690 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n", 3691 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)", 3692 record__parse_comp_level), 3693 #endif 3694 OPT_CALLBACK(0, "max-size", &record.output_max_size, 3695 "size", "Limit the maximum size of the output file", parse_output_max_size), 3696 OPT_UINTEGER(0, "num-thread-synthesize", 3697 &record.opts.nr_threads_synthesize, 3698 "number of threads to run for event synthesis"), 3699 #ifdef HAVE_LIBPFM 3700 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event", 3701 "libpfm4 event selector. use 'perf list' to list available events", 3702 parse_libpfm_events_option), 3703 #endif 3704 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]", 3705 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n" 3706 "\t\t\t 'snapshot': AUX area tracing snapshot).\n" 3707 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" 3708 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", 3709 parse_control_option), 3710 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup", 3711 "Fine-tune event synthesis: default=all", parse_record_synth_option), 3712 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls, 3713 &record.debuginfod.set, "debuginfod urls", 3714 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls", 3715 "system"), 3716 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec", 3717 "write collected trace data into several data files using parallel threads", 3718 record__parse_threads), 3719 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"), 3720 OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin", 3721 "BPF filter action"), 3722 OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms", 3723 "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)", 3724 record__parse_off_cpu_thresh), 3725 OPT_END() 3726 }; 3727 3728 struct option *record_options = __record_options; 3729 3730 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus) 3731 { 3732 struct perf_cpu cpu; 3733 int idx; 3734 3735 if (cpu_map__is_dummy(cpus)) 3736 return 0; 3737 3738 perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) { 3739 /* Return ENODEV is input cpu is greater than max cpu */ 3740 if ((unsigned long)cpu.cpu > mask->nbits) 3741 return -ENODEV; 3742 __set_bit(cpu.cpu, mask->bits); 3743 } 3744 3745 return 0; 3746 } 3747 3748 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec) 3749 { 3750 struct perf_cpu_map *cpus; 3751 3752 cpus = perf_cpu_map__new(mask_spec); 3753 if (!cpus) 3754 return -ENOMEM; 3755 3756 bitmap_zero(mask->bits, mask->nbits); 3757 if (record__mmap_cpu_mask_init(mask, cpus)) 3758 return -ENODEV; 3759 3760 perf_cpu_map__put(cpus); 3761 3762 return 0; 3763 } 3764 3765 static void record__free_thread_masks(struct record *rec, int nr_threads) 3766 { 3767 int t; 3768 3769 if (rec->thread_masks) 3770 for (t = 0; t < nr_threads; t++) 3771 record__thread_mask_free(&rec->thread_masks[t]); 3772 3773 zfree(&rec->thread_masks); 3774 } 3775 3776 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits) 3777 { 3778 int t, ret; 3779 3780 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks))); 3781 if (!rec->thread_masks) { 3782 pr_err("Failed to allocate thread masks\n"); 3783 return -ENOMEM; 3784 } 3785 3786 for (t = 0; t < nr_threads; t++) { 3787 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits); 3788 if (ret) { 3789 pr_err("Failed to allocate thread masks[%d]\n", t); 3790 goto out_free; 3791 } 3792 } 3793 3794 return 0; 3795 3796 out_free: 3797 record__free_thread_masks(rec, nr_threads); 3798 3799 return ret; 3800 } 3801 3802 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus) 3803 { 3804 int t, ret, nr_cpus = perf_cpu_map__nr(cpus); 3805 3806 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu); 3807 if (ret) 3808 return ret; 3809 3810 rec->nr_threads = nr_cpus; 3811 pr_debug("nr_threads: %d\n", rec->nr_threads); 3812 3813 for (t = 0; t < rec->nr_threads; t++) { 3814 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits); 3815 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits); 3816 if (verbose > 0) { 3817 pr_debug("thread_masks[%d]: ", t); 3818 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps"); 3819 pr_debug("thread_masks[%d]: ", t); 3820 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity"); 3821 } 3822 } 3823 3824 return 0; 3825 } 3826 3827 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus, 3828 const char **maps_spec, const char **affinity_spec, 3829 u32 nr_spec) 3830 { 3831 u32 s; 3832 int ret = 0, t = 0; 3833 struct mmap_cpu_mask cpus_mask; 3834 struct thread_mask thread_mask, full_mask, *thread_masks; 3835 3836 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu); 3837 if (ret) { 3838 pr_err("Failed to allocate CPUs mask\n"); 3839 return ret; 3840 } 3841 3842 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus); 3843 if (ret) { 3844 pr_err("Failed to init cpu mask\n"); 3845 goto out_free_cpu_mask; 3846 } 3847 3848 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu); 3849 if (ret) { 3850 pr_err("Failed to allocate full mask\n"); 3851 goto out_free_cpu_mask; 3852 } 3853 3854 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu); 3855 if (ret) { 3856 pr_err("Failed to allocate thread mask\n"); 3857 goto out_free_full_and_cpu_masks; 3858 } 3859 3860 for (s = 0; s < nr_spec; s++) { 3861 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]); 3862 if (ret) { 3863 pr_err("Failed to initialize maps thread mask\n"); 3864 goto out_free; 3865 } 3866 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]); 3867 if (ret) { 3868 pr_err("Failed to initialize affinity thread mask\n"); 3869 goto out_free; 3870 } 3871 3872 /* ignore invalid CPUs but do not allow empty masks */ 3873 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits, 3874 cpus_mask.bits, thread_mask.maps.nbits)) { 3875 pr_err("Empty maps mask: %s\n", maps_spec[s]); 3876 ret = -EINVAL; 3877 goto out_free; 3878 } 3879 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits, 3880 cpus_mask.bits, thread_mask.affinity.nbits)) { 3881 pr_err("Empty affinity mask: %s\n", affinity_spec[s]); 3882 ret = -EINVAL; 3883 goto out_free; 3884 } 3885 3886 /* do not allow intersection with other masks (full_mask) */ 3887 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits, 3888 thread_mask.maps.nbits)) { 3889 pr_err("Intersecting maps mask: %s\n", maps_spec[s]); 3890 ret = -EINVAL; 3891 goto out_free; 3892 } 3893 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits, 3894 thread_mask.affinity.nbits)) { 3895 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]); 3896 ret = -EINVAL; 3897 goto out_free; 3898 } 3899 3900 bitmap_or(full_mask.maps.bits, full_mask.maps.bits, 3901 thread_mask.maps.bits, full_mask.maps.nbits); 3902 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits, 3903 thread_mask.affinity.bits, full_mask.maps.nbits); 3904 3905 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask)); 3906 if (!thread_masks) { 3907 pr_err("Failed to reallocate thread masks\n"); 3908 ret = -ENOMEM; 3909 goto out_free; 3910 } 3911 rec->thread_masks = thread_masks; 3912 rec->thread_masks[t] = thread_mask; 3913 if (verbose > 0) { 3914 pr_debug("thread_masks[%d]: ", t); 3915 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps"); 3916 pr_debug("thread_masks[%d]: ", t); 3917 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity"); 3918 } 3919 t++; 3920 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu); 3921 if (ret) { 3922 pr_err("Failed to allocate thread mask\n"); 3923 goto out_free_full_and_cpu_masks; 3924 } 3925 } 3926 rec->nr_threads = t; 3927 pr_debug("nr_threads: %d\n", rec->nr_threads); 3928 if (!rec->nr_threads) 3929 ret = -EINVAL; 3930 3931 out_free: 3932 record__thread_mask_free(&thread_mask); 3933 out_free_full_and_cpu_masks: 3934 record__thread_mask_free(&full_mask); 3935 out_free_cpu_mask: 3936 record__mmap_cpu_mask_free(&cpus_mask); 3937 3938 return ret; 3939 } 3940 3941 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus) 3942 { 3943 int ret; 3944 struct cpu_topology *topo; 3945 3946 topo = cpu_topology__new(); 3947 if (!topo) { 3948 pr_err("Failed to allocate CPU topology\n"); 3949 return -ENOMEM; 3950 } 3951 3952 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list, 3953 topo->core_cpus_list, topo->core_cpus_lists); 3954 cpu_topology__delete(topo); 3955 3956 return ret; 3957 } 3958 3959 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus) 3960 { 3961 int ret; 3962 struct cpu_topology *topo; 3963 3964 topo = cpu_topology__new(); 3965 if (!topo) { 3966 pr_err("Failed to allocate CPU topology\n"); 3967 return -ENOMEM; 3968 } 3969 3970 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list, 3971 topo->package_cpus_list, topo->package_cpus_lists); 3972 cpu_topology__delete(topo); 3973 3974 return ret; 3975 } 3976 3977 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus) 3978 { 3979 u32 s; 3980 int ret; 3981 const char **spec; 3982 struct numa_topology *topo; 3983 3984 topo = numa_topology__new(); 3985 if (!topo) { 3986 pr_err("Failed to allocate NUMA topology\n"); 3987 return -ENOMEM; 3988 } 3989 3990 spec = zalloc(topo->nr * sizeof(char *)); 3991 if (!spec) { 3992 pr_err("Failed to allocate NUMA spec\n"); 3993 ret = -ENOMEM; 3994 goto out_delete_topo; 3995 } 3996 for (s = 0; s < topo->nr; s++) 3997 spec[s] = topo->nodes[s].cpus; 3998 3999 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr); 4000 4001 zfree(&spec); 4002 4003 out_delete_topo: 4004 numa_topology__delete(topo); 4005 4006 return ret; 4007 } 4008 4009 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus) 4010 { 4011 int t, ret; 4012 u32 s, nr_spec = 0; 4013 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec; 4014 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL; 4015 4016 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) { 4017 spec = strtok_r(user_spec, ":", &spec_ptr); 4018 if (spec == NULL) 4019 break; 4020 pr_debug2("threads_spec[%d]: %s\n", t, spec); 4021 mask = strtok_r(spec, "/", &mask_ptr); 4022 if (mask == NULL) 4023 break; 4024 pr_debug2(" maps mask: %s\n", mask); 4025 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *)); 4026 if (!tmp_spec) { 4027 pr_err("Failed to reallocate maps spec\n"); 4028 ret = -ENOMEM; 4029 goto out_free; 4030 } 4031 maps_spec = tmp_spec; 4032 maps_spec[nr_spec] = dup_mask = strdup(mask); 4033 if (!maps_spec[nr_spec]) { 4034 pr_err("Failed to allocate maps spec[%d]\n", nr_spec); 4035 ret = -ENOMEM; 4036 goto out_free; 4037 } 4038 mask = strtok_r(NULL, "/", &mask_ptr); 4039 if (mask == NULL) { 4040 pr_err("Invalid thread maps or affinity specs\n"); 4041 ret = -EINVAL; 4042 goto out_free; 4043 } 4044 pr_debug2(" affinity mask: %s\n", mask); 4045 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *)); 4046 if (!tmp_spec) { 4047 pr_err("Failed to reallocate affinity spec\n"); 4048 ret = -ENOMEM; 4049 goto out_free; 4050 } 4051 affinity_spec = tmp_spec; 4052 affinity_spec[nr_spec] = strdup(mask); 4053 if (!affinity_spec[nr_spec]) { 4054 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec); 4055 ret = -ENOMEM; 4056 goto out_free; 4057 } 4058 dup_mask = NULL; 4059 nr_spec++; 4060 } 4061 4062 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec, 4063 (const char **)affinity_spec, nr_spec); 4064 4065 out_free: 4066 free(dup_mask); 4067 for (s = 0; s < nr_spec; s++) { 4068 if (maps_spec) 4069 free(maps_spec[s]); 4070 if (affinity_spec) 4071 free(affinity_spec[s]); 4072 } 4073 free(affinity_spec); 4074 free(maps_spec); 4075 4076 return ret; 4077 } 4078 4079 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus) 4080 { 4081 int ret; 4082 4083 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu); 4084 if (ret) 4085 return ret; 4086 4087 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus)) 4088 return -ENODEV; 4089 4090 rec->nr_threads = 1; 4091 4092 return 0; 4093 } 4094 4095 static int record__init_thread_masks(struct record *rec) 4096 { 4097 int ret = 0; 4098 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus; 4099 4100 if (!record__threads_enabled(rec)) 4101 return record__init_thread_default_masks(rec, cpus); 4102 4103 if (evlist__per_thread(rec->evlist)) { 4104 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n"); 4105 return -EINVAL; 4106 } 4107 4108 switch (rec->opts.threads_spec) { 4109 case THREAD_SPEC__CPU: 4110 ret = record__init_thread_cpu_masks(rec, cpus); 4111 break; 4112 case THREAD_SPEC__CORE: 4113 ret = record__init_thread_core_masks(rec, cpus); 4114 break; 4115 case THREAD_SPEC__PACKAGE: 4116 ret = record__init_thread_package_masks(rec, cpus); 4117 break; 4118 case THREAD_SPEC__NUMA: 4119 ret = record__init_thread_numa_masks(rec, cpus); 4120 break; 4121 case THREAD_SPEC__USER: 4122 ret = record__init_thread_user_masks(rec, cpus); 4123 break; 4124 default: 4125 break; 4126 } 4127 4128 return ret; 4129 } 4130 4131 int cmd_record(int argc, const char **argv) 4132 { 4133 int err; 4134 struct record *rec = &record; 4135 char errbuf[BUFSIZ]; 4136 4137 setlocale(LC_ALL, ""); 4138 4139 #ifndef HAVE_BPF_SKEL 4140 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c) 4141 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true); 4142 # undef set_nobuild 4143 #endif 4144 4145 /* Disable eager loading of kernel symbols that adds overhead to perf record. */ 4146 symbol_conf.lazy_load_kernel_maps = true; 4147 rec->opts.affinity = PERF_AFFINITY_SYS; 4148 4149 rec->evlist = evlist__new(); 4150 if (rec->evlist == NULL) 4151 return -ENOMEM; 4152 4153 err = perf_config(perf_record_config, rec); 4154 if (err) 4155 return err; 4156 4157 argc = parse_options(argc, argv, record_options, record_usage, 4158 PARSE_OPT_STOP_AT_NON_OPTION); 4159 if (quiet) 4160 perf_quiet_option(); 4161 4162 err = symbol__validate_sym_arguments(); 4163 if (err) 4164 return err; 4165 4166 perf_debuginfod_setup(&record.debuginfod); 4167 4168 /* Make system wide (-a) the default target. */ 4169 if (!argc && target__none(&rec->opts.target)) 4170 rec->opts.target.system_wide = true; 4171 4172 if (nr_cgroups && !rec->opts.target.system_wide) { 4173 usage_with_options_msg(record_usage, record_options, 4174 "cgroup monitoring only available in system-wide mode"); 4175 4176 } 4177 4178 if (record.latency) { 4179 /* 4180 * There is no fundamental reason why latency profiling 4181 * can't work for system-wide mode, but exact semantics 4182 * and details are to be defined. 4183 * See the following thread for details: 4184 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/ 4185 */ 4186 if (record.opts.target.system_wide) { 4187 pr_err("Failed: latency profiling is not supported with system-wide collection.\n"); 4188 err = -EINVAL; 4189 goto out_opts; 4190 } 4191 record.opts.record_switch_events = true; 4192 } 4193 4194 if (rec->buildid_mmap && !perf_can_record_build_id()) { 4195 pr_warning("Missing support for build id in kernel mmap events.\n" 4196 "Disable this warning with --no-buildid-mmap\n"); 4197 rec->buildid_mmap = false; 4198 } 4199 4200 if (rec->buildid_mmap) { 4201 /* Enable perf_event_attr::build_id bit. */ 4202 rec->opts.build_id = true; 4203 /* Disable build-ID table in the header. */ 4204 rec->no_buildid = true; 4205 } else { 4206 pr_debug("Disabling build id in synthesized mmap2 events.\n"); 4207 symbol_conf.no_buildid_mmap2 = true; 4208 } 4209 4210 if (rec->no_buildid_set && rec->no_buildid) { 4211 /* -B implies -N for historic reasons. */ 4212 rec->no_buildid_cache = true; 4213 } 4214 4215 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) { 4216 pr_err("Kernel has no cgroup sampling support.\n"); 4217 err = -EINVAL; 4218 goto out_opts; 4219 } 4220 4221 if (rec->opts.kcore) 4222 rec->opts.text_poke = true; 4223 4224 if (rec->opts.kcore || record__threads_enabled(rec)) 4225 rec->data.is_dir = true; 4226 4227 if (record__threads_enabled(rec)) { 4228 if (rec->opts.affinity != PERF_AFFINITY_SYS) { 4229 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n"); 4230 goto out_opts; 4231 } 4232 if (record__aio_enabled(rec)) { 4233 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n"); 4234 goto out_opts; 4235 } 4236 } 4237 4238 if (rec->opts.comp_level != 0) { 4239 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n"); 4240 rec->no_buildid = true; 4241 } 4242 4243 if (rec->opts.record_switch_events && 4244 !perf_can_record_switch_events()) { 4245 ui__error("kernel does not support recording context switch events\n"); 4246 parse_options_usage(record_usage, record_options, "switch-events", 0); 4247 err = -EINVAL; 4248 goto out_opts; 4249 } 4250 4251 if (switch_output_setup(rec)) { 4252 parse_options_usage(record_usage, record_options, "switch-output", 0); 4253 err = -EINVAL; 4254 goto out_opts; 4255 } 4256 4257 if (rec->switch_output.time) { 4258 signal(SIGALRM, alarm_sig_handler); 4259 alarm(rec->switch_output.time); 4260 } 4261 4262 if (rec->switch_output.num_files) { 4263 rec->switch_output.filenames = calloc(rec->switch_output.num_files, 4264 sizeof(char *)); 4265 if (!rec->switch_output.filenames) { 4266 err = -EINVAL; 4267 goto out_opts; 4268 } 4269 } 4270 4271 if (rec->timestamp_filename && record__threads_enabled(rec)) { 4272 rec->timestamp_filename = false; 4273 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n"); 4274 } 4275 4276 if (rec->filter_action) { 4277 if (!strcmp(rec->filter_action, "pin")) 4278 err = perf_bpf_filter__pin(); 4279 else if (!strcmp(rec->filter_action, "unpin")) 4280 err = perf_bpf_filter__unpin(); 4281 else { 4282 pr_warning("Unknown BPF filter action: %s\n", rec->filter_action); 4283 err = -EINVAL; 4284 } 4285 goto out_opts; 4286 } 4287 4288 /* For backward compatibility, -d implies --mem-info */ 4289 if (rec->opts.sample_address) 4290 rec->opts.sample_data_src = true; 4291 4292 /* 4293 * Allow aliases to facilitate the lookup of symbols for address 4294 * filters. Refer to auxtrace_parse_filters(). 4295 */ 4296 symbol_conf.allow_aliases = true; 4297 4298 symbol__init(NULL); 4299 4300 err = record__auxtrace_init(rec); 4301 if (err) 4302 goto out; 4303 4304 if (dry_run) 4305 goto out; 4306 4307 err = -ENOMEM; 4308 4309 if (rec->no_buildid_cache) { 4310 disable_buildid_cache(); 4311 } else if (rec->switch_output.enabled) { 4312 /* 4313 * In 'perf record --switch-output', disable buildid 4314 * generation by default to reduce data file switching 4315 * overhead. Still generate buildid if they are required 4316 * explicitly using 4317 * 4318 * perf record --switch-output --no-no-buildid \ 4319 * --no-no-buildid-cache 4320 * 4321 * Following code equals to: 4322 * 4323 * if ((rec->no_buildid || !rec->no_buildid_set) && 4324 * (rec->no_buildid_cache || !rec->no_buildid_cache_set)) 4325 * disable_buildid_cache(); 4326 */ 4327 bool disable = true; 4328 4329 if (rec->no_buildid_set && !rec->no_buildid) 4330 disable = false; 4331 if (rec->no_buildid_cache_set && !rec->no_buildid_cache) 4332 disable = false; 4333 if (disable) { 4334 rec->no_buildid = true; 4335 rec->no_buildid_cache = true; 4336 disable_buildid_cache(); 4337 } 4338 } 4339 4340 if (record.opts.overwrite) 4341 record.opts.tail_synthesize = true; 4342 4343 if (rec->evlist->core.nr_entries == 0) { 4344 struct evlist *def_evlist = evlist__new_default(); 4345 4346 if (!def_evlist) 4347 goto out; 4348 4349 evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries); 4350 evlist__delete(def_evlist); 4351 } 4352 4353 if (rec->opts.target.tid && !rec->opts.no_inherit_set) 4354 rec->opts.no_inherit = true; 4355 4356 err = target__validate(&rec->opts.target); 4357 if (err) { 4358 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ); 4359 ui__warning("%s\n", errbuf); 4360 } 4361 4362 if (rec->uid_str) { 4363 uid_t uid = parse_uid(rec->uid_str); 4364 4365 if (uid == UINT_MAX) { 4366 ui__error("Invalid User: %s", rec->uid_str); 4367 err = -EINVAL; 4368 goto out; 4369 } 4370 err = parse_uid_filter(rec->evlist, uid); 4371 if (err) 4372 goto out; 4373 4374 /* User ID filtering implies system wide. */ 4375 rec->opts.target.system_wide = true; 4376 } 4377 4378 /* Enable ignoring missing threads when -p option is defined. */ 4379 rec->opts.ignore_missing_thread = rec->opts.target.pid; 4380 4381 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list); 4382 4383 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP) 4384 arch__add_leaf_frame_record_opts(&rec->opts); 4385 4386 err = -ENOMEM; 4387 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) { 4388 if (rec->opts.target.pid != NULL) { 4389 pr_err("Couldn't create thread/CPU maps: %s\n", 4390 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf))); 4391 goto out; 4392 } 4393 else 4394 usage_with_options(record_usage, record_options); 4395 } 4396 4397 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts); 4398 if (err) 4399 goto out; 4400 4401 /* 4402 * We take all buildids when the file contains 4403 * AUX area tracing data because we do not decode the 4404 * trace because it would take too long. 4405 */ 4406 if (rec->opts.full_auxtrace) 4407 rec->buildid_all = true; 4408 4409 if (rec->opts.text_poke) { 4410 err = record__config_text_poke(rec->evlist); 4411 if (err) { 4412 pr_err("record__config_text_poke failed, error %d\n", err); 4413 goto out; 4414 } 4415 } 4416 4417 if (rec->off_cpu) { 4418 err = record__config_off_cpu(rec); 4419 if (err) { 4420 pr_err("record__config_off_cpu failed, error %d\n", err); 4421 goto out; 4422 } 4423 } 4424 4425 if (record_opts__config(&rec->opts)) { 4426 err = -EINVAL; 4427 goto out; 4428 } 4429 4430 err = record__config_tracking_events(rec); 4431 if (err) { 4432 pr_err("record__config_tracking_events failed, error %d\n", err); 4433 goto out; 4434 } 4435 4436 err = record__init_thread_masks(rec); 4437 if (err) { 4438 pr_err("Failed to initialize parallel data streaming masks\n"); 4439 goto out; 4440 } 4441 4442 if (rec->opts.nr_cblocks > nr_cblocks_max) 4443 rec->opts.nr_cblocks = nr_cblocks_max; 4444 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks); 4445 4446 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]); 4447 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush); 4448 4449 if (rec->opts.comp_level > comp_level_max) 4450 rec->opts.comp_level = comp_level_max; 4451 pr_debug("comp level: %d\n", rec->opts.comp_level); 4452 4453 err = __cmd_record(&record, argc, argv); 4454 out: 4455 record__free_thread_masks(rec, rec->nr_threads); 4456 rec->nr_threads = 0; 4457 symbol__exit(); 4458 auxtrace_record__free(rec->itr); 4459 out_opts: 4460 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close); 4461 evlist__delete(rec->evlist); 4462 return err; 4463 } 4464 4465 static void snapshot_sig_handler(int sig __maybe_unused) 4466 { 4467 struct record *rec = &record; 4468 4469 hit_auxtrace_snapshot_trigger(rec); 4470 4471 if (switch_output_signal(rec)) 4472 trigger_hit(&switch_output_trigger); 4473 } 4474 4475 static void alarm_sig_handler(int sig __maybe_unused) 4476 { 4477 struct record *rec = &record; 4478 4479 if (switch_output_time(rec)) 4480 trigger_hit(&switch_output_trigger); 4481 } 4482