1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85
86 struct switch_output {
87 bool enabled;
88 bool signal;
89 unsigned long size;
90 unsigned long time;
91 const char *str;
92 bool set;
93 char **filenames;
94 int num_files;
95 int cur_file;
96 };
97
98 struct thread_mask {
99 struct mmap_cpu_mask maps;
100 struct mmap_cpu_mask affinity;
101 };
102
103 struct record_thread {
104 pid_t tid;
105 struct thread_mask *mask;
106 struct {
107 int msg[2];
108 int ack[2];
109 } pipes;
110 struct fdarray pollfd;
111 int ctlfd_pos;
112 int nr_mmaps;
113 struct mmap **maps;
114 struct mmap **overwrite_maps;
115 struct record *rec;
116 unsigned long long samples;
117 unsigned long waking;
118 u64 bytes_written;
119 u64 bytes_transferred;
120 u64 bytes_compressed;
121 };
122
123 static __thread struct record_thread *thread;
124
125 enum thread_msg {
126 THREAD_MSG__UNDEFINED = 0,
127 THREAD_MSG__READY,
128 THREAD_MSG__MAX,
129 };
130
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 "UNDEFINED", "READY"
133 };
134
135 enum thread_spec {
136 THREAD_SPEC__UNDEFINED = 0,
137 THREAD_SPEC__CPU,
138 THREAD_SPEC__CORE,
139 THREAD_SPEC__PACKAGE,
140 THREAD_SPEC__NUMA,
141 THREAD_SPEC__USER,
142 THREAD_SPEC__MAX,
143 };
144
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 "undefined", "cpu", "core", "package", "numa", "user"
147 };
148
149 struct pollfd_index_map {
150 int evlist_pollfd_index;
151 int thread_pollfd_index;
152 };
153
154 struct record {
155 struct perf_tool tool;
156 struct record_opts opts;
157 u64 bytes_written;
158 u64 thread_bytes_written;
159 struct perf_data data;
160 struct auxtrace_record *itr;
161 struct evlist *evlist;
162 struct perf_session *session;
163 struct evlist *sb_evlist;
164 pthread_t thread_id;
165 int realtime_prio;
166 bool latency;
167 bool switch_output_event_set;
168 bool no_buildid;
169 bool no_buildid_set;
170 bool no_buildid_cache;
171 bool no_buildid_cache_set;
172 bool buildid_all;
173 bool buildid_mmap;
174 bool buildid_mmap_set;
175 bool timestamp_filename;
176 bool timestamp_boundary;
177 bool off_cpu;
178 const char *filter_action;
179 const char *uid_str;
180 struct switch_output switch_output;
181 unsigned long long samples;
182 unsigned long output_max_size; /* = 0: unlimited */
183 struct perf_debuginfod debuginfod;
184 int nr_threads;
185 struct thread_mask *thread_masks;
186 struct record_thread *thread_data;
187 struct pollfd_index_map *index_map;
188 size_t index_map_sz;
189 size_t index_map_cnt;
190 };
191
192 static volatile int done;
193
194 static volatile int auxtrace_record__snapshot_started;
195 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
196 static DEFINE_TRIGGER(switch_output_trigger);
197
198 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
199 "SYS", "NODE", "CPU"
200 };
201
202 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
203 struct perf_sample *sample, struct machine *machine);
204 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
205 struct perf_sample *sample, struct machine *machine);
206 static int process_timestamp_boundary(const struct perf_tool *tool,
207 union perf_event *event,
208 struct perf_sample *sample,
209 struct machine *machine);
210
211 #ifndef HAVE_GETTID
212 static inline pid_t gettid(void)
213 {
214 return (pid_t)syscall(__NR_gettid);
215 }
216 #endif
217
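/* Non-zero when parallel trace streaming (--threads) was requested. */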
218 static int record__threads_enabled(struct record *rec)
219 {
220 return rec->opts.threads_spec;
221 }
222
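/* Signal-driven --switch-output is enabled and its trigger is armed. */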
223 static bool switch_output_signal(struct record *rec)
224 {
225 return rec->switch_output.signal &&
226 trigger_is_ready(&switch_output_trigger);
227 }
228
229 static bool switch_output_size(struct record *rec)
230 {
231 return rec->switch_output.size &&
232 trigger_is_ready(&switch_output_trigger) &&
233 (rec->bytes_written >= rec->switch_output.size);
234 }
235
236 static bool switch_output_time(struct record *rec)
237 {
238 return rec->switch_output.time &&
239 trigger_is_ready(&switch_output_trigger);
240 }
241
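/* Bytes written by the main thread plus all worker threads so far. */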
242 static u64 record__bytes_written(struct record *rec)
243 {
244 return rec->bytes_written + rec->thread_bytes_written;
245 }
246
247 static bool record__output_max_size_exceeded(struct record *rec)
248 {
249 return rec->output_max_size &&
250 (record__bytes_written(rec) >= rec->output_max_size);
251 }
252
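/*
 * Write a block of trace data, either to the session's single output file or
 * to the map's per-thread file, and account the written bytes. The session is
 * stopped once the --max-size limit is exceeded.
 */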
253 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
254 void *bf, size_t size)
255 {
256 struct perf_data_file *file = &rec->session->data->file;
257
258 if (map && map->file)
259 file = map->file;
260
261 if (perf_data_file__write(file, bf, size) < 0) {
262 pr_err("failed to write perf data, error: %m\n");
263 return -1;
264 }
265
266 if (map && map->file) {
267 thread->bytes_written += size;
268 rec->thread_bytes_written += size;
269 } else {
270 rec->bytes_written += size;
271 }
272
273 if (record__output_max_size_exceeded(rec) && !done) {
274 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
275 " stopping session ]\n",
276 record__bytes_written(rec) >> 10);
277 done = 1;
278 }
279
280 if (switch_output_size(rec))
281 trigger_hit(&switch_output_trigger);
282
283 return 0;
284 }
285
286 static int record__aio_enabled(struct record *rec);
287 static int record__comp_enabled(struct record *rec);
288 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
289 void *dst, size_t dst_size, void *src, size_t src_size);
290
291 #ifdef HAVE_AIO_SUPPORT
292 static int record__aio_write(struct aiocb *cblock, int trace_fd,
293 void *buf, size_t size, off_t off)
294 {
295 int rc;
296
297 cblock->aio_fildes = trace_fd;
298 cblock->aio_buf = buf;
299 cblock->aio_nbytes = size;
300 cblock->aio_offset = off;
301 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
302
303 do {
304 rc = aio_write(cblock);
305 if (rc == 0) {
306 break;
307 } else if (errno != EAGAIN) {
308 cblock->aio_fildes = -1;
309 pr_err("failed to queue perf data, error: %m\n");
310 break;
311 }
312 } while (1);
313
314 return rc;
315 }
316
317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
318 {
319 void *rem_buf;
320 off_t rem_off;
321 size_t rem_size;
322 int rc, aio_errno;
323 ssize_t aio_ret, written;
324
325 aio_errno = aio_error(cblock);
326 if (aio_errno == EINPROGRESS)
327 return 0;
328
329 written = aio_ret = aio_return(cblock);
330 if (aio_ret < 0) {
331 if (aio_errno != EINTR)
332 pr_err("failed to write perf data, error: %m\n");
333 written = 0;
334 }
335
336 rem_size = cblock->aio_nbytes - written;
337
338 if (rem_size == 0) {
339 cblock->aio_fildes = -1;
340 /*
341 * md->refcount is incremented in record__aio_pushfn() for
342 * every aio write request started in record__aio_push() so
343 * decrement it because the request is now complete.
344 */
345 perf_mmap__put(&md->core);
346 rc = 1;
347 } else {
348 /*
349 * The aio write request may need to be restarted with the
350 * remainder if the kernel didn't write the whole
351 * chunk at once.
352 */
353 rem_off = cblock->aio_offset + written;
354 rem_buf = (void *)(cblock->aio_buf + written);
355 record__aio_write(cblock, cblock->aio_fildes,
356 rem_buf, rem_size, rem_off);
357 rc = 0;
358 }
359
360 return rc;
361 }
362
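/*
 * Reap completed aio write requests. With sync_all, wait until every
 * outstanding request has finished; otherwise return the index of the first
 * free control block so it can be reused for the next write.
 */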
363 static int record__aio_sync(struct mmap *md, bool sync_all)
364 {
365 struct aiocb **aiocb = md->aio.aiocb;
366 struct aiocb *cblocks = md->aio.cblocks;
367 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
368 int i, do_suspend;
369
370 do {
371 do_suspend = 0;
372 for (i = 0; i < md->aio.nr_cblocks; ++i) {
373 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
374 if (sync_all)
375 aiocb[i] = NULL;
376 else
377 return i;
378 } else {
379 /*
380 * The started aio write is not complete yet,
381 * so it has to be waited for before the
382 * next allocation.
383 */
384 aiocb[i] = &cblocks[i];
385 do_suspend = 1;
386 }
387 }
388 if (!do_suspend)
389 return -1;
390
391 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
392 if (!(errno == EAGAIN || errno == EINTR))
393 pr_err("failed to sync perf data, error: %m\n");
394 }
395 } while (1);
396 }
397
398 struct record_aio {
399 struct record *rec;
400 void *data;
401 size_t size;
402 };
403
404 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
405 {
406 struct record_aio *aio = to;
407
408 /*
409 * map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
410 * to release space in the kernel buffer as fast as possible, calling
411 * perf_mmap__consume() from perf_mmap__push() function.
412 *
413 * That lets the kernel proceed with storing more profiling data into
414 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
415 *
416 * Copying can be done in two steps in case the chunk of profiling data
417 * crosses the upper bound of the kernel buffer. In this case we first move
418 * part of the data from map->start to the upper bound and then the remainder
419 * from the beginning of the kernel buffer to the end of the data chunk.
420 */
421
422 if (record__comp_enabled(aio->rec)) {
423 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
424 mmap__mmap_len(map) - aio->size,
425 buf, size);
426 if (compressed < 0)
427 return (int)compressed;
428
429 size = compressed;
430 } else {
431 memcpy(aio->data + aio->size, buf, size);
432 }
433
434 if (!aio->size) {
435 /*
436 * Increment map->refcount to guard map->aio.data[] buffer
437 * from premature deallocation because the map object can be
438 * released before the aio write request started on the
439 * map->aio.data[] buffer is complete.
440 *
441 * perf_mmap__put() is done at record__aio_complete()
442 * after started aio request completion or at record__aio_push()
443 * if the request failed to start.
444 */
445 perf_mmap__get(&map->core);
446 }
447
448 aio->size += size;
449
450 return size;
451 }
452
453 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
454 {
455 int ret, idx;
456 int trace_fd = rec->session->data->file.fd;
457 struct record_aio aio = { .rec = rec, .size = 0 };
458
459 /*
460 * Call record__aio_sync() to wait until a map->aio.data[] buffer
461 * becomes available after the previous aio write operation completes.
462 */
463
464 idx = record__aio_sync(map, false);
465 aio.data = map->aio.data[idx];
466 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
467 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
468 return ret;
469
470 rec->samples++;
471 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
472 if (!ret) {
473 *off += aio.size;
474 rec->bytes_written += aio.size;
475 if (switch_output_size(rec))
476 trigger_hit(&switch_output_trigger);
477 } else {
478 /*
479 * Decrement the map->refcount that was incremented in record__aio_pushfn()
480 * if the record__aio_write() operation failed to start; otherwise
481 * map->refcount is decremented in record__aio_complete() after the
482 * aio write operation finishes successfully.
483 */
484 perf_mmap__put(&map->core);
485 }
486
487 return ret;
488 }
489
490 static off_t record__aio_get_pos(int trace_fd)
491 {
492 return lseek(trace_fd, 0, SEEK_CUR);
493 }
494
495 static void record__aio_set_pos(int trace_fd, off_t pos)
496 {
497 lseek(trace_fd, pos, SEEK_SET);
498 }
499
500 static void record__aio_mmap_read_sync(struct record *rec)
501 {
502 int i;
503 struct evlist *evlist = rec->evlist;
504 struct mmap *maps = evlist->mmap;
505
506 if (!record__aio_enabled(rec))
507 return;
508
509 for (i = 0; i < evlist->core.nr_mmaps; i++) {
510 struct mmap *map = &maps[i];
511
512 if (map->core.base)
513 record__aio_sync(map, true);
514 }
515 }
516
517 static int nr_cblocks_default = 1;
518 static int nr_cblocks_max = 4;
519
520 static int record__aio_parse(const struct option *opt,
521 const char *str,
522 int unset)
523 {
524 struct record_opts *opts = (struct record_opts *)opt->value;
525
526 if (unset) {
527 opts->nr_cblocks = 0;
528 } else {
529 if (str)
530 opts->nr_cblocks = strtol(str, NULL, 0);
531 if (!opts->nr_cblocks)
532 opts->nr_cblocks = nr_cblocks_default;
533 }
534
535 return 0;
536 }
537 #else /* HAVE_AIO_SUPPORT */
538 static int nr_cblocks_max = 0;
539
540 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
541 off_t *off __maybe_unused)
542 {
543 return -1;
544 }
545
546 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
547 {
548 return -1;
549 }
550
551 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
552 {
553 }
554
555 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
556 {
557 }
558 #endif
559
560 static int record__aio_enabled(struct record *rec)
561 {
562 return rec->opts.nr_cblocks > 0;
563 }
564
565 #define MMAP_FLUSH_DEFAULT 1
566 static int record__mmap_flush_parse(const struct option *opt,
567 const char *str,
568 int unset)
569 {
570 int flush_max;
571 struct record_opts *opts = (struct record_opts *)opt->value;
572 static struct parse_tag tags[] = {
573 { .tag = 'B', .mult = 1 },
574 { .tag = 'K', .mult = 1 << 10 },
575 { .tag = 'M', .mult = 1 << 20 },
576 { .tag = 'G', .mult = 1 << 30 },
577 { .tag = 0 },
578 };
579
580 if (unset)
581 return 0;
582
583 if (str) {
584 opts->mmap_flush = parse_tag_value(str, tags);
585 if (opts->mmap_flush == (int)-1)
586 opts->mmap_flush = strtol(str, NULL, 0);
587 }
588
589 if (!opts->mmap_flush)
590 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
591
592 flush_max = evlist__mmap_size(opts->mmap_pages);
593 flush_max /= 4;
594 if (opts->mmap_flush > flush_max)
595 opts->mmap_flush = flush_max;
596
597 return 0;
598 }
599
600 #ifdef HAVE_ZSTD_SUPPORT
601 static unsigned int comp_level_default = 1;
602
603 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
604 {
605 struct record_opts *opts = opt->value;
606
607 if (unset) {
608 opts->comp_level = 0;
609 } else {
610 if (str)
611 opts->comp_level = strtol(str, NULL, 0);
612 if (!opts->comp_level)
613 opts->comp_level = comp_level_default;
614 }
615
616 return 0;
617 }
618 #endif
619 static unsigned int comp_level_max = 22;
620
621 static int record__comp_enabled(struct record *rec)
622 {
623 return rec->opts.comp_level > 0;
624 }
625
626 static int process_synthesized_event(const struct perf_tool *tool,
627 union perf_event *event,
628 struct perf_sample *sample __maybe_unused,
629 struct machine *machine __maybe_unused)
630 {
631 struct record *rec = container_of(tool, struct record, tool);
632 return record__write(rec, NULL, event, event->header.size);
633 }
634
635 static struct mutex synth_lock;
636
637 static int process_locked_synthesized_event(const struct perf_tool *tool,
638 union perf_event *event,
639 struct perf_sample *sample __maybe_unused,
640 struct machine *machine __maybe_unused)
641 {
642 int ret;
643
644 mutex_lock(&synth_lock);
645 ret = process_synthesized_event(tool, event, sample, machine);
646 mutex_unlock(&synth_lock);
647 return ret;
648 }
649
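/*
 * perf_mmap__push() callback: write one chunk of mmapped trace data to the
 * output, compressing it first when -z/--compression-level is in effect.
 */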
650 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
651 {
652 struct record *rec = to;
653
654 if (record__comp_enabled(rec)) {
655 struct perf_record_compressed2 *event = map->data;
656 size_t padding = 0;
657 u8 pad[8] = {0};
658 ssize_t compressed = zstd_compress(rec->session, map, map->data,
659 mmap__mmap_len(map), bf, size);
660
661 if (compressed < 0)
662 return (int)compressed;
663
664 bf = event;
665 thread->samples++;
666
667 /*
668 * The record from zstd_compress() is not 8-byte aligned, which would cause an asan
669 * error. We make it aligned here.
670 */
671 event->data_size = compressed - sizeof(struct perf_record_compressed2);
672 event->header.size = PERF_ALIGN(compressed, sizeof(u64));
673 padding = event->header.size - compressed;
674 return record__write(rec, map, bf, compressed) ||
675 record__write(rec, map, &pad, padding);
676 }
677
678 thread->samples++;
679 return record__write(rec, map, bf, size);
680 }
681
682 static volatile sig_atomic_t signr = -1;
683 static volatile sig_atomic_t child_finished;
684 #ifdef HAVE_EVENTFD_SUPPORT
685 static volatile sig_atomic_t done_fd = -1;
686 #endif
687
688 static void sig_handler(int sig)
689 {
690 if (sig == SIGCHLD)
691 child_finished = 1;
692 else
693 signr = sig;
694
695 done = 1;
696 #ifdef HAVE_EVENTFD_SUPPORT
697 if (done_fd >= 0) {
698 u64 tmp = 1;
699 int orig_errno = errno;
700
701 /*
702 * It is possible for this signal handler to run after done is
703 * checked in the main loop, but before the perf counter fds are
704 * polled. If this happens, the poll() will continue to wait
705 * even though done is set, and will only break out if either
706 * another signal is received, or the counters are ready for
707 * read. To ensure the poll() doesn't sleep when done is set,
708 * use an eventfd (done_fd) to wake up the poll().
709 */
710 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
711 pr_err("failed to signal wakeup fd, error: %m\n");
712
713 errno = orig_errno;
714 }
715 #endif // HAVE_EVENTFD_SUPPORT
716 }
717
718 static void sigsegv_handler(int sig)
719 {
720 perf_hooks__recover();
721 sighandler_dump_stack(sig);
722 }
723
724 static void record__sig_exit(void)
725 {
726 if (signr == -1)
727 return;
728
729 signal(signr, SIG_DFL);
730 raise(signr);
731 }
732
733 #ifdef HAVE_AUXTRACE_SUPPORT
734
735 static int record__process_auxtrace(const struct perf_tool *tool,
736 struct mmap *map,
737 union perf_event *event, void *data1,
738 size_t len1, void *data2, size_t len2)
739 {
740 struct record *rec = container_of(tool, struct record, tool);
741 struct perf_data *data = &rec->data;
742 size_t padding;
743 u8 pad[8] = {0};
744
745 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
746 off_t file_offset;
747 int fd = perf_data__fd(data);
748 int err;
749
750 file_offset = lseek(fd, 0, SEEK_CUR);
751 if (file_offset == -1)
752 return -1;
753 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
754 event, file_offset);
755 if (err)
756 return err;
757 }
758
759 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
760 padding = (len1 + len2) & 7;
761 if (padding)
762 padding = 8 - padding;
763
764 record__write(rec, map, event, event->header.size);
765 record__write(rec, map, data1, len1);
766 if (len2)
767 record__write(rec, map, data2, len2);
768 record__write(rec, map, &pad, padding);
769
770 return 0;
771 }
772
773 static int record__auxtrace_mmap_read(struct record *rec,
774 struct mmap *map)
775 {
776 int ret;
777
778 ret = auxtrace_mmap__read(map, rec->itr,
779 perf_session__env(rec->session),
780 &rec->tool,
781 record__process_auxtrace);
782 if (ret < 0)
783 return ret;
784
785 if (ret)
786 rec->samples++;
787
788 return 0;
789 }
790
791 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
792 struct mmap *map)
793 {
794 int ret;
795
796 ret = auxtrace_mmap__read_snapshot(map, rec->itr,
797 perf_session__env(rec->session),
798 &rec->tool,
799 record__process_auxtrace,
800 rec->opts.auxtrace_snapshot_size);
801 if (ret < 0)
802 return ret;
803
804 if (ret)
805 rec->samples++;
806
807 return 0;
808 }
809
810 static int record__auxtrace_read_snapshot_all(struct record *rec)
811 {
812 int i;
813 int rc = 0;
814
815 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
816 struct mmap *map = &rec->evlist->mmap[i];
817
818 if (!map->auxtrace_mmap.base)
819 continue;
820
821 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
822 rc = -1;
823 goto out;
824 }
825 }
826 out:
827 return rc;
828 }
829
830 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
831 {
832 pr_debug("Recording AUX area tracing snapshot\n");
833 if (record__auxtrace_read_snapshot_all(rec) < 0) {
834 trigger_error(&auxtrace_snapshot_trigger);
835 } else {
836 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
837 trigger_error(&auxtrace_snapshot_trigger);
838 else
839 trigger_ready(&auxtrace_snapshot_trigger);
840 }
841 }
842
843 static int record__auxtrace_snapshot_exit(struct record *rec)
844 {
845 if (trigger_is_error(&auxtrace_snapshot_trigger))
846 return 0;
847
848 if (!auxtrace_record__snapshot_started &&
849 auxtrace_record__snapshot_start(rec->itr))
850 return -1;
851
852 record__read_auxtrace_snapshot(rec, true);
853 if (trigger_is_error(&auxtrace_snapshot_trigger))
854 return -1;
855
856 return 0;
857 }
858
859 static int record__auxtrace_init(struct record *rec)
860 {
861 int err;
862
863 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
864 && record__threads_enabled(rec)) {
865 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
866 return -EINVAL;
867 }
868
869 if (!rec->itr) {
870 rec->itr = auxtrace_record__init(rec->evlist, &err);
871 if (err)
872 return err;
873 }
874
875 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
876 rec->opts.auxtrace_snapshot_opts);
877 if (err)
878 return err;
879
880 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
881 rec->opts.auxtrace_sample_opts);
882 if (err)
883 return err;
884
885 err = auxtrace_parse_aux_action(rec->evlist);
886 if (err)
887 return err;
888
889 return auxtrace_parse_filters(rec->evlist);
890 }
891
892 #else
893
894 static inline
895 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
896 struct mmap *map __maybe_unused)
897 {
898 return 0;
899 }
900
901 static inline
902 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
903 bool on_exit __maybe_unused)
904 {
905 }
906
907 static inline
908 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
909 {
910 return 0;
911 }
912
913 static inline
914 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
915 {
916 return 0;
917 }
918
919 static int record__auxtrace_init(struct record *rec __maybe_unused)
920 {
921 return 0;
922 }
923
924 #endif
925
926 static int record__config_text_poke(struct evlist *evlist)
927 {
928 struct evsel *evsel;
929
930 /* Nothing to do if text poke is already configured */
931 evlist__for_each_entry(evlist, evsel) {
932 if (evsel->core.attr.text_poke)
933 return 0;
934 }
935
936 evsel = evlist__add_dummy_on_all_cpus(evlist);
937 if (!evsel)
938 return -ENOMEM;
939
940 evsel->core.attr.text_poke = 1;
941 evsel->core.attr.ksymbol = 1;
942 evsel->immediate = true;
943 evsel__set_sample_bit(evsel, TIME);
944
945 return 0;
946 }
947
948 static int record__config_off_cpu(struct record *rec)
949 {
950 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
951 }
952
953 static bool record__tracking_system_wide(struct record *rec)
954 {
955 struct evlist *evlist = rec->evlist;
956 struct evsel *evsel;
957
958 /*
959 * If a non-dummy evsel exists, system_wide sideband is needed to
960 * help parse sample information.
961 * For example, PERF_EVENT_MMAP event to help parse symbol,
962 * and PERF_EVENT_COMM event to help parse task executable name.
963 */
964 evlist__for_each_entry(evlist, evsel) {
965 if (!evsel__is_dummy_event(evsel))
966 return true;
967 }
968
969 return false;
970 }
971
972 static int record__config_tracking_events(struct record *rec)
973 {
974 struct record_opts *opts = &rec->opts;
975 struct evlist *evlist = rec->evlist;
976 bool system_wide = false;
977 struct evsel *evsel;
978
979 /*
980 * For initial_delay, system wide, or a hybrid system, we need to add
981 * a tracking event so that we can track PERF_RECORD_MMAP to cover the
982 * delay of waiting or event synthesis.
983 */
984 if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
985 perf_pmus__num_core_pmus() > 1) {
986
987 /*
988 * User space tasks can migrate between CPUs, so when tracing
989 * selected CPUs, sideband for all CPUs is still needed.
990 */
991 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
992 system_wide = true;
993
994 evsel = evlist__findnew_tracking_event(evlist, system_wide);
995 if (!evsel)
996 return -ENOMEM;
997
998 /*
999 * Enable the tracking event when the process is forked for
1000 * initial_delay, immediately for system wide.
1001 */
1002 if (opts->target.initial_delay && !evsel->immediate &&
1003 !target__has_cpu(&opts->target))
1004 evsel->core.attr.enable_on_exec = 1;
1005 else
1006 evsel->immediate = 1;
1007 }
1008
1009 return 0;
1010 }
1011
1012 static bool record__kcore_readable(struct machine *machine)
1013 {
1014 char kcore[PATH_MAX];
1015 int fd;
1016
1017 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
1018
1019 fd = open(kcore, O_RDONLY);
1020 if (fd < 0)
1021 return false;
1022
1023 close(fd);
1024
1025 return true;
1026 }
1027
1028 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1029 {
1030 char from_dir[PATH_MAX];
1031 char kcore_dir[PATH_MAX];
1032 int ret;
1033
1034 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1035
1036 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1037 if (ret)
1038 return ret;
1039
1040 return kcore_copy(from_dir, kcore_dir);
1041 }
1042
1043 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1044 {
1045 thread_data->pipes.msg[0] = -1;
1046 thread_data->pipes.msg[1] = -1;
1047 thread_data->pipes.ack[0] = -1;
1048 thread_data->pipes.ack[1] = -1;
1049 }
1050
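/* Create the msg/ack pipe pair used to hand-shake with a worker thread. */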
1051 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1052 {
1053 if (pipe(thread_data->pipes.msg))
1054 return -EINVAL;
1055
1056 if (pipe(thread_data->pipes.ack)) {
1057 close(thread_data->pipes.msg[0]);
1058 thread_data->pipes.msg[0] = -1;
1059 close(thread_data->pipes.msg[1]);
1060 thread_data->pipes.msg[1] = -1;
1061 return -EINVAL;
1062 }
1063
1064 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1065 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1066 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1067
1068 return 0;
1069 }
1070
1071 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1072 {
1073 if (thread_data->pipes.msg[0] != -1) {
1074 close(thread_data->pipes.msg[0]);
1075 thread_data->pipes.msg[0] = -1;
1076 }
1077 if (thread_data->pipes.msg[1] != -1) {
1078 close(thread_data->pipes.msg[1]);
1079 thread_data->pipes.msg[1] = -1;
1080 }
1081 if (thread_data->pipes.ack[0] != -1) {
1082 close(thread_data->pipes.ack[0]);
1083 thread_data->pipes.ack[0] = -1;
1084 }
1085 if (thread_data->pipes.ack[1] != -1) {
1086 close(thread_data->pipes.ack[1]);
1087 thread_data->pipes.ack[1] = -1;
1088 }
1089 }
1090
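/* True when recording per-thread, i.e. no CPUs were requested (dummy CPU map). */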
1091 static bool evlist__per_thread(struct evlist *evlist)
1092 {
1093 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1094 }
1095
1096 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1097 {
1098 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1099 struct mmap *mmap = evlist->mmap;
1100 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1101 struct perf_cpu_map *cpus = evlist->core.all_cpus;
1102 bool per_thread = evlist__per_thread(evlist);
1103
1104 if (per_thread)
1105 thread_data->nr_mmaps = nr_mmaps;
1106 else
1107 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1108 thread_data->mask->maps.nbits);
1109 if (mmap) {
1110 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1111 if (!thread_data->maps)
1112 return -ENOMEM;
1113 }
1114 if (overwrite_mmap) {
1115 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1116 if (!thread_data->overwrite_maps) {
1117 zfree(&thread_data->maps);
1118 return -ENOMEM;
1119 }
1120 }
1121 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1122 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1123
1124 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1125 if (per_thread ||
1126 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1127 if (thread_data->maps) {
1128 thread_data->maps[tm] = &mmap[m];
1129 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1130 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1131 }
1132 if (thread_data->overwrite_maps) {
1133 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1134 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1135 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1136 }
1137 tm++;
1138 }
1139 }
1140
1141 return 0;
1142 }
1143
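/*
 * Duplicate into the thread's own pollfd array the evlist pollfd entries that
 * belong to the mmaps this thread services.
 */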
1144 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1145 {
1146 int f, tm, pos;
1147 struct mmap *map, *overwrite_map;
1148
1149 fdarray__init(&thread_data->pollfd, 64);
1150
1151 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1152 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1153 overwrite_map = thread_data->overwrite_maps ?
1154 thread_data->overwrite_maps[tm] : NULL;
1155
1156 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1157 void *ptr = evlist->core.pollfd.priv[f].ptr;
1158
1159 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1160 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1161 &evlist->core.pollfd);
1162 if (pos < 0)
1163 return pos;
1164 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1165 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1166 }
1167 }
1168 }
1169
1170 return 0;
1171 }
1172
1173 static void record__free_thread_data(struct record *rec)
1174 {
1175 int t;
1176 struct record_thread *thread_data = rec->thread_data;
1177
1178 if (thread_data == NULL)
1179 return;
1180
1181 for (t = 0; t < rec->nr_threads; t++) {
1182 record__thread_data_close_pipes(&thread_data[t]);
1183 zfree(&thread_data[t].maps);
1184 zfree(&thread_data[t].overwrite_maps);
1185 fdarray__exit(&thread_data[t].pollfd);
1186 }
1187
1188 zfree(&rec->thread_data);
1189 }
1190
1191 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1192 int evlist_pollfd_index,
1193 int thread_pollfd_index)
1194 {
1195 size_t x = rec->index_map_cnt;
1196
1197 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1198 return -ENOMEM;
1199 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1200 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1201 rec->index_map_cnt += 1;
1202 return 0;
1203 }
1204
1205 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1206 struct evlist *evlist,
1207 struct record_thread *thread_data)
1208 {
1209 struct pollfd *e_entries = evlist->core.pollfd.entries;
1210 struct pollfd *t_entries = thread_data->pollfd.entries;
1211 int err = 0;
1212 size_t i;
1213
1214 for (i = 0; i < rec->index_map_cnt; i++) {
1215 int e_pos = rec->index_map[i].evlist_pollfd_index;
1216 int t_pos = rec->index_map[i].thread_pollfd_index;
1217
1218 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1219 e_entries[e_pos].events != t_entries[t_pos].events) {
1220 pr_err("Thread and evlist pollfd index mismatch\n");
1221 err = -EINVAL;
1222 continue;
1223 }
1224 e_entries[e_pos].revents = t_entries[t_pos].revents;
1225 }
1226 return err;
1227 }
1228
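/*
 * Duplicate the non-perf-event descriptors (e.g. control fds) into the main
 * thread's pollfd array and remember the evlist<->thread index mapping so
 * revents can be copied back later.
 */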
1229 static int record__dup_non_perf_events(struct record *rec,
1230 struct evlist *evlist,
1231 struct record_thread *thread_data)
1232 {
1233 struct fdarray *fda = &evlist->core.pollfd;
1234 int i, ret;
1235
1236 for (i = 0; i < fda->nr; i++) {
1237 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1238 continue;
1239 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1240 if (ret < 0) {
1241 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1242 return ret;
1243 }
1244 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1245 thread_data, ret, fda->entries[i].fd);
1246 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1247 if (ret < 0) {
1248 pr_err("Failed to map thread and evlist pollfd indexes\n");
1249 return ret;
1250 }
1251 }
1252 return 0;
1253 }
1254
1255 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1256 {
1257 int t, ret;
1258 struct record_thread *thread_data;
1259
1260 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1261 if (!rec->thread_data) {
1262 pr_err("Failed to allocate thread data\n");
1263 return -ENOMEM;
1264 }
1265 thread_data = rec->thread_data;
1266
1267 for (t = 0; t < rec->nr_threads; t++)
1268 record__thread_data_init_pipes(&thread_data[t]);
1269
1270 for (t = 0; t < rec->nr_threads; t++) {
1271 thread_data[t].rec = rec;
1272 thread_data[t].mask = &rec->thread_masks[t];
1273 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1274 if (ret) {
1275 pr_err("Failed to initialize thread[%d] maps\n", t);
1276 goto out_free;
1277 }
1278 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1279 if (ret) {
1280 pr_err("Failed to initialize thread[%d] pollfd\n", t);
1281 goto out_free;
1282 }
1283 if (t) {
1284 thread_data[t].tid = -1;
1285 ret = record__thread_data_open_pipes(&thread_data[t]);
1286 if (ret) {
1287 pr_err("Failed to open thread[%d] communication pipes\n", t);
1288 goto out_free;
1289 }
1290 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1291 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1292 if (ret < 0) {
1293 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1294 goto out_free;
1295 }
1296 thread_data[t].ctlfd_pos = ret;
1297 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1298 thread_data, thread_data[t].ctlfd_pos,
1299 thread_data[t].pipes.msg[0]);
1300 } else {
1301 thread_data[t].tid = gettid();
1302
1303 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1304 if (ret < 0)
1305 goto out_free;
1306
1307 thread_data[t].ctlfd_pos = -1; /* Not used */
1308 }
1309 }
1310
1311 return 0;
1312
1313 out_free:
1314 record__free_thread_data(rec);
1315
1316 return ret;
1317 }
1318
1319 static int record__mmap_evlist(struct record *rec,
1320 struct evlist *evlist)
1321 {
1322 int i, ret;
1323 struct record_opts *opts = &rec->opts;
1324 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1325 opts->auxtrace_sample_mode;
1326 char msg[512];
1327
1328 if (opts->affinity != PERF_AFFINITY_SYS)
1329 cpu__setup_cpunode_map();
1330
1331 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1332 opts->auxtrace_mmap_pages,
1333 auxtrace_overwrite,
1334 opts->nr_cblocks, opts->affinity,
1335 opts->mmap_flush, opts->comp_level) < 0) {
1336 if (errno == EPERM) {
1337 pr_err("Permission error mapping pages.\n"
1338 "Consider increasing "
1339 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1340 "or try again with a smaller value of -m/--mmap_pages.\n"
1341 "(current value: %u,%u)\n",
1342 opts->mmap_pages, opts->auxtrace_mmap_pages);
1343 return -errno;
1344 } else {
1345 pr_err("failed to mmap with %d (%s)\n", errno,
1346 str_error_r(errno, msg, sizeof(msg)));
1347 if (errno)
1348 return -errno;
1349 else
1350 return -EINVAL;
1351 }
1352 }
1353
1354 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1355 return -1;
1356
1357 ret = record__alloc_thread_data(rec, evlist);
1358 if (ret)
1359 return ret;
1360
1361 if (record__threads_enabled(rec)) {
1362 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1363 if (ret) {
1364 pr_err("Failed to create data directory: %s\n", strerror(-ret));
1365 return ret;
1366 }
1367 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1368 if (evlist->mmap)
1369 evlist->mmap[i].file = &rec->data.dir.files[i];
1370 if (evlist->overwrite_mmap)
1371 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1372 }
1373 }
1374
1375 return 0;
1376 }
1377
1378 static int record__mmap(struct record *rec)
1379 {
1380 return record__mmap_evlist(rec, rec->evlist);
1381 }
1382
1383 static int record__open(struct record *rec)
1384 {
1385 char msg[BUFSIZ];
1386 struct evsel *pos;
1387 struct evlist *evlist = rec->evlist;
1388 struct perf_session *session = rec->session;
1389 struct record_opts *opts = &rec->opts;
1390 int rc = 0;
1391
1392 evlist__for_each_entry(evlist, pos) {
1393 try_again:
1394 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1395 if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1396 if (verbose > 0)
1397 ui__warning("%s\n", msg);
1398 goto try_again;
1399 }
1400 if ((errno == EINVAL || errno == EBADF) &&
1401 pos->core.leader != &pos->core &&
1402 pos->weak_group) {
1403 pos = evlist__reset_weak_group(evlist, pos, true);
1404 goto try_again;
1405 }
1406 rc = -errno;
1407 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1408 ui__error("%s\n", msg);
1409 goto out;
1410 }
1411 }
1412
1413 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1414 pr_warning(
1415 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1416 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1417 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1418 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1419 "Samples in kernel modules won't be resolved at all.\n\n"
1420 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1421 "even with a suitable vmlinux or kallsyms file.\n\n");
1422 }
1423
1424 if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1425 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1426 pos->filter ?: "BPF", evsel__name(pos), errno,
1427 str_error_r(errno, msg, sizeof(msg)));
1428 rc = -1;
1429 goto out;
1430 }
1431
1432 rc = record__mmap(rec);
1433 if (rc)
1434 goto out;
1435
1436 session->evlist = evlist;
1437 perf_session__set_id_hdr_size(session);
1438 out:
1439 return rc;
1440 }
1441
1442 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1443 {
1444 if (rec->evlist->first_sample_time == 0)
1445 rec->evlist->first_sample_time = sample_time;
1446
1447 if (sample_time)
1448 rec->evlist->last_sample_time = sample_time;
1449 }
1450
1451 static int process_sample_event(const struct perf_tool *tool,
1452 union perf_event *event,
1453 struct perf_sample *sample,
1454 struct evsel *evsel,
1455 struct machine *machine)
1456 {
1457 struct record *rec = container_of(tool, struct record, tool);
1458
1459 set_timestamp_boundary(rec, sample->time);
1460
1461 if (rec->buildid_all)
1462 return 0;
1463
1464 rec->samples++;
1465 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1466 }
1467
1468 static int process_buildids(struct record *rec)
1469 {
1470 struct perf_session *session = rec->session;
1471
1472 if (perf_data__size(&rec->data) == 0)
1473 return 0;
1474
1475 /*
1476 * During this process, it'll load the kernel map and replace the
1477 * dso->long_name with the real pathname it found. In this case
1478 * we prefer the vmlinux path like
1479 * /lib/modules/3.16.4/build/vmlinux
1480 *
1481 * rather than build-id path (in debug directory).
1482 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1483 */
1484 symbol_conf.ignore_vmlinux_buildid = true;
1485
1486 /*
1487 * If --buildid-all is given, it marks all DSOs regardless of hits,
1488 * so no need to process samples. But if timestamp_boundary is enabled,
1489 * it still needs to walk on all samples to get the timestamps of
1490 * first/last samples.
1491 */
1492 if (rec->buildid_all && !rec->timestamp_boundary)
1493 rec->tool.sample = process_event_sample_stub;
1494
1495 return perf_session__process_events(session);
1496 }
1497
1498 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1499 {
1500 int err;
1501 struct perf_tool *tool = data;
1502 /*
1503 * As for the guest kernel, when processing the record & report subcommands,
1504 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1505 * a preload of the dso because default guest module symbols are loaded
1506 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1507 * method is used to avoid missing symbols when the first addr is
1508 * in a module instead of in the guest kernel.
1509 */
1510 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1511 machine);
1512 if (err < 0)
1513 pr_err("Couldn't record guest kernel [%d]'s reference"
1514 " relocation symbol.\n", machine->pid);
1515
1516 /*
1517 * We use _stext for the guest kernel because the guest kernel's /proc/kallsyms
1518 * sometimes has no _text.
1519 */
1520 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1521 machine);
1522 if (err < 0)
1523 pr_err("Couldn't record guest kernel [%d]'s reference"
1524 " relocation symbol.\n", machine->pid);
1525 }
1526
1527 static struct perf_event_header finished_round_event = {
1528 .size = sizeof(struct perf_event_header),
1529 .type = PERF_RECORD_FINISHED_ROUND,
1530 };
1531
1532 static struct perf_event_header finished_init_event = {
1533 .size = sizeof(struct perf_event_header),
1534 .type = PERF_RECORD_FINISHED_INIT,
1535 };
1536
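/*
 * When an affinity mode other than the default is used, migrate the reading
 * thread onto the CPUs backing the map that is about to be read.
 */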
1537 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1538 {
1539 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1540 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1541 thread->mask->affinity.nbits)) {
1542 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1543 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1544 map->affinity_mask.bits, thread->mask->affinity.nbits);
1545 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1546 (cpu_set_t *)thread->mask->affinity.bits);
1547 if (verbose == 2) {
1548 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1549 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1550 }
1551 }
1552 }
1553
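/*
 * Callback for zstd_compress_stream_to_records(): the first call lays out a
 * PERF_RECORD_COMPRESSED2 header, subsequent calls grow its size by the
 * compressed increment.
 */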
1554 static size_t process_comp_header(void *record, size_t increment)
1555 {
1556 struct perf_record_compressed2 *event = record;
1557 size_t size = sizeof(*event);
1558
1559 if (increment) {
1560 event->header.size += increment;
1561 return increment;
1562 }
1563
1564 event->header.type = PERF_RECORD_COMPRESSED2;
1565 event->header.size = size;
1566
1567 return size;
1568 }
1569
1570 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1571 void *dst, size_t dst_size, void *src, size_t src_size)
1572 {
1573 ssize_t compressed;
1574 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1575 struct zstd_data *zstd_data = &session->zstd_data;
1576
1577 if (map && map->file)
1578 zstd_data = &map->zstd_data;
1579
1580 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1581 max_record_size, process_comp_header);
1582 if (compressed < 0)
1583 return compressed;
1584
1585 if (map && map->file) {
1586 thread->bytes_transferred += src_size;
1587 thread->bytes_compressed += compressed;
1588 } else {
1589 session->bytes_transferred += src_size;
1590 session->bytes_compressed += compressed;
1591 }
1592
1593 return compressed;
1594 }
1595
1596 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1597 bool overwrite, bool synch)
1598 {
1599 u64 bytes_written = rec->bytes_written;
1600 int i;
1601 int rc = 0;
1602 int nr_mmaps;
1603 struct mmap **maps;
1604 int trace_fd = rec->data.file.fd;
1605 off_t off = 0;
1606
1607 if (!evlist)
1608 return 0;
1609
1610 nr_mmaps = thread->nr_mmaps;
1611 maps = overwrite ? thread->overwrite_maps : thread->maps;
1612
1613 if (!maps)
1614 return 0;
1615
1616 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1617 return 0;
1618
1619 if (record__aio_enabled(rec))
1620 off = record__aio_get_pos(trace_fd);
1621
1622 for (i = 0; i < nr_mmaps; i++) {
1623 u64 flush = 0;
1624 struct mmap *map = maps[i];
1625
1626 if (map->core.base) {
1627 record__adjust_affinity(rec, map);
1628 if (synch) {
1629 flush = map->core.flush;
1630 map->core.flush = 1;
1631 }
1632 if (!record__aio_enabled(rec)) {
1633 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1634 if (synch)
1635 map->core.flush = flush;
1636 rc = -1;
1637 goto out;
1638 }
1639 } else {
1640 if (record__aio_push(rec, map, &off) < 0) {
1641 record__aio_set_pos(trace_fd, off);
1642 if (synch)
1643 map->core.flush = flush;
1644 rc = -1;
1645 goto out;
1646 }
1647 }
1648 if (synch)
1649 map->core.flush = flush;
1650 }
1651
1652 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1653 !rec->opts.auxtrace_sample_mode &&
1654 record__auxtrace_mmap_read(rec, map) != 0) {
1655 rc = -1;
1656 goto out;
1657 }
1658 }
1659
1660 if (record__aio_enabled(rec))
1661 record__aio_set_pos(trace_fd, off);
1662
1663 /*
1664 * Mark the round finished in case we wrote
1665 * at least one event.
1666 *
1667 * No need for round events in directory mode,
1668 * because per-cpu maps and files have data
1669 * sorted by kernel.
1670 */
1671 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1672 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1673
1674 if (overwrite)
1675 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1676 out:
1677 return rc;
1678 }
1679
1680 static int record__mmap_read_all(struct record *rec, bool synch)
1681 {
1682 int err;
1683
1684 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1685 if (err)
1686 return err;
1687
1688 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1689 }
1690
1691 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1692 void *arg __maybe_unused)
1693 {
1694 struct perf_mmap *map = fda->priv[fd].ptr;
1695
1696 if (map)
1697 perf_mmap__put(map);
1698 }
1699
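/*
 * Worker thread body: acknowledge start-up, read its mmaps until the main
 * thread closes the message pipe or an error occurs, flush once more and
 * acknowledge termination.
 */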
1700 static void *record__thread(void *arg)
1701 {
1702 enum thread_msg msg = THREAD_MSG__READY;
1703 bool terminate = false;
1704 struct fdarray *pollfd;
1705 int err, ctlfd_pos;
1706
1707 thread = arg;
1708 thread->tid = gettid();
1709
1710 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1711 if (err == -1)
1712 pr_warning("threads[%d]: failed to notify on start: %s\n",
1713 thread->tid, strerror(errno));
1714
1715 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1716
1717 pollfd = &thread->pollfd;
1718 ctlfd_pos = thread->ctlfd_pos;
1719
1720 for (;;) {
1721 unsigned long long hits = thread->samples;
1722
1723 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1724 break;
1725
1726 if (hits == thread->samples) {
1727
1728 err = fdarray__poll(pollfd, -1);
1729 /*
1730 * Propagate the error only if there is one. Ignore a positive
1731 * number of returned events and interrupt errors.
1732 */
1733 if (err > 0 || (err < 0 && errno == EINTR))
1734 err = 0;
1735 thread->waking++;
1736
1737 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1738 record__thread_munmap_filtered, NULL) == 0)
1739 break;
1740 }
1741
1742 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1743 terminate = true;
1744 close(thread->pipes.msg[0]);
1745 thread->pipes.msg[0] = -1;
1746 pollfd->entries[ctlfd_pos].fd = -1;
1747 pollfd->entries[ctlfd_pos].events = 0;
1748 }
1749
1750 pollfd->entries[ctlfd_pos].revents = 0;
1751 }
1752 record__mmap_read_all(thread->rec, true);
1753
1754 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1755 if (err == -1)
1756 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1757 thread->tid, strerror(errno));
1758
1759 return NULL;
1760 }
1761
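/* Start with all header features set, then clear those that do not apply. */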
1762 static void record__init_features(struct record *rec)
1763 {
1764 struct perf_session *session = rec->session;
1765 int feat;
1766
1767 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1768 perf_header__set_feat(&session->header, feat);
1769
1770 if (rec->no_buildid)
1771 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1772
1773 if (!have_tracepoints(&rec->evlist->core.entries))
1774 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1775
1776 if (!rec->opts.branch_stack)
1777 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1778
1779 if (!rec->opts.full_auxtrace)
1780 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1781
1782 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1783 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1784
1785 if (!rec->opts.use_clockid)
1786 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1787
1788 if (!record__threads_enabled(rec))
1789 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1790
1791 if (!record__comp_enabled(rec))
1792 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1793
1794 perf_header__clear_feat(&session->header, HEADER_STAT);
1795 }
1796
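/*
 * Finalize the on-disk output: record data sizes, run build-id processing
 * unless it is disabled, and write out the file header.
 */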
1797 static void
1798 record__finish_output(struct record *rec)
1799 {
1800 int i;
1801 struct perf_data *data = &rec->data;
1802 int fd = perf_data__fd(data);
1803
1804 if (data->is_pipe) {
1805 /* Just to display approx. size */
1806 data->file.size = rec->bytes_written;
1807 return;
1808 }
1809
1810 rec->session->header.data_size += rec->bytes_written;
1811 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1812 if (record__threads_enabled(rec)) {
1813 for (i = 0; i < data->dir.nr; i++)
1814 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1815 }
1816
1817 /* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1818 if (!rec->no_buildid) {
1819 process_buildids(rec);
1820
1821 if (rec->buildid_all)
1822 perf_session__dsos_hit_all(rec->session);
1823 }
1824 perf_session__write_header(rec->session, rec->evlist, fd, true);
1825
1826 return;
1827 }
1828
1829 static int record__synthesize_workload(struct record *rec, bool tail)
1830 {
1831 int err;
1832 struct perf_thread_map *thread_map;
1833 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1834
1835 if (rec->opts.tail_synthesize != tail)
1836 return 0;
1837
1838 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1839 if (thread_map == NULL)
1840 return -1;
1841
1842 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1843 process_synthesized_event,
1844 &rec->session->machines.host,
1845 needs_mmap,
1846 rec->opts.sample_address);
1847 perf_thread_map__put(thread_map);
1848 return err;
1849 }
1850
1851 static int write_finished_init(struct record *rec, bool tail)
1852 {
1853 if (rec->opts.tail_synthesize != tail)
1854 return 0;
1855
1856 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1857 }
1858
1859 static int record__synthesize(struct record *rec, bool tail);
1860
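/*
 * Finish the current output file and switch to a new, timestamp-named
 * one. Unless called at exit, the byte counters are reset and the
 * tracking (non-sample) events are synthesized again into the new file.
 * With a switch-output file limit, the oldest file in the ring is
 * removed before its slot is reused.
 */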
1861 static int
1862 record__switch_output(struct record *rec, bool at_exit)
1863 {
1864 struct perf_data *data = &rec->data;
1865 char *new_filename = NULL;
1866 int fd, err;
1867
1868 /* Same Size: "2015122520103046"*/
1869 char timestamp[] = "InvalidTimestamp";
1870
1871 record__aio_mmap_read_sync(rec);
1872
1873 write_finished_init(rec, true);
1874
1875 record__synthesize(rec, true);
1876 if (target__none(&rec->opts.target))
1877 record__synthesize_workload(rec, true);
1878
1879 rec->samples = 0;
1880 record__finish_output(rec);
1881 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1882 if (err) {
1883 pr_err("Failed to get current timestamp\n");
1884 return -EINVAL;
1885 }
1886
1887 fd = perf_data__switch(data, timestamp,
1888 rec->session->header.data_offset,
1889 at_exit, &new_filename);
1890 if (fd >= 0 && !at_exit) {
1891 rec->bytes_written = 0;
1892 rec->session->header.data_size = 0;
1893 }
1894
1895 if (!quiet) {
1896 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1897 data->path, timestamp);
1898 }
1899
1900 if (rec->switch_output.num_files) {
1901 int n = rec->switch_output.cur_file + 1;
1902
1903 if (n >= rec->switch_output.num_files)
1904 n = 0;
1905 rec->switch_output.cur_file = n;
1906 if (rec->switch_output.filenames[n]) {
1907 remove(rec->switch_output.filenames[n]);
1908 zfree(&rec->switch_output.filenames[n]);
1909 }
1910 rec->switch_output.filenames[n] = new_filename;
1911 } else {
1912 free(new_filename);
1913 }
1914
1915 /* Output tracking events */
1916 if (!at_exit) {
1917 record__synthesize(rec, false);
1918
1919 /*
1920 * In 'perf record --switch-output' without -a,
1921 * record__synthesize() in record__switch_output() won't
1922 * generate tracking events because there's no thread_map
1923 * in the evlist, so the newly created perf.data would not
1924 * contain map and comm information.
1925 * Create a fake thread_map and directly call
1926 * perf_event__synthesize_thread_map() for those events.
1927 */
1928 if (target__none(&rec->opts.target))
1929 record__synthesize_workload(rec, false);
1930 write_finished_init(rec, false);
1931 }
1932 return fd;
1933 }
1934
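/*
 * Emit one synthetic PERF_RECORD_LOST_SAMPLES event, attaching the
 * sample ID of the given CPU/thread pair (when the evsel has IDs) so
 * that the loss can later be attributed to the right event.
 */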
1935 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1936 struct perf_record_lost_samples *lost,
1937 int cpu_idx, int thread_idx, u64 lost_count,
1938 u16 misc_flag)
1939 {
1940 struct perf_sample_id *sid;
1941 struct perf_sample sample;
1942 int id_hdr_size;
1943
1944 perf_sample__init(&sample, /*all=*/true);
1945 lost->lost = lost_count;
1946 if (evsel->core.ids) {
1947 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1948 sample.id = sid->id;
1949 }
1950
1951 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1952 evsel->core.attr.sample_type, &sample);
1953 lost->header.size = sizeof(*lost) + id_hdr_size;
1954 lost->header.misc = misc_flag;
1955 record__write(rec, NULL, lost, lost->header.size);
1956 perf_sample__exit(&sample);
1957 }
1958
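/*
 * Read the lost-sample counts from the kernel for every counter fd and
 * write a LOST_SAMPLES record for each non-zero count; samples dropped
 * by a BPF filter are reported separately, tagged with
 * PERF_RECORD_MISC_LOST_SAMPLES_BPF.
 */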
1959 static void record__read_lost_samples(struct record *rec)
1960 {
1961 struct perf_session *session = rec->session;
1962 struct perf_record_lost_samples_and_ids lost;
1963 struct evsel *evsel;
1964
1965 /* there was an error during record__open */
1966 if (session->evlist == NULL)
1967 return;
1968
1969 evlist__for_each_entry(session->evlist, evsel) {
1970 struct xyarray *xy = evsel->core.sample_id;
1971 u64 lost_count;
1972
1973 if (xy == NULL || evsel->core.fd == NULL)
1974 continue;
1975 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1976 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1977 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1978 continue;
1979 }
1980
1981 for (int x = 0; x < xyarray__max_x(xy); x++) {
1982 for (int y = 0; y < xyarray__max_y(xy); y++) {
1983 struct perf_counts_values count;
1984
1985 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1986 pr_debug("read LOST count failed\n");
1987 return;
1988 }
1989
1990 if (count.lost) {
1991 memset(&lost, 0, sizeof(lost));
1992 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1993 __record__save_lost_samples(rec, evsel, &lost.lost,
1994 x, y, count.lost, 0);
1995 }
1996 }
1997 }
1998
1999 lost_count = perf_bpf_filter__lost_count(evsel);
2000 if (lost_count) {
2001 memset(&lost, 0, sizeof(lost));
2002 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2003 __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2004 PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2005 }
2006 }
2007 }
2008
2009 static volatile sig_atomic_t workload_exec_errno;
2010
2011 /*
2012 * evlist__prepare_workload will send a SIGUSR1
2013 * if the fork fails, since we asked by setting its
2014 * want_signal to true.
2015 */
2016 static void workload_exec_failed_signal(int signo __maybe_unused,
2017 siginfo_t *info,
2018 void *ucontext __maybe_unused)
2019 {
2020 workload_exec_errno = info->si_value.sival_int;
2021 done = 1;
2022 child_finished = 1;
2023 }
2024
2025 static void snapshot_sig_handler(int sig);
2026 static void alarm_sig_handler(int sig);
2027
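/*
 * Pick any mmapped ring buffer's control page; it is used below only as
 * the source of the time conversion parameters passed to
 * perf_event__synth_time_conv().
 */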
2028 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2029 {
2030 if (evlist) {
2031 if (evlist->mmap && evlist->mmap[0].core.base)
2032 return evlist->mmap[0].core.base;
2033 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2034 return evlist->overwrite_mmap[0].core.base;
2035 }
2036 return NULL;
2037 }
2038
2039 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2040 {
2041 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2042 if (pc)
2043 return pc;
2044 return NULL;
2045 }
2046
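/*
 * Synthesize the non-sample events describing the state of the system
 * at record time: time conversion data, the id index, auxtrace info,
 * kernel and module maps, extra attributes, thread and CPU maps, BPF
 * and cgroup events, and finally the already running threads
 * (optionally with multiple synthesis threads).
 */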
2047 static int record__synthesize(struct record *rec, bool tail)
2048 {
2049 struct perf_session *session = rec->session;
2050 struct machine *machine = &session->machines.host;
2051 struct perf_data *data = &rec->data;
2052 struct record_opts *opts = &rec->opts;
2053 struct perf_tool *tool = &rec->tool;
2054 int err = 0;
2055 event_op f = process_synthesized_event;
2056
2057 if (rec->opts.tail_synthesize != tail)
2058 return 0;
2059
2060 if (data->is_pipe) {
2061 err = perf_event__synthesize_for_pipe(tool, session, data,
2062 process_synthesized_event);
2063 if (err < 0)
2064 goto out;
2065
2066 rec->bytes_written += err;
2067 }
2068
2069 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2070 process_synthesized_event, machine);
2071 if (err)
2072 goto out;
2073
2074 /* Synthesize id_index before auxtrace_info */
2075 err = perf_event__synthesize_id_index(tool,
2076 process_synthesized_event,
2077 session->evlist, machine);
2078 if (err)
2079 goto out;
2080
2081 if (rec->opts.full_auxtrace) {
2082 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2083 session, process_synthesized_event);
2084 if (err)
2085 goto out;
2086 }
2087
2088 if (!evlist__exclude_kernel(rec->evlist)) {
2089 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2090 machine);
2091 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2092 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2093 "Check /proc/kallsyms permission or run as root.\n");
2094
2095 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2096 machine);
2097 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2098 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2099 "Check /proc/modules permission or run as root.\n");
2100 }
2101
2102 if (perf_guest) {
2103 machines__process_guests(&session->machines,
2104 perf_event__synthesize_guest_os, tool);
2105 }
2106
2107 err = perf_event__synthesize_extra_attr(&rec->tool,
2108 rec->evlist,
2109 process_synthesized_event,
2110 data->is_pipe);
2111 if (err)
2112 goto out;
2113
2114 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2115 process_synthesized_event,
2116 NULL);
2117 if (err < 0) {
2118 pr_err("Couldn't synthesize thread map.\n");
2119 return err;
2120 }
2121
2122 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2123 process_synthesized_event, NULL);
2124 if (err < 0) {
2125 pr_err("Couldn't synthesize cpu map.\n");
2126 return err;
2127 }
2128
2129 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2130 machine, opts);
2131 if (err < 0) {
2132 pr_warning("Couldn't synthesize bpf events.\n");
2133 err = 0;
2134 }
2135
2136 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2137 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2138 machine);
2139 if (err < 0) {
2140 pr_warning("Couldn't synthesize cgroup events.\n");
2141 err = 0;
2142 }
2143 }
2144
2145 if (rec->opts.nr_threads_synthesize > 1) {
2146 mutex_init(&synth_lock);
2147 perf_set_multithreaded();
2148 f = process_locked_synthesized_event;
2149 }
2150
2151 if (rec->opts.synth & PERF_SYNTH_TASK) {
2152 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2153
2154 err = __machine__synthesize_threads(machine, tool, &opts->target,
2155 rec->evlist->core.threads,
2156 f, needs_mmap, opts->sample_address,
2157 rec->opts.nr_threads_synthesize);
2158 }
2159
2160 if (rec->opts.nr_threads_synthesize > 1) {
2161 perf_set_singlethreaded();
2162 mutex_destroy(&synth_lock);
2163 }
2164
2165 out:
2166 return err;
2167 }
2168
2169 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2170 {
2171 #ifdef HAVE_LIBBPF_SUPPORT
2172 perf_event__synthesize_final_bpf_metadata(rec->session,
2173 process_synthesized_event);
2174 #endif
2175 }
2176
2177 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2178 {
2179 struct record *rec = data;
2180 pthread_kill(rec->thread_id, SIGUSR2);
2181 return 0;
2182 }
2183
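/*
 * Set up the side band evlist: when --switch-output-event filled it, a
 * callback delivers SIGUSR2 to the main thread; with libbpf support a
 * BPF side band event is added so that BPF programs loaded from now on
 * remain annotatable.
 */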
2184 static int record__setup_sb_evlist(struct record *rec)
2185 {
2186 struct record_opts *opts = &rec->opts;
2187
2188 if (rec->sb_evlist != NULL) {
2189 /*
2190 * We get here if --switch-output-event populated the
2191 * sb_evlist, so associate a callback that will send a SIGUSR2
2192 * to the main thread.
2193 */
2194 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2195 rec->thread_id = pthread_self();
2196 }
2197 #ifdef HAVE_LIBBPF_SUPPORT
2198 if (!opts->no_bpf_event) {
2199 if (rec->sb_evlist == NULL) {
2200 rec->sb_evlist = evlist__new();
2201
2202 if (rec->sb_evlist == NULL) {
2203 pr_err("Couldn't create side band evlist.\n");
2204 return -1;
2205 }
2206 }
2207
2208 if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2209 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2210 return -1;
2211 }
2212 }
2213 #endif
2214 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2215 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2216 opts->no_bpf_event = true;
2217 }
2218
2219 return 0;
2220 }
2221
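/*
 * Record a reference pair of gettimeofday() and clock_gettime()
 * timestamps, plus the clockid and, when known, its resolution, in the
 * session env so that perf timestamps can later be related to
 * wall-clock time.
 */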
2222 static int record__init_clock(struct record *rec)
2223 {
2224 struct perf_session *session = rec->session;
2225 struct timespec ref_clockid;
2226 struct timeval ref_tod;
2227 struct perf_env *env = perf_session__env(session);
2228 u64 ref;
2229
2230 if (!rec->opts.use_clockid)
2231 return 0;
2232
2233 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2234 env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2235
2236 env->clock.clockid = rec->opts.clockid;
2237
2238 if (gettimeofday(&ref_tod, NULL) != 0) {
2239 pr_err("gettimeofday failed, cannot set reference time.\n");
2240 return -1;
2241 }
2242
2243 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2244 pr_err("clock_gettime failed, cannot set reference time.\n");
2245 return -1;
2246 }
2247
2248 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2249 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2250
2251 env->clock.tod_ns = ref;
2252
2253 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2254 (u64) ref_clockid.tv_nsec;
2255
2256 env->clock.clockid_ns = ref;
2257 return 0;
2258 }
2259
2260 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2261 {
2262 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2263 trigger_hit(&auxtrace_snapshot_trigger);
2264 auxtrace_record__snapshot_started = 1;
2265 if (auxtrace_record__snapshot_start(rec->itr))
2266 trigger_error(&auxtrace_snapshot_trigger);
2267 }
2268 }
2269
2270 static int record__terminate_thread(struct record_thread *thread_data)
2271 {
2272 int err;
2273 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2274 pid_t tid = thread_data->tid;
2275
2276 close(thread_data->pipes.msg[1]);
2277 thread_data->pipes.msg[1] = -1;
2278 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2279 if (err > 0)
2280 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2281 else
2282 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2283 thread->tid, tid);
2284
2285 return 0;
2286 }
2287
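/*
 * Start the extra worker threads in parallel streaming mode (thread 0
 * keeps running in the main thread). Signals are blocked around
 * pthread_create() so only the main thread handles them, each worker is
 * pinned to its affinity mask where supported, and we wait for the
 * READY message from every worker before continuing.
 */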
2288 static int record__start_threads(struct record *rec)
2289 {
2290 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2291 struct record_thread *thread_data = rec->thread_data;
2292 sigset_t full, mask;
2293 pthread_t handle;
2294 pthread_attr_t attrs;
2295
2296 thread = &thread_data[0];
2297
2298 if (!record__threads_enabled(rec))
2299 return 0;
2300
2301 sigfillset(&full);
2302 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2303 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2304 return -1;
2305 }
2306
2307 pthread_attr_init(&attrs);
2308 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2309
2310 for (t = 1; t < nr_threads; t++) {
2311 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2312
2313 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2314 pthread_attr_setaffinity_np(&attrs,
2315 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2316 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2317 #endif
2318 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2319 for (tt = 1; tt < t; tt++)
2320 record__terminate_thread(&thread_data[tt]);
2321 pr_err("Failed to start threads: %s\n", strerror(errno));
2322 ret = -1;
2323 goto out_err;
2324 }
2325
2326 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2327 if (err > 0)
2328 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2329 thread_msg_tags[msg]);
2330 else
2331 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2332 thread->tid, rec->thread_data[t].tid);
2333 }
2334
2335 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2336 (cpu_set_t *)thread->mask->affinity.bits);
2337
2338 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2339
2340 out_err:
2341 pthread_attr_destroy(&attrs);
2342
2343 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2344 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2345 ret = -1;
2346 }
2347
2348 return ret;
2349 }
2350
2351 static int record__stop_threads(struct record *rec)
2352 {
2353 int t;
2354 struct record_thread *thread_data = rec->thread_data;
2355
2356 for (t = 1; t < rec->nr_threads; t++)
2357 record__terminate_thread(&thread_data[t]);
2358
2359 for (t = 0; t < rec->nr_threads; t++) {
2360 rec->samples += thread_data[t].samples;
2361 if (!record__threads_enabled(rec))
2362 continue;
2363 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2364 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2365 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2366 thread_data[t].samples, thread_data[t].waking);
2367 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2368 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2369 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2370 else
2371 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2372 }
2373
2374 return 0;
2375 }
2376
2377 static unsigned long record__waking(struct record *rec)
2378 {
2379 int t;
2380 unsigned long waking = 0;
2381 struct record_thread *thread_data = rec->thread_data;
2382
2383 for (t = 0; t < rec->nr_threads; t++)
2384 waking += thread_data[t].waking;
2385
2386 return waking;
2387 }
2388
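/*
 * The main body of 'perf record': install the signal handlers, create
 * the session, optionally fork the workload, open and mmap the events,
 * synthesize the tracking events and then loop draining the ring
 * buffers until the workload exits or recording is stopped.
 */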
2389 static int __cmd_record(struct record *rec, int argc, const char **argv)
2390 {
2391 int err;
2392 int status = 0;
2393 const bool forks = argc > 0;
2394 struct perf_tool *tool = &rec->tool;
2395 struct record_opts *opts = &rec->opts;
2396 struct perf_data *data = &rec->data;
2397 struct perf_session *session;
2398 bool disabled = false, draining = false;
2399 int fd;
2400 float ratio = 0;
2401 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2402 struct perf_env *env;
2403
2404 atexit(record__sig_exit);
2405 signal(SIGCHLD, sig_handler);
2406 signal(SIGINT, sig_handler);
2407 signal(SIGTERM, sig_handler);
2408 signal(SIGSEGV, sigsegv_handler);
2409
2410 if (rec->opts.record_cgroup) {
2411 #ifndef HAVE_FILE_HANDLE
2412 pr_err("cgroup tracking is not supported\n");
2413 return -1;
2414 #endif
2415 }
2416
2417 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2418 signal(SIGUSR2, snapshot_sig_handler);
2419 if (rec->opts.auxtrace_snapshot_mode)
2420 trigger_on(&auxtrace_snapshot_trigger);
2421 if (rec->switch_output.enabled)
2422 trigger_on(&switch_output_trigger);
2423 } else {
2424 signal(SIGUSR2, SIG_IGN);
2425 }
2426
2427 perf_tool__init(tool, /*ordered_events=*/true);
2428 tool->sample = process_sample_event;
2429 tool->fork = perf_event__process_fork;
2430 tool->exit = perf_event__process_exit;
2431 tool->comm = perf_event__process_comm;
2432 tool->namespaces = perf_event__process_namespaces;
2433 tool->mmap = build_id__process_mmap;
2434 tool->mmap2 = build_id__process_mmap2;
2435 tool->itrace_start = process_timestamp_boundary;
2436 tool->aux = process_timestamp_boundary;
2437 tool->namespace_events = rec->opts.record_namespaces;
2438 tool->cgroup_events = rec->opts.record_cgroup;
2439 session = perf_session__new(data, tool);
2440 if (IS_ERR(session)) {
2441 pr_err("Perf session creation failed.\n");
2442 return PTR_ERR(session);
2443 }
2444 env = perf_session__env(session);
2445 if (record__threads_enabled(rec)) {
2446 if (perf_data__is_pipe(&rec->data)) {
2447 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2448 return -1;
2449 }
2450 if (rec->opts.full_auxtrace) {
2451 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2452 return -1;
2453 }
2454 }
2455
2456 fd = perf_data__fd(data);
2457 rec->session = session;
2458
2459 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2460 pr_err("Compression initialization failed.\n");
2461 return -1;
2462 }
2463 #ifdef HAVE_EVENTFD_SUPPORT
2464 done_fd = eventfd(0, EFD_NONBLOCK);
2465 if (done_fd < 0) {
2466 pr_err("Failed to create wakeup eventfd, error: %m\n");
2467 status = -1;
2468 goto out_delete_session;
2469 }
2470 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2471 if (err < 0) {
2472 pr_err("Failed to add wakeup eventfd to poll list\n");
2473 status = err;
2474 goto out_delete_session;
2475 }
2476 #endif // HAVE_EVENTFD_SUPPORT
2477
2478 env->comp_type = PERF_COMP_ZSTD;
2479 env->comp_level = rec->opts.comp_level;
2480
2481 if (rec->opts.kcore &&
2482 !record__kcore_readable(&session->machines.host)) {
2483 pr_err("ERROR: kcore is not readable.\n");
2484 return -1;
2485 }
2486
2487 if (record__init_clock(rec))
2488 return -1;
2489
2490 record__init_features(rec);
2491
2492 if (forks) {
2493 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2494 workload_exec_failed_signal);
2495 if (err < 0) {
2496 pr_err("Couldn't run the workload!\n");
2497 status = err;
2498 goto out_delete_session;
2499 }
2500 }
2501
2502 /*
2503 * If we have just a single event and are sending data
2504 * through a pipe, we need to force ID allocation,
2505 * because we synthesize the event name through the pipe
2506 * and need the ID for that.
2507 */
2508 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2509 rec->opts.sample_id = true;
2510
2511 if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2512 rec->timestamp_filename = false;
2513 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2514 }
2515
2516 /*
2517 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2518 * and hybrid_merge is false.
2519 */
2520 evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2521
2522 evlist__config(rec->evlist, opts, &callchain_param);
2523
2524 /* Debug message used by test scripts */
2525 pr_debug3("perf record opening and mmapping events\n");
2526 if (record__open(rec) != 0) {
2527 err = -1;
2528 goto out_free_threads;
2529 }
2530 /* Debug message used by test scripts */
2531 pr_debug3("perf record done opening and mmapping events\n");
2532 env->comp_mmap_len = session->evlist->core.mmap_len;
2533
2534 if (rec->opts.kcore) {
2535 err = record__kcore_copy(&session->machines.host, data);
2536 if (err) {
2537 pr_err("ERROR: Failed to copy kcore\n");
2538 goto out_free_threads;
2539 }
2540 }
2541
2542 /*
2543 * Normally perf_session__new would do this, but it doesn't have the
2544 * evlist.
2545 */
2546 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2547 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2548 rec->tool.ordered_events = false;
2549 }
2550
2551 if (evlist__nr_groups(rec->evlist) == 0)
2552 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2553
2554 if (data->is_pipe) {
2555 err = perf_header__write_pipe(fd);
2556 if (err < 0)
2557 goto out_free_threads;
2558 } else {
2559 err = perf_session__write_header(session, rec->evlist, fd, false);
2560 if (err < 0)
2561 goto out_free_threads;
2562 }
2563
2564 err = -1;
2565 if (!rec->no_buildid
2566 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2567 pr_err("Couldn't generate buildids. "
2568 "Use --no-buildid to profile anyway.\n");
2569 goto out_free_threads;
2570 }
2571
2572 if (!evlist__needs_bpf_sb_event(rec->evlist))
2573 opts->no_bpf_event = true;
2574
2575 err = record__setup_sb_evlist(rec);
2576 if (err)
2577 goto out_free_threads;
2578
2579 err = record__synthesize(rec, false);
2580 if (err < 0)
2581 goto out_free_threads;
2582
2583 if (rec->realtime_prio) {
2584 struct sched_param param;
2585
2586 param.sched_priority = rec->realtime_prio;
2587 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2588 pr_err("Could not set realtime priority.\n");
2589 err = -1;
2590 goto out_free_threads;
2591 }
2592 }
2593
2594 if (record__start_threads(rec))
2595 goto out_free_threads;
2596
2597 /*
2598 * When perf is starting the traced process, all the events
2599 * (apart from group members) have enable_on_exec=1 set,
2600 * so don't spoil it by prematurely enabling them.
2601 */
2602 if (!target__none(&opts->target) && !opts->target.initial_delay)
2603 evlist__enable(rec->evlist);
2604
2605 /*
2606 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2607 * when recording a workload; enable it manually.
2608 */
2609 if (rec->off_cpu)
2610 evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2611
2612 /*
2613 * Let the child rip
2614 */
2615 if (forks) {
2616 struct machine *machine = &session->machines.host;
2617 union perf_event *event;
2618 pid_t tgid;
2619
2620 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2621 if (event == NULL) {
2622 err = -ENOMEM;
2623 goto out_child;
2624 }
2625
2626 /*
2627 * Some H/W events are generated before COMM event
2628 * which is emitted during exec(), so perf script
2629 * cannot see a correct process name for those events.
2630 * Synthesize COMM event to prevent it.
2631 */
2632 tgid = perf_event__synthesize_comm(tool, event,
2633 rec->evlist->workload.pid,
2634 process_synthesized_event,
2635 machine);
2636 free(event);
2637
2638 if (tgid == -1)
2639 goto out_child;
2640
2641 event = malloc(sizeof(event->namespaces) +
2642 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2643 machine->id_hdr_size);
2644 if (event == NULL) {
2645 err = -ENOMEM;
2646 goto out_child;
2647 }
2648
2649 /*
2650 * Synthesize NAMESPACES event for the command specified.
2651 */
2652 perf_event__synthesize_namespaces(tool, event,
2653 rec->evlist->workload.pid,
2654 tgid, process_synthesized_event,
2655 machine);
2656 free(event);
2657
2658 evlist__start_workload(rec->evlist);
2659 }
2660
2661 if (opts->target.initial_delay) {
2662 pr_info(EVLIST_DISABLED_MSG);
2663 if (opts->target.initial_delay > 0) {
2664 usleep(opts->target.initial_delay * USEC_PER_MSEC);
2665 evlist__enable(rec->evlist);
2666 pr_info(EVLIST_ENABLED_MSG);
2667 }
2668 }
2669
2670 err = event_enable_timer__start(rec->evlist->eet);
2671 if (err)
2672 goto out_child;
2673
2674 /* Debug message used by test scripts */
2675 pr_debug3("perf record has started\n");
2676 fflush(stderr);
2677
2678 trigger_ready(&auxtrace_snapshot_trigger);
2679 trigger_ready(&switch_output_trigger);
2680 perf_hooks__invoke_record_start();
2681
2682 /*
2683 * Must write FINISHED_INIT so it will be seen after all other
2684 * synthesized user events, but before any regular events.
2685 */
2686 err = write_finished_init(rec, false);
2687 if (err < 0)
2688 goto out_child;
2689
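/*
 * Main dispatch loop: flush the ring buffers, service auxtrace snapshot
 * and switch-output triggers, poll when no new samples arrived and
 * handle control fd commands until recording is done and the buffers
 * are drained.
 */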
2690 for (;;) {
2691 unsigned long long hits = thread->samples;
2692
2693 /*
2694 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2695 * here: when done == true and hits != rec->samples
2696 * in the previous round.
2697 *
2698 * evlist__toggle_bkw_mmap() ensures we never convert
2699 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2700 */
2701 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2702 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2703
2704 if (record__mmap_read_all(rec, false) < 0) {
2705 trigger_error(&auxtrace_snapshot_trigger);
2706 trigger_error(&switch_output_trigger);
2707 err = -1;
2708 goto out_child;
2709 }
2710
2711 if (auxtrace_record__snapshot_started) {
2712 auxtrace_record__snapshot_started = 0;
2713 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2714 record__read_auxtrace_snapshot(rec, false);
2715 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2716 pr_err("AUX area tracing snapshot failed\n");
2717 err = -1;
2718 goto out_child;
2719 }
2720 }
2721
2722 if (trigger_is_hit(&switch_output_trigger)) {
2723 /*
2724 * If switch_output_trigger is hit, the data in the
2725 * overwritable ring buffer should have been collected,
2726 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2727 *
2728 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2729 * record__mmap_read_all() didn't collect data from the
2730 * overwritable ring buffer. Read again.
2731 */
2732 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2733 continue;
2734 trigger_ready(&switch_output_trigger);
2735
2736 /*
2737 * Reenable events in overwrite ring buffer after
2738 * record__mmap_read_all(): we should have collected
2739 * data from it.
2740 */
2741 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2742
2743 if (!quiet)
2744 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2745 record__waking(rec));
2746 thread->waking = 0;
2747 fd = record__switch_output(rec, false);
2748 if (fd < 0) {
2749 pr_err("Failed to switch to new file\n");
2750 trigger_error(&switch_output_trigger);
2751 err = fd;
2752 goto out_child;
2753 }
2754
2755 /* re-arm the alarm */
2756 if (rec->switch_output.time)
2757 alarm(rec->switch_output.time);
2758 }
2759
2760 if (hits == thread->samples) {
2761 if (done || draining)
2762 break;
2763 err = fdarray__poll(&thread->pollfd, -1);
2764 /*
2765 * Propagate the error only if there is one; a positive
2766 * number of returned events and EINTR are ignored.
2767 */
2768 if (err > 0 || (err < 0 && errno == EINTR))
2769 err = 0;
2770 thread->waking++;
2771
2772 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2773 record__thread_munmap_filtered, NULL) == 0)
2774 draining = true;
2775
2776 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2777 if (err)
2778 goto out_child;
2779 }
2780
2781 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2782 switch (cmd) {
2783 case EVLIST_CTL_CMD_SNAPSHOT:
2784 hit_auxtrace_snapshot_trigger(rec);
2785 evlist__ctlfd_ack(rec->evlist);
2786 break;
2787 case EVLIST_CTL_CMD_STOP:
2788 done = 1;
2789 break;
2790 case EVLIST_CTL_CMD_ACK:
2791 case EVLIST_CTL_CMD_UNSUPPORTED:
2792 case EVLIST_CTL_CMD_ENABLE:
2793 case EVLIST_CTL_CMD_DISABLE:
2794 case EVLIST_CTL_CMD_EVLIST:
2795 case EVLIST_CTL_CMD_PING:
2796 default:
2797 break;
2798 }
2799 }
2800
2801 err = event_enable_timer__process(rec->evlist->eet);
2802 if (err < 0)
2803 goto out_child;
2804 if (err) {
2805 err = 0;
2806 done = 1;
2807 }
2808
2809 /*
2810 * When perf is starting the traced process, at the end events
2811 * die with the process and we wait for that. Thus no need to
2812 * disable events in this case.
2813 */
2814 if (done && !disabled && !target__none(&opts->target)) {
2815 trigger_off(&auxtrace_snapshot_trigger);
2816 evlist__disable(rec->evlist);
2817 disabled = true;
2818 }
2819 }
2820
2821 trigger_off(&auxtrace_snapshot_trigger);
2822 trigger_off(&switch_output_trigger);
2823
2824 record__synthesize_final_bpf_metadata(rec);
2825
2826 if (opts->auxtrace_snapshot_on_exit)
2827 record__auxtrace_snapshot_exit(rec);
2828
2829 if (forks && workload_exec_errno) {
2830 char msg[STRERR_BUFSIZE];
2831 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2832 struct strbuf sb = STRBUF_INIT;
2833
2834 evlist__format_evsels(rec->evlist, &sb, 2048);
2835
2836 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2837 sb.buf, argv[0], emsg);
2838 strbuf_release(&sb);
2839 err = -1;
2840 goto out_child;
2841 }
2842
2843 if (!quiet)
2844 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2845 record__waking(rec));
2846
2847 write_finished_init(rec, true);
2848
2849 if (target__none(&rec->opts.target))
2850 record__synthesize_workload(rec, true);
2851
2852 out_child:
2853 record__stop_threads(rec);
2854 record__mmap_read_all(rec, true);
2855 out_free_threads:
2856 record__free_thread_data(rec);
2857 evlist__finalize_ctlfd(rec->evlist);
2858 record__aio_mmap_read_sync(rec);
2859
2860 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2861 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2862 env->comp_ratio = ratio + 0.5;
2863 }
2864
2865 if (forks) {
2866 int exit_status;
2867
2868 if (!child_finished)
2869 kill(rec->evlist->workload.pid, SIGTERM);
2870
2871 wait(&exit_status);
2872
2873 if (err < 0)
2874 status = err;
2875 else if (WIFEXITED(exit_status))
2876 status = WEXITSTATUS(exit_status);
2877 else if (WIFSIGNALED(exit_status))
2878 signr = WTERMSIG(exit_status);
2879 } else
2880 status = err;
2881
2882 if (rec->off_cpu)
2883 rec->bytes_written += off_cpu_write(rec->session);
2884
2885 record__read_lost_samples(rec);
2886 record__synthesize(rec, true);
2887 /* this will be recalculated during process_buildids() */
2888 rec->samples = 0;
2889
2890 if (!err) {
2891 if (!rec->timestamp_filename) {
2892 record__finish_output(rec);
2893 } else {
2894 fd = record__switch_output(rec, true);
2895 if (fd < 0) {
2896 status = fd;
2897 goto out_delete_session;
2898 }
2899 }
2900 }
2901
2902 perf_hooks__invoke_record_end();
2903
2904 if (!err && !quiet) {
2905 char samples[128];
2906 const char *postfix = rec->timestamp_filename ?
2907 ".<timestamp>" : "";
2908
2909 if (rec->samples && !rec->opts.full_auxtrace)
2910 scnprintf(samples, sizeof(samples),
2911 " (%" PRIu64 " samples)", rec->samples);
2912 else
2913 samples[0] = '\0';
2914
2915 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2916 perf_data__size(data) / 1024.0 / 1024.0,
2917 data->path, postfix, samples);
2918 if (ratio) {
2919 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2920 rec->session->bytes_transferred / 1024.0 / 1024.0,
2921 ratio);
2922 }
2923 fprintf(stderr, " ]\n");
2924 }
2925
2926 out_delete_session:
2927 #ifdef HAVE_EVENTFD_SUPPORT
2928 if (done_fd >= 0) {
2929 fd = done_fd;
2930 done_fd = -1;
2931
2932 close(fd);
2933 }
2934 #endif
2935 zstd_fini(&session->zstd_data);
2936 if (!opts->no_bpf_event)
2937 evlist__stop_sb_thread(rec->sb_evlist);
2938
2939 perf_session__delete(session);
2940 return status;
2941 }
2942
2943 static void callchain_debug(struct callchain_param *callchain)
2944 {
2945 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2946
2947 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2948
2949 if (callchain->record_mode == CALLCHAIN_DWARF)
2950 pr_debug("callchain: stack dump size %d\n",
2951 callchain->dump_size);
2952 }
2953
2954 int record_opts__parse_callchain(struct record_opts *record,
2955 struct callchain_param *callchain,
2956 const char *arg, bool unset)
2957 {
2958 int ret;
2959 callchain->enabled = !unset;
2960
2961 /* --no-call-graph */
2962 if (unset) {
2963 callchain->record_mode = CALLCHAIN_NONE;
2964 pr_debug("callchain: disabled\n");
2965 return 0;
2966 }
2967
2968 ret = parse_callchain_record_opt(arg, callchain);
2969 if (!ret) {
2970 /* Enable data address sampling for DWARF unwind. */
2971 if (callchain->record_mode == CALLCHAIN_DWARF)
2972 record->sample_address = true;
2973 callchain_debug(callchain);
2974 }
2975
2976 return ret;
2977 }
2978
2979 int record_parse_callchain_opt(const struct option *opt,
2980 const char *arg,
2981 int unset)
2982 {
2983 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2984 }
2985
2986 int record_callchain_opt(const struct option *opt,
2987 const char *arg __maybe_unused,
2988 int unset __maybe_unused)
2989 {
2990 struct callchain_param *callchain = opt->value;
2991
2992 callchain->enabled = true;
2993
2994 if (callchain->record_mode == CALLCHAIN_NONE)
2995 callchain->record_mode = CALLCHAIN_FP;
2996
2997 callchain_debug(callchain);
2998 return 0;
2999 }
3000
3001 static int perf_record_config(const char *var, const char *value, void *cb)
3002 {
3003 struct record *rec = cb;
3004
3005 if (!strcmp(var, "record.build-id")) {
3006 if (!strcmp(value, "cache"))
3007 rec->no_buildid_cache = false;
3008 else if (!strcmp(value, "no-cache"))
3009 rec->no_buildid_cache = true;
3010 else if (!strcmp(value, "skip"))
3011 rec->no_buildid = true;
3012 else if (!strcmp(value, "mmap"))
3013 rec->buildid_mmap = true;
3014 else if (!strcmp(value, "no-mmap"))
3015 rec->buildid_mmap = false;
3016 else
3017 return -1;
3018 return 0;
3019 }
3020 if (!strcmp(var, "record.call-graph")) {
3021 var = "call-graph.record-mode";
3022 return perf_default_config(var, value, cb);
3023 }
3024 #ifdef HAVE_AIO_SUPPORT
3025 if (!strcmp(var, "record.aio")) {
3026 rec->opts.nr_cblocks = strtol(value, NULL, 0);
3027 if (!rec->opts.nr_cblocks)
3028 rec->opts.nr_cblocks = nr_cblocks_default;
3029 }
3030 #endif
3031 if (!strcmp(var, "record.debuginfod")) {
3032 rec->debuginfod.urls = strdup(value);
3033 if (!rec->debuginfod.urls)
3034 return -ENOMEM;
3035 rec->debuginfod.set = true;
3036 }
3037
3038 return 0;
3039 }
3040
3041 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3042 {
3043 struct record *rec = (struct record *)opt->value;
3044
3045 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3046 }
3047
3048 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3049 {
3050 struct record_opts *opts = (struct record_opts *)opt->value;
3051
3052 if (unset || !str)
3053 return 0;
3054
3055 if (!strcasecmp(str, "node"))
3056 opts->affinity = PERF_AFFINITY_NODE;
3057 else if (!strcasecmp(str, "cpu"))
3058 opts->affinity = PERF_AFFINITY_CPU;
3059
3060 return 0;
3061 }
3062
3063 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3064 {
3065 mask->nbits = nr_bits;
3066 mask->bits = bitmap_zalloc(mask->nbits);
3067 if (!mask->bits)
3068 return -ENOMEM;
3069
3070 return 0;
3071 }
3072
3073 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3074 {
3075 bitmap_free(mask->bits);
3076 mask->nbits = 0;
3077 }
3078
3079 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3080 {
3081 int ret;
3082
3083 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3084 if (ret) {
3085 mask->affinity.bits = NULL;
3086 return ret;
3087 }
3088
3089 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3090 if (ret) {
3091 record__mmap_cpu_mask_free(&mask->maps);
3092 mask->maps.bits = NULL;
3093 }
3094
3095 return ret;
3096 }
3097
3098 static void record__thread_mask_free(struct thread_mask *mask)
3099 {
3100 record__mmap_cpu_mask_free(&mask->maps);
3101 record__mmap_cpu_mask_free(&mask->affinity);
3102 }
3103
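/*
 * Parse the --threads specification: an empty value selects per-CPU
 * streaming threads, "cpu", "core", "package" or "numa" select one of
 * the predefined mask layouts, and anything else is kept as a
 * user-defined spec. Illustrative invocations, assuming the usual
 * option spelling:
 *
 *   perf record --threads -a sleep 1          # one thread per CPU
 *   perf record --threads=numa -a sleep 1     # one thread per NUMA node
 */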
3104 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3105 {
3106 int s;
3107 struct record_opts *opts = opt->value;
3108
3109 if (unset || !str || !strlen(str)) {
3110 opts->threads_spec = THREAD_SPEC__CPU;
3111 } else {
3112 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3113 if (s == THREAD_SPEC__USER) {
3114 opts->threads_user_spec = strdup(str);
3115 if (!opts->threads_user_spec)
3116 return -ENOMEM;
3117 opts->threads_spec = THREAD_SPEC__USER;
3118 break;
3119 }
3120 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3121 opts->threads_spec = s;
3122 break;
3123 }
3124 }
3125 }
3126
3127 if (opts->threads_spec == THREAD_SPEC__USER)
3128 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3129 else
3130 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3131
3132 return 0;
3133 }
3134
3135 static int parse_output_max_size(const struct option *opt,
3136 const char *str, int unset)
3137 {
3138 unsigned long *s = (unsigned long *)opt->value;
3139 static struct parse_tag tags_size[] = {
3140 { .tag = 'B', .mult = 1 },
3141 { .tag = 'K', .mult = 1 << 10 },
3142 { .tag = 'M', .mult = 1 << 20 },
3143 { .tag = 'G', .mult = 1 << 30 },
3144 { .tag = 0 },
3145 };
3146 unsigned long val;
3147
3148 if (unset) {
3149 *s = 0;
3150 return 0;
3151 }
3152
3153 val = parse_tag_value(str, tags_size);
3154 if (val != (unsigned long) -1) {
3155 *s = val;
3156 return 0;
3157 }
3158
3159 return -1;
3160 }
3161
3162 static int record__parse_mmap_pages(const struct option *opt,
3163 const char *str,
3164 int unset __maybe_unused)
3165 {
3166 struct record_opts *opts = opt->value;
3167 char *s, *p;
3168 unsigned int mmap_pages;
3169 int ret;
3170
3171 if (!str)
3172 return -EINVAL;
3173
3174 s = strdup(str);
3175 if (!s)
3176 return -ENOMEM;
3177
3178 p = strchr(s, ',');
3179 if (p)
3180 *p = '\0';
3181
3182 if (*s) {
3183 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3184 if (ret)
3185 goto out_free;
3186 opts->mmap_pages = mmap_pages;
3187 }
3188
3189 if (!p) {
3190 ret = 0;
3191 goto out_free;
3192 }
3193
3194 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3195 if (ret)
3196 goto out_free;
3197
3198 opts->auxtrace_mmap_pages = mmap_pages;
3199
3200 out_free:
3201 free(s);
3202 return ret;
3203 }
3204
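/*
 * The off-CPU threshold is given in milliseconds on the command line
 * and stored internally in nanoseconds. A hypothetical example,
 * assuming the option is wired up as --off-cpu-thresh further down in
 * the options table:
 *
 *   perf record --off-cpu --off-cpu-thresh=500 -- ./workload
 */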
3205 static int record__parse_off_cpu_thresh(const struct option *opt,
3206 const char *str,
3207 int unset __maybe_unused)
3208 {
3209 struct record_opts *opts = opt->value;
3210 char *endptr;
3211 u64 off_cpu_thresh_ms;
3212
3213 if (!str)
3214 return -EINVAL;
3215
3216 off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3217
3218 /* the threshold isn't the string "0", yet strtoull() returned 0: parsing failed */
3219 if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3220 return -EINVAL;
3221 else
3222 opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3223
3224 return 0;
3225 }
3226
3227 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3228 {
3229 }
3230
3231 static int parse_control_option(const struct option *opt,
3232 const char *str,
3233 int unset __maybe_unused)
3234 {
3235 struct record_opts *opts = opt->value;
3236
3237 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3238 }
3239
3240 static void switch_output_size_warn(struct record *rec)
3241 {
3242 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3243 struct switch_output *s = &rec->switch_output;
3244
3245 wakeup_size /= 2;
3246
3247 if (s->size < wakeup_size) {
3248 char buf[100];
3249
3250 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3251 pr_warning("WARNING: switch-output data size is lower than the "
3252 "wakeup kernel buffer size (%s), "
3253 "expect bigger perf.data sizes\n", buf);
3254 }
3255 }
3256
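/*
 * Decide how --switch-output rotates the output: on SIGUSR2 ("signal"),
 * after a size threshold (B/K/M/G suffixes) or after a time threshold
 * (s/m/h/d suffixes); --switch-output-event implies the signal variant.
 * Illustrative uses, assuming a stock perf build:
 *
 *   perf record --switch-output=signal -a     # rotate on SIGUSR2
 *   perf record --switch-output=100M -a       # rotate every ~100 MB
 *   perf record --switch-output=30s -a        # rotate every 30 seconds
 */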
3257 static int switch_output_setup(struct record *rec)
3258 {
3259 struct switch_output *s = &rec->switch_output;
3260 static struct parse_tag tags_size[] = {
3261 { .tag = 'B', .mult = 1 },
3262 { .tag = 'K', .mult = 1 << 10 },
3263 { .tag = 'M', .mult = 1 << 20 },
3264 { .tag = 'G', .mult = 1 << 30 },
3265 { .tag = 0 },
3266 };
3267 static struct parse_tag tags_time[] = {
3268 { .tag = 's', .mult = 1 },
3269 { .tag = 'm', .mult = 60 },
3270 { .tag = 'h', .mult = 60*60 },
3271 { .tag = 'd', .mult = 60*60*24 },
3272 { .tag = 0 },
3273 };
3274 unsigned long val;
3275
3276 /*
3277 * If we're using --switch-output-event, then we imply
3278 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3279 * thread to its parent.
3280 */
3281 if (rec->switch_output_event_set) {
3282 if (record__threads_enabled(rec)) {
3283 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3284 return 0;
3285 }
3286 goto do_signal;
3287 }
3288
3289 if (!s->set)
3290 return 0;
3291
3292 if (record__threads_enabled(rec)) {
3293 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3294 return 0;
3295 }
3296
3297 if (!strcmp(s->str, "signal")) {
3298 do_signal:
3299 s->signal = true;
3300 pr_debug("switch-output with SIGUSR2 signal\n");
3301 goto enabled;
3302 }
3303
3304 val = parse_tag_value(s->str, tags_size);
3305 if (val != (unsigned long) -1) {
3306 s->size = val;
3307 pr_debug("switch-output with %s size threshold\n", s->str);
3308 goto enabled;
3309 }
3310
3311 val = parse_tag_value(s->str, tags_time);
3312 if (val != (unsigned long) -1) {
3313 s->time = val;
3314 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3315 s->str, s->time);
3316 goto enabled;
3317 }
3318
3319 return -1;
3320
3321 enabled:
3322 rec->timestamp_filename = true;
3323 s->enabled = true;
3324
3325 if (s->size && !rec->opts.no_buffering)
3326 switch_output_size_warn(rec);
3327
3328 return 0;
3329 }
3330
3331 static const char * const __record_usage[] = {
3332 "perf record [<options>] [<command>]",
3333 "perf record [<options>] -- <command> [<options>]",
3334 NULL
3335 };
3336 const char * const *record_usage = __record_usage;
3337
3338 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3339 struct perf_sample *sample, struct machine *machine)
3340 {
3341 /*
3342 * We already have the kernel maps, put in place via perf_session__create_kernel_maps();
3343 * no need to add them twice.
3344 */
3345 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3346 return 0;
3347 return perf_event__process_mmap(tool, event, sample, machine);
3348 }
3349
3350 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3351 struct perf_sample *sample, struct machine *machine)
3352 {
3353 /*
3354 * We already have the kernel maps, put in place via perf_session__create_kernel_maps();
3355 * no need to add them twice.
3356 */
3357 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3358 return 0;
3359
3360 return perf_event__process_mmap2(tool, event, sample, machine);
3361 }
3362
3363 static int process_timestamp_boundary(const struct perf_tool *tool,
3364 union perf_event *event __maybe_unused,
3365 struct perf_sample *sample,
3366 struct machine *machine __maybe_unused)
3367 {
3368 struct record *rec = container_of(tool, struct record, tool);
3369
3370 set_timestamp_boundary(rec, sample->time);
3371 return 0;
3372 }
3373
3374 static int parse_record_synth_option(const struct option *opt,
3375 const char *str,
3376 int unset __maybe_unused)
3377 {
3378 struct record_opts *opts = opt->value;
3379 char *p = strdup(str);
3380
3381 if (p == NULL)
3382 return -1;
3383
3384 opts->synth = parse_synth_opt(p);
3385 free(p);
3386
3387 if (opts->synth < 0) {
3388 pr_err("Invalid synth option: %s\n", str);
3389 return -1;
3390 }
3391 return 0;
3392 }
3393
3394 /*
3395 * XXX Ideally this would be local to cmd_record() and passed to a record__new,
3396 * because we need to have access to it in record__exit, which is called
3397 * after cmd_record() exits, but since record_options need to be accessible to
3398 * builtin-script, leave it here.
3399 *
3400 * At least we don't ouch it in all the other functions here directly.
3401 *
3402 * Just say no to tons of global variables, sigh.
3403 */
3404 static struct record record = {
3405 .opts = {
3406 .sample_time = true,
3407 .mmap_pages = UINT_MAX,
3408 .user_freq = UINT_MAX,
3409 .user_interval = ULLONG_MAX,
3410 .freq = 4000,
3411 .target = {
3412 .uses_mmap = true,
3413 .default_per_cpu = true,
3414 },
3415 .mmap_flush = MMAP_FLUSH_DEFAULT,
3416 .nr_threads_synthesize = 1,
3417 .ctl_fd = -1,
3418 .ctl_fd_ack = -1,
3419 .synth = PERF_SYNTH_ALL,
3420 .off_cpu_thresh_ns = OFFCPU_THRESH,
3421 },
3422 .buildid_mmap = true,
3423 };
3424
3425 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3426 "\n\t\t\t\tDefault: fp";
3427
3428 static bool dry_run;
3429
3430 static struct parse_events_option_args parse_events_option_args = {
3431 .evlistp = &record.evlist,
3432 };
3433
3434 static struct parse_events_option_args switch_output_parse_events_option_args = {
3435 .evlistp = &record.sb_evlist,
3436 };
3437
3438 /*
3439 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3440 * with it and switch to use the library functions in perf_evlist that came
3441 * from builtin-record.c, i.e. use record_opts,
3442 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3443 * using pipes, etc.
3444 */
3445 static struct option __record_options[] = {
3446 OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3447 "event selector. use 'perf list' to list available events",
3448 parse_events_option),
3449 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3450 "event filter", parse_filter),
3451 OPT_BOOLEAN(0, "latency", &record.latency,
3452 "Enable data collection for latency profiling.\n"
3453 "\t\t\t Use perf report --latency for latency-centric profile."),
3454 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3455 NULL, "don't record events from perf itself",
3456 exclude_perf),
3457 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3458 "record events on existing process id"),
3459 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3460 "record events on existing thread id"),
3461 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3462 "collect data with this RT SCHED_FIFO priority"),
3463 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3464 "collect data without buffering"),
3465 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3466 "collect raw sample records from all opened counters"),
3467 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3468 "system-wide collection from all CPUs"),
3469 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3470 "list of cpus to monitor"),
3471 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3472 OPT_STRING('o', "output", &record.data.path, "file",
3473 "output file name"),
3474 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3475 &record.opts.no_inherit_set,
3476 "child tasks do not inherit counters"),
3477 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3478 "synthesize non-sample events at the end of output"),
3479 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3480 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3481 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3482 "Fail if the specified frequency can't be used"),
3483 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3484 "profile at this frequency",
3485 record__parse_freq),
3486 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3487 "number of mmap data pages and AUX area tracing mmap pages",
3488 record__parse_mmap_pages),
3489 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3490 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3491 record__mmap_flush_parse),
3492 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3493 NULL, "enables call-graph recording" ,
3494 &record_callchain_opt),
3495 OPT_CALLBACK(0, "call-graph", &record.opts,
3496 "record_mode[,record_size]", record_callchain_help,
3497 &record_parse_callchain_opt),
3498 OPT_INCR('v', "verbose", &verbose,
3499 "be more verbose (show counter open errors, etc)"),
3500 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3501 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3502 "per thread counts"),
3503 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3504 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3505 "Record the sample physical addresses"),
3506 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3507 "Record the sampled data address data page size"),
3508 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3509 "Record the sampled code address (ip) page size"),
3510 OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3511 "Record the data source for memory operations"),
3512 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3513 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3514 "Record the sample identifier"),
3515 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3516 &record.opts.sample_time_set,
3517 "Record the sample timestamps"),
3518 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3519 "Record the sample period"),
3520 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3521 "don't sample"),
3522 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3523 &record.no_buildid_cache_set,
3524 "do not update the buildid cache"),
3525 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3526 &record.no_buildid_set,
3527 "do not collect buildids in perf.data"),
3528 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3529 "monitor event in cgroup name only",
3530 parse_cgroups),
3531 OPT_CALLBACK('D', "delay", &record, "ms",
3532 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3533 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3534 record__parse_event_enable_time),
3535 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3536 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3537
3538 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3539 "branch any", "sample any taken branches",
3540 parse_branch_stack),
3541
3542 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3543 "branch filter mask", "branch stack filter modes",
3544 parse_branch_stack),
3545 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3546 "sample by weight (on special events only)"),
3547 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3548 "sample transaction flags (special events only)"),
3549 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3550 "use per-thread mmaps"),
3551 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3552 "sample selected machine registers on interrupt,"
3553 " use '-I?' to list register names", parse_intr_regs),
3554 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3555 "sample selected machine registers in user space,"
3556 " use '--user-regs=?' to list register names", parse_user_regs),
3557 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3558 "Record running/enabled time of read (:S) events"),
3559 OPT_CALLBACK('k', "clockid", &record.opts,
3560 "clockid", "clockid to use for events, see clock_gettime()",
3561 parse_clockid),
3562 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3563 "opts", "AUX area tracing Snapshot Mode", ""),
3564 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3565 "opts", "sample AUX area", ""),
3566 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3567 "per thread proc mmap processing timeout in ms"),
3568 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3569 "Record namespaces events"),
3570 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3571 "Record cgroup events"),
3572 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3573 &record.opts.record_switch_events_set,
3574 "Record context switch events"),
3575 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3576 "Configure all used events to run in kernel space.",
3577 PARSE_OPT_EXCLUSIVE),
3578 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3579 "Configure all used events to run in user space.",
3580 PARSE_OPT_EXCLUSIVE),
3581 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3582 "collect kernel callchains"),
3583 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3584 "collect user callchains"),
3585 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3586 "file", "vmlinux pathname"),
3587 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3588 "Record build-id of all DSOs regardless of hits"),
3589 OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
3590 "Record build-id in mmap events and skip build-id processing."),
3591 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3592 "append timestamp to output filename"),
3593 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3594 "Record timestamp boundary (time of first/last samples)"),
3595 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3596 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3597 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3598 "signal"),
3599 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3600 &record.switch_output_event_set, "switch output event",
3601 "switch output event selector. use 'perf list' to list available events",
3602 parse_events_option_new_evlist),
3603 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3604 "Limit number of switch output generated files"),
3605 OPT_BOOLEAN(0, "dry-run", &dry_run,
3606 "Parse options then exit"),
3607 #ifdef HAVE_AIO_SUPPORT
3608 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3609 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3610 record__aio_parse),
3611 #endif
3612 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3613 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3614 record__parse_affinity),
3615 #ifdef HAVE_ZSTD_SUPPORT
3616 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3617 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3618 record__parse_comp_level),
3619 #endif
3620 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3621 "size", "Limit the maximum size of the output file", parse_output_max_size),
3622 OPT_UINTEGER(0, "num-thread-synthesize",
3623 &record.opts.nr_threads_synthesize,
3624 "number of threads to run for event synthesis"),
3625 #ifdef HAVE_LIBPFM
3626 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3627 "libpfm4 event selector. use 'perf list' to list available events",
3628 parse_libpfm_events_option),
3629 #endif
3630 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3631 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3632 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3633 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3634 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3635 parse_control_option),
3636 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3637 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3638 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3639 &record.debuginfod.set, "debuginfod urls",
3640 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3641 "system"),
3642 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3643 "write collected trace data into several data files using parallel threads",
3644 record__parse_threads),
3645 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3646 OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3647 "BPF filter action"),
3648 OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3649 "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3650 record__parse_off_cpu_thresh),
3651 OPT_END()
3652 };
3653
3654 struct option *record_options = __record_options;
3655
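/*
 * Set a bit in 'mask' for every CPU in 'cpus'. A dummy CPU map leaves the
 * mask untouched; a CPU index beyond the mask size yields -ENODEV.
 */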
3656 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3657 {
3658 struct perf_cpu cpu;
3659 int idx;
3660
3661 if (cpu_map__is_dummy(cpus))
3662 return 0;
3663
3664 perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3665 /* Return -ENODEV if input cpu is greater than max cpu */
3666 if ((unsigned long)cpu.cpu > mask->nbits)
3667 return -ENODEV;
3668 __set_bit(cpu.cpu, mask->bits);
3669 }
3670
3671 return 0;
3672 }
3673
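/*
 * Parse a CPU list specification (e.g. "0-3,7") and set the corresponding
 * bits in 'mask', zeroing it first.
 */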
3674 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3675 {
3676 struct perf_cpu_map *cpus;
3677
3678 cpus = perf_cpu_map__new(mask_spec);
3679 if (!cpus)
3680 return -ENOMEM;
3681
3682 bitmap_zero(mask->bits, mask->nbits);
3683 if (record__mmap_cpu_mask_init(mask, cpus)) {
/* Avoid leaking the CPU map when mask initialization fails. */
perf_cpu_map__put(cpus);
3684 return -ENODEV;
}
3685
3686 perf_cpu_map__put(cpus);
3687
3688 return 0;
3689 }
3690
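/* Free the per-thread maps/affinity masks and the thread_masks array itself. */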
3691 static void record__free_thread_masks(struct record *rec, int nr_threads)
3692 {
3693 int t;
3694
3695 if (rec->thread_masks)
3696 for (t = 0; t < nr_threads; t++)
3697 record__thread_mask_free(&rec->thread_masks[t]);
3698
3699 zfree(&rec->thread_masks);
3700 }
3701
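/*
 * Allocate 'nr_threads' thread masks, each sized to hold 'nr_bits' CPUs.
 * On failure everything allocated so far is freed again.
 */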
3702 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3703 {
3704 int t, ret;
3705
3706 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3707 if (!rec->thread_masks) {
3708 pr_err("Failed to allocate thread masks\n");
3709 return -ENOMEM;
3710 }
3711
3712 for (t = 0; t < nr_threads; t++) {
3713 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3714 if (ret) {
3715 pr_err("Failed to allocate thread masks[%d]\n", t);
3716 goto out_free;
3717 }
3718 }
3719
3720 return 0;
3721
3722 out_free:
3723 record__free_thread_masks(rec, nr_threads);
3724
3725 return ret;
3726 }
3727
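/*
 * --threads=cpu: one data streaming thread per CPU in 'cpus', with both the
 * maps and the affinity mask reduced to that single CPU.
 */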
3728 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3729 {
3730 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3731
3732 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3733 if (ret)
3734 return ret;
3735
3736 rec->nr_threads = nr_cpus;
3737 pr_debug("nr_threads: %d\n", rec->nr_threads);
3738
3739 for (t = 0; t < rec->nr_threads; t++) {
3740 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3741 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3742 if (verbose > 0) {
3743 pr_debug("thread_masks[%d]: ", t);
3744 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3745 pr_debug("thread_masks[%d]: ", t);
3746 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3747 }
3748 }
3749
3750 return 0;
3751 }
3752
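/*
 * Build per-thread maps/affinity masks from parallel arrays of CPU list
 * specs. CPUs outside the evlist's CPU map are ignored, but empty masks and
 * masks overlapping a previously accepted spec are rejected.
 */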
3753 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3754 const char **maps_spec, const char **affinity_spec,
3755 u32 nr_spec)
3756 {
3757 u32 s;
3758 int ret = 0, t = 0;
3759 struct mmap_cpu_mask cpus_mask;
3760 struct thread_mask thread_mask, full_mask, *thread_masks;
3761
3762 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3763 if (ret) {
3764 pr_err("Failed to allocate CPUs mask\n");
3765 return ret;
3766 }
3767
3768 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3769 if (ret) {
3770 pr_err("Failed to init cpu mask\n");
3771 goto out_free_cpu_mask;
3772 }
3773
3774 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3775 if (ret) {
3776 pr_err("Failed to allocate full mask\n");
3777 goto out_free_cpu_mask;
3778 }
3779
3780 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3781 if (ret) {
3782 pr_err("Failed to allocate thread mask\n");
3783 goto out_free_full_and_cpu_masks;
3784 }
3785
3786 for (s = 0; s < nr_spec; s++) {
3787 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3788 if (ret) {
3789 pr_err("Failed to initialize maps thread mask\n");
3790 goto out_free;
3791 }
3792 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3793 if (ret) {
3794 pr_err("Failed to initialize affinity thread mask\n");
3795 goto out_free;
3796 }
3797
3798 /* ignore invalid CPUs but do not allow empty masks */
3799 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3800 cpus_mask.bits, thread_mask.maps.nbits)) {
3801 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3802 ret = -EINVAL;
3803 goto out_free;
3804 }
3805 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3806 cpus_mask.bits, thread_mask.affinity.nbits)) {
3807 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3808 ret = -EINVAL;
3809 goto out_free;
3810 }
3811
3812 /* do not allow intersection with other masks (full_mask) */
3813 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3814 thread_mask.maps.nbits)) {
3815 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3816 ret = -EINVAL;
3817 goto out_free;
3818 }
3819 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3820 thread_mask.affinity.nbits)) {
3821 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3822 ret = -EINVAL;
3823 goto out_free;
3824 }
3825
3826 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3827 thread_mask.maps.bits, full_mask.maps.nbits);
3828 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3829 thread_mask.affinity.bits, full_mask.affinity.nbits);
3830
3831 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3832 if (!thread_masks) {
3833 pr_err("Failed to reallocate thread masks\n");
3834 ret = -ENOMEM;
3835 goto out_free;
3836 }
3837 rec->thread_masks = thread_masks;
3838 rec->thread_masks[t] = thread_mask;
3839 if (verbose > 0) {
3840 pr_debug("thread_masks[%d]: ", t);
3841 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3842 pr_debug("thread_masks[%d]: ", t);
3843 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3844 }
3845 t++;
3846 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3847 if (ret) {
3848 pr_err("Failed to allocate thread mask\n");
3849 goto out_free_full_and_cpu_masks;
3850 }
3851 }
3852 rec->nr_threads = t;
3853 pr_debug("nr_threads: %d\n", rec->nr_threads);
3854 if (!rec->nr_threads)
3855 ret = -EINVAL;
3856
3857 out_free:
3858 record__thread_mask_free(&thread_mask);
3859 out_free_full_and_cpu_masks:
3860 record__thread_mask_free(&full_mask);
3861 out_free_cpu_mask:
3862 record__mmap_cpu_mask_free(&cpus_mask);
3863
3864 return ret;
3865 }
3866
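/* --threads=core: one data streaming thread per core, taken from the CPU topology. */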
3867 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3868 {
3869 int ret;
3870 struct cpu_topology *topo;
3871
3872 topo = cpu_topology__new();
3873 if (!topo) {
3874 pr_err("Failed to allocate CPU topology\n");
3875 return -ENOMEM;
3876 }
3877
3878 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3879 topo->core_cpus_list, topo->core_cpus_lists);
3880 cpu_topology__delete(topo);
3881
3882 return ret;
3883 }
3884
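/* --threads=package: one data streaming thread per processor package (socket). */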
3885 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3886 {
3887 int ret;
3888 struct cpu_topology *topo;
3889
3890 topo = cpu_topology__new();
3891 if (!topo) {
3892 pr_err("Failed to allocate CPU topology\n");
3893 return -ENOMEM;
3894 }
3895
3896 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3897 topo->package_cpus_list, topo->package_cpus_lists);
3898 cpu_topology__delete(topo);
3899
3900 return ret;
3901 }
3902
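/* --threads=numa: one data streaming thread per NUMA node. */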
3903 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3904 {
3905 u32 s;
3906 int ret;
3907 const char **spec;
3908 struct numa_topology *topo;
3909
3910 topo = numa_topology__new();
3911 if (!topo) {
3912 pr_err("Failed to allocate NUMA topology\n");
3913 return -ENOMEM;
3914 }
3915
3916 spec = zalloc(topo->nr * sizeof(char *));
3917 if (!spec) {
3918 pr_err("Failed to allocate NUMA spec\n");
3919 ret = -ENOMEM;
3920 goto out_delete_topo;
3921 }
3922 for (s = 0; s < topo->nr; s++)
3923 spec[s] = topo->nodes[s].cpus;
3924
3925 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3926
3927 zfree(&spec);
3928
3929 out_delete_topo:
3930 numa_topology__delete(topo);
3931
3932 return ret;
3933 }
3934
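/*
 * --threads=<maps_cpus>/<affinity_cpus>[:<maps_cpus>/<affinity_cpus>...]:
 * user-defined specs, e.g. "0-3/0-3:4-7/4-7" creates two threads, each
 * reading the mmaps of and pinned to its own CPU range.
 */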
3935 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3936 {
3937 int t, ret;
3938 u32 s, nr_spec = 0;
3939 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3940 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3941
3942 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3943 spec = strtok_r(user_spec, ":", &spec_ptr);
3944 if (spec == NULL)
3945 break;
3946 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3947 mask = strtok_r(spec, "/", &mask_ptr);
3948 if (mask == NULL)
3949 break;
3950 pr_debug2(" maps mask: %s\n", mask);
3951 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3952 if (!tmp_spec) {
3953 pr_err("Failed to reallocate maps spec\n");
3954 ret = -ENOMEM;
3955 goto out_free;
3956 }
3957 maps_spec = tmp_spec;
3958 maps_spec[nr_spec] = dup_mask = strdup(mask);
3959 if (!maps_spec[nr_spec]) {
3960 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3961 ret = -ENOMEM;
3962 goto out_free;
3963 }
3964 mask = strtok_r(NULL, "/", &mask_ptr);
3965 if (mask == NULL) {
3966 pr_err("Invalid thread maps or affinity specs\n");
3967 ret = -EINVAL;
3968 goto out_free;
3969 }
3970 pr_debug2(" affinity mask: %s\n", mask);
3971 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3972 if (!tmp_spec) {
3973 pr_err("Failed to reallocate affinity spec\n");
3974 ret = -ENOMEM;
3975 goto out_free;
3976 }
3977 affinity_spec = tmp_spec;
3978 affinity_spec[nr_spec] = strdup(mask);
3979 if (!affinity_spec[nr_spec]) {
3980 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3981 ret = -ENOMEM;
3982 goto out_free;
3983 }
3984 dup_mask = NULL;
3985 nr_spec++;
3986 }
3987
3988 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3989 (const char **)affinity_spec, nr_spec);
3990
3991 out_free:
3992 free(dup_mask);
3993 for (s = 0; s < nr_spec; s++) {
3994 if (maps_spec)
3995 free(maps_spec[s]);
3996 if (affinity_spec)
3997 free(affinity_spec[s]);
3998 }
3999 free(affinity_spec);
4000 free(maps_spec);
4001
4002 return ret;
4003 }
4004
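/* Default (no --threads): a single data streaming thread covering all of the evlist's CPUs. */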
4005 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4006 {
4007 int ret;
4008
4009 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4010 if (ret)
4011 return ret;
4012
4013 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4014 return -ENODEV;
4015
4016 rec->nr_threads = 1;
4017
4018 return 0;
4019 }
4020
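/*
 * Initialize the per-thread masks according to the --threads spec; without
 * parallel streaming a single default mask is used.
 */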
4021 static int record__init_thread_masks(struct record *rec)
4022 {
4023 int ret = 0;
4024 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4025
4026 if (!record__threads_enabled(rec))
4027 return record__init_thread_default_masks(rec, cpus);
4028
4029 if (evlist__per_thread(rec->evlist)) {
4030 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4031 return -EINVAL;
4032 }
4033
4034 switch (rec->opts.threads_spec) {
4035 case THREAD_SPEC__CPU:
4036 ret = record__init_thread_cpu_masks(rec, cpus);
4037 break;
4038 case THREAD_SPEC__CORE:
4039 ret = record__init_thread_core_masks(rec, cpus);
4040 break;
4041 case THREAD_SPEC__PACKAGE:
4042 ret = record__init_thread_package_masks(rec, cpus);
4043 break;
4044 case THREAD_SPEC__NUMA:
4045 ret = record__init_thread_numa_masks(rec, cpus);
4046 break;
4047 case THREAD_SPEC__USER:
4048 ret = record__init_thread_user_masks(rec, cpus);
4049 break;
4050 default:
4051 break;
4052 }
4053
4054 return ret;
4055 }
4056
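/*
 * Entry point for 'perf record': parse and validate options, set up the
 * event list, target and thread masks, then hand off to __cmd_record().
 */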
4057 int cmd_record(int argc, const char **argv)
4058 {
4059 int err;
4060 struct record *rec = &record;
4061 char errbuf[BUFSIZ];
4062
4063 setlocale(LC_ALL, "");
4064
4065 #ifndef HAVE_BPF_SKEL
4066 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4067 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4068 # undef set_nobuild
4069 #endif
4070
4071 /* Disable eager loading of kernel symbols that adds overhead to perf record. */
4072 symbol_conf.lazy_load_kernel_maps = true;
4073 rec->opts.affinity = PERF_AFFINITY_SYS;
4074
4075 rec->evlist = evlist__new();
4076 if (rec->evlist == NULL)
4077 return -ENOMEM;
4078
4079 err = perf_config(perf_record_config, rec);
4080 if (err)
4081 return err;
4082
4083 argc = parse_options(argc, argv, record_options, record_usage,
4084 PARSE_OPT_STOP_AT_NON_OPTION);
4085 if (quiet)
4086 perf_quiet_option();
4087
4088 err = symbol__validate_sym_arguments();
4089 if (err)
4090 return err;
4091
4092 perf_debuginfod_setup(&record.debuginfod);
4093
4094 /* Make system wide (-a) the default target. */
4095 if (!argc && target__none(&rec->opts.target))
4096 rec->opts.target.system_wide = true;
4097
4098 if (nr_cgroups && !rec->opts.target.system_wide) {
4099 usage_with_options_msg(record_usage, record_options,
4100 "cgroup monitoring only available in system-wide mode");
4101
4102 }
4103
4104 if (record.latency) {
4105 /*
4106 * There is no fundamental reason why latency profiling
4107 * can't work for system-wide mode, but exact semantics
4108 * and details are to be defined.
4109 * See the following thread for details:
4110 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4111 */
4112 if (record.opts.target.system_wide) {
4113 pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4114 err = -EINVAL;
4115 goto out_opts;
4116 }
4117 record.opts.record_switch_events = true;
4118 }
4119
4120 if (!rec->buildid_mmap) {
4121 pr_debug("Disabling build id in synthesized mmap2 events.\n");
4122 symbol_conf.no_buildid_mmap2 = true;
4123 } else if (rec->buildid_mmap_set) {
4124 /*
4125 * Explicitly passing --buildid-mmap disables buildid processing
4126 * and cache generation.
4127 */
4128 rec->no_buildid = true;
4129 }
4130 if (rec->buildid_mmap && !perf_can_record_build_id()) {
4131 pr_warning("Missing support for build id in kernel mmap events.\n"
4132 "Disable this warning with --no-buildid-mmap\n");
4133 rec->buildid_mmap = false;
4134 }
4135 if (rec->buildid_mmap) {
4136 /* Enable perf_event_attr::build_id bit. */
4137 rec->opts.build_id = true;
4138 }
4139
4140 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4141 pr_err("Kernel has no cgroup sampling support.\n");
4142 err = -EINVAL;
4143 goto out_opts;
4144 }
4145
4146 if (rec->opts.kcore)
4147 rec->opts.text_poke = true;
4148
4149 if (rec->opts.kcore || record__threads_enabled(rec))
4150 rec->data.is_dir = true;
4151
4152 if (record__threads_enabled(rec)) {
4153 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4154 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
err = -EINVAL;
4155 goto out_opts;
4156 }
4157 if (record__aio_enabled(rec)) {
4158 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
err = -EINVAL;
4159 goto out_opts;
4160 }
4161 }
4162
4163 if (rec->opts.comp_level != 0) {
4164 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4165 rec->no_buildid = true;
4166 }
4167
4168 if (rec->opts.record_switch_events &&
4169 !perf_can_record_switch_events()) {
4170 ui__error("kernel does not support recording context switch events\n");
4171 parse_options_usage(record_usage, record_options, "switch-events", 0);
4172 err = -EINVAL;
4173 goto out_opts;
4174 }
4175
4176 if (switch_output_setup(rec)) {
4177 parse_options_usage(record_usage, record_options, "switch-output", 0);
4178 err = -EINVAL;
4179 goto out_opts;
4180 }
4181
4182 if (rec->switch_output.time) {
4183 signal(SIGALRM, alarm_sig_handler);
4184 alarm(rec->switch_output.time);
4185 }
4186
4187 if (rec->switch_output.num_files) {
4188 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4189 sizeof(char *));
4190 if (!rec->switch_output.filenames) {
4191 err = -EINVAL;
4192 goto out_opts;
4193 }
4194 }
4195
4196 if (rec->timestamp_filename && record__threads_enabled(rec)) {
4197 rec->timestamp_filename = false;
4198 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4199 }
4200
4201 if (rec->filter_action) {
4202 if (!strcmp(rec->filter_action, "pin"))
4203 err = perf_bpf_filter__pin();
4204 else if (!strcmp(rec->filter_action, "unpin"))
4205 err = perf_bpf_filter__unpin();
4206 else {
4207 pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4208 err = -EINVAL;
4209 }
4210 goto out_opts;
4211 }
4212
4213 /* For backward compatibility, -d implies --sample-mem-info */
4214 if (rec->opts.sample_address)
4215 rec->opts.sample_data_src = true;
4216
4217 /*
4218 * Allow aliases to facilitate the lookup of symbols for address
4219 * filters. Refer to auxtrace_parse_filters().
4220 */
4221 symbol_conf.allow_aliases = true;
4222
4223 symbol__init(NULL);
4224
4225 err = record__auxtrace_init(rec);
4226 if (err)
4227 goto out;
4228
4229 if (dry_run)
4230 goto out;
4231
4232 err = -ENOMEM;
4233
4234 if (rec->no_buildid_cache || rec->no_buildid) {
4235 disable_buildid_cache();
4236 } else if (rec->switch_output.enabled) {
4237 /*
4238 * In 'perf record --switch-output', disable buildid
4239 * generation by default to reduce data file switching
4240 * overhead. Still generate buildids if they are explicitly
4241 * requested using
4242 *
4243 * perf record --switch-output --no-no-buildid \
4244 * --no-no-buildid-cache
4245 *
4246 * The following code is equivalent to:
4247 *
4248 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4249 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4250 * disable_buildid_cache();
4251 */
4252 bool disable = true;
4253
4254 if (rec->no_buildid_set && !rec->no_buildid)
4255 disable = false;
4256 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4257 disable = false;
4258 if (disable) {
4259 rec->no_buildid = true;
4260 rec->no_buildid_cache = true;
4261 disable_buildid_cache();
4262 }
4263 }
4264
4265 if (record.opts.overwrite)
4266 record.opts.tail_synthesize = true;
4267
4268 if (rec->evlist->core.nr_entries == 0) {
4269 err = parse_event(rec->evlist, "cycles:P");
4270 if (err)
4271 goto out;
4272 }
4273
4274 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4275 rec->opts.no_inherit = true;
4276
4277 err = target__validate(&rec->opts.target);
4278 if (err) {
4279 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4280 ui__warning("%s\n", errbuf);
4281 }
4282
4283 if (rec->uid_str) {
4284 uid_t uid = parse_uid(rec->uid_str);
4285
4286 if (uid == UINT_MAX) {
4287 ui__error("Invalid User: %s\n", rec->uid_str);
4288 err = -EINVAL;
4289 goto out;
4290 }
4291 err = parse_uid_filter(rec->evlist, uid);
4292 if (err)
4293 goto out;
4294
4295 /* User ID filtering implies system wide. */
4296 rec->opts.target.system_wide = true;
4297 }
4298
4299 /* Enable ignoring missing threads when -p option is defined. */
4300 rec->opts.ignore_missing_thread = rec->opts.target.pid;
4301
4302 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4303
4304 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4305 arch__add_leaf_frame_record_opts(&rec->opts);
4306
4307 err = -ENOMEM;
4308 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4309 if (rec->opts.target.pid != NULL) {
4310 pr_err("Couldn't create thread/CPU maps: %s\n",
4311 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4312 goto out;
4313 } else
4315 usage_with_options(record_usage, record_options);
4316 }
4317
4318 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4319 if (err)
4320 goto out;
4321
4322 /*
4323 * Take all buildids when the file contains AUX area
4324 * tracing data: the trace is not decoded (that would take
4325 * too long), so the DSOs that were hit are not known.
4326 */
4327 if (rec->opts.full_auxtrace)
4328 rec->buildid_all = true;
4329
4330 if (rec->opts.text_poke) {
4331 err = record__config_text_poke(rec->evlist);
4332 if (err) {
4333 pr_err("record__config_text_poke failed, error %d\n", err);
4334 goto out;
4335 }
4336 }
4337
4338 if (rec->off_cpu) {
4339 err = record__config_off_cpu(rec);
4340 if (err) {
4341 pr_err("record__config_off_cpu failed, error %d\n", err);
4342 goto out;
4343 }
4344 }
4345
4346 if (record_opts__config(&rec->opts)) {
4347 err = -EINVAL;
4348 goto out;
4349 }
4350
4351 err = record__config_tracking_events(rec);
4352 if (err) {
4353 pr_err("record__config_tracking_events failed, error %d\n", err);
4354 goto out;
4355 }
4356
4357 err = record__init_thread_masks(rec);
4358 if (err) {
4359 pr_err("Failed to initialize parallel data streaming masks\n");
4360 goto out;
4361 }
4362
4363 if (rec->opts.nr_cblocks > nr_cblocks_max)
4364 rec->opts.nr_cblocks = nr_cblocks_max;
4365 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4366
4367 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4368 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4369
4370 if (rec->opts.comp_level > comp_level_max)
4371 rec->opts.comp_level = comp_level_max;
4372 pr_debug("comp level: %d\n", rec->opts.comp_level);
4373
4374 err = __cmd_record(&record, argc, argv);
4375 out:
4376 record__free_thread_masks(rec, rec->nr_threads);
4377 rec->nr_threads = 0;
4378 symbol__exit();
4379 auxtrace_record__free(rec->itr);
4380 out_opts:
4381 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4382 evlist__delete(rec->evlist);
4383 return err;
4384 }
4385
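/*
 * SIGUSR2 handler: take an AUX area tracing snapshot and, with
 * --switch-output=signal, also trigger an output file switch.
 */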
4386 static void snapshot_sig_handler(int sig __maybe_unused)
4387 {
4388 struct record *rec = &record;
4389
4390 hit_auxtrace_snapshot_trigger(rec);
4391
4392 if (switch_output_signal(rec))
4393 trigger_hit(&switch_output_trigger);
4394 }
4395
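/* SIGALRM handler: trigger an output file switch for time-based --switch-output. */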
4396 static void alarm_sig_handler(int sig __maybe_unused)
4397 {
4398 struct record *rec = &record;
4399
4400 if (switch_output_time(rec))
4401 trigger_hit(&switch_output_trigger);
4402 }
4403