1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85
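/*
 * State for the --switch-output option, which rotates the perf.data output.
 * Rotation can be requested by signal (SIGUSR2), by accumulated output size,
 * or periodically by time; see perf-record(1) for the accepted forms.
 * filenames/num_files/cur_file implement the ring of kept output files
 * used by record__switch_output().
 */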
86 struct switch_output {
87 bool enabled;
88 bool signal;
89 unsigned long size;
90 unsigned long time;
91 const char *str;
92 bool set;
93 char **filenames;
94 int num_files;
95 int cur_file;
96 };
97
98 struct thread_mask {
99 struct mmap_cpu_mask maps;
100 struct mmap_cpu_mask affinity;
101 };
102
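/*
 * Per-thread state for parallel (--threads) recording. Each worker owns a
 * subset of the evlist mmaps (maps/overwrite_maps, selected via mask->maps),
 * a pollfd set built from those mmaps, msg/ack pipes for handshaking with
 * the main thread, and its own byte/sample counters.
 */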
103 struct record_thread {
104 pid_t tid;
105 struct thread_mask *mask;
106 struct {
107 int msg[2];
108 int ack[2];
109 } pipes;
110 struct fdarray pollfd;
111 int ctlfd_pos;
112 int nr_mmaps;
113 struct mmap **maps;
114 struct mmap **overwrite_maps;
115 struct record *rec;
116 unsigned long long samples;
117 unsigned long waking;
118 u64 bytes_written;
119 u64 bytes_transferred;
120 u64 bytes_compressed;
121 };
122
123 static __thread struct record_thread *thread;
124
125 enum thread_msg {
126 THREAD_MSG__UNDEFINED = 0,
127 THREAD_MSG__READY,
128 THREAD_MSG__MAX,
129 };
130
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 "UNDEFINED", "READY"
133 };
134
135 enum thread_spec {
136 THREAD_SPEC__UNDEFINED = 0,
137 THREAD_SPEC__CPU,
138 THREAD_SPEC__CORE,
139 THREAD_SPEC__PACKAGE,
140 THREAD_SPEC__NUMA,
141 THREAD_SPEC__USER,
142 THREAD_SPEC__MAX,
143 };
144
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 "undefined", "cpu", "core", "package", "numa", "user"
147 };
148
149 struct pollfd_index_map {
150 int evlist_pollfd_index;
151 int thread_pollfd_index;
152 };
153
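/*
 * Top-level state of a perf record session: the perf_tool callbacks, the
 * parsed record options, the output perf_data, the evlist being recorded,
 * the session used for header and build-id handling, and the thread
 * masks/data used when parallel recording is enabled.
 */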
154 struct record {
155 struct perf_tool tool;
156 struct record_opts opts;
157 u64 bytes_written;
158 u64 thread_bytes_written;
159 struct perf_data data;
160 struct auxtrace_record *itr;
161 struct evlist *evlist;
162 struct perf_session *session;
163 struct evlist *sb_evlist;
164 pthread_t thread_id;
165 int realtime_prio;
166 bool latency;
167 bool switch_output_event_set;
168 bool no_buildid;
169 bool no_buildid_set;
170 bool no_buildid_cache;
171 bool no_buildid_cache_set;
172 bool buildid_all;
173 bool buildid_mmap;
174 bool buildid_mmap_set;
175 bool timestamp_filename;
176 bool timestamp_boundary;
177 bool off_cpu;
178 const char *filter_action;
179 const char *uid_str;
180 struct switch_output switch_output;
181 unsigned long long samples;
182 unsigned long output_max_size; /* = 0: unlimited */
183 struct perf_debuginfod debuginfod;
184 int nr_threads;
185 struct thread_mask *thread_masks;
186 struct record_thread *thread_data;
187 struct pollfd_index_map *index_map;
188 size_t index_map_sz;
189 size_t index_map_cnt;
190 };
191
192 static volatile int done;
193
194 static volatile int auxtrace_record__snapshot_started;
195 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
196 static DEFINE_TRIGGER(switch_output_trigger);
197
198 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
199 "SYS", "NODE", "CPU"
200 };
201
202 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
203 struct perf_sample *sample, struct machine *machine);
204 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
205 struct perf_sample *sample, struct machine *machine);
206 static int process_timestamp_boundary(const struct perf_tool *tool,
207 union perf_event *event,
208 struct perf_sample *sample,
209 struct machine *machine);
210
211 #ifndef HAVE_GETTID
212 static inline pid_t gettid(void)
213 {
214 return (pid_t)syscall(__NR_gettid);
215 }
216 #endif
217
218 static int record__threads_enabled(struct record *rec)
219 {
220 return rec->opts.threads_spec;
221 }
222
223 static bool switch_output_signal(struct record *rec)
224 {
225 return rec->switch_output.signal &&
226 trigger_is_ready(&switch_output_trigger);
227 }
228
229 static bool switch_output_size(struct record *rec)
230 {
231 return rec->switch_output.size &&
232 trigger_is_ready(&switch_output_trigger) &&
233 (rec->bytes_written >= rec->switch_output.size);
234 }
235
236 static bool switch_output_time(struct record *rec)
237 {
238 return rec->switch_output.time &&
239 trigger_is_ready(&switch_output_trigger);
240 }
241
242 static u64 record__bytes_written(struct record *rec)
243 {
244 return rec->bytes_written + rec->thread_bytes_written;
245 }
246
247 static bool record__output_max_size_exceeded(struct record *rec)
248 {
249 return rec->output_max_size &&
250 (record__bytes_written(rec) >= rec->output_max_size);
251 }
252
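/*
 * Write a block of bytes to the output. In parallel mode each map carries its
 * own perf_data_file, so the bytes are accounted per thread; otherwise they go
 * to the single output file and rec->bytes_written. Reaching output_max_size
 * (--max-size) stops the session, and reaching the --switch-output size
 * threshold fires the switch_output_trigger.
 */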
253 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
254 void *bf, size_t size)
255 {
256 struct perf_data_file *file = &rec->session->data->file;
257
258 if (map && map->file)
259 file = map->file;
260
261 if (perf_data_file__write(file, bf, size) < 0) {
262 pr_err("failed to write perf data, error: %m\n");
263 return -1;
264 }
265
266 if (map && map->file) {
267 thread->bytes_written += size;
268 rec->thread_bytes_written += size;
269 } else {
270 rec->bytes_written += size;
271 }
272
273 if (record__output_max_size_exceeded(rec) && !done) {
274 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
275 " stopping session ]\n",
276 record__bytes_written(rec) >> 10);
277 done = 1;
278 }
279
280 if (switch_output_size(rec))
281 trigger_hit(&switch_output_trigger);
282
283 return 0;
284 }
285
286 static int record__aio_enabled(struct record *rec);
287 static int record__comp_enabled(struct record *rec);
288 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
289 void *dst, size_t dst_size, void *src, size_t src_size);
290
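/*
 * Asynchronous trace writing (the --aio option). Data is staged from the
 * kernel ring buffer into one of the per-mmap aio.data[] buffers
 * (record__aio_pushfn), then queued with aio_write() (record__aio_write via
 * record__aio_push). Completion and the matching perf_mmap__get()/put()
 * refcounting are handled in record__aio_complete(), while record__aio_sync()
 * waits for buffers to become free again.
 */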
291 #ifdef HAVE_AIO_SUPPORT
292 static int record__aio_write(struct aiocb *cblock, int trace_fd,
293 void *buf, size_t size, off_t off)
294 {
295 int rc;
296
297 cblock->aio_fildes = trace_fd;
298 cblock->aio_buf = buf;
299 cblock->aio_nbytes = size;
300 cblock->aio_offset = off;
301 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
302
303 do {
304 rc = aio_write(cblock);
305 if (rc == 0) {
306 break;
307 } else if (errno != EAGAIN) {
308 cblock->aio_fildes = -1;
309 pr_err("failed to queue perf data, error: %m\n");
310 break;
311 }
312 } while (1);
313
314 return rc;
315 }
316
317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
318 {
319 void *rem_buf;
320 off_t rem_off;
321 size_t rem_size;
322 int rc, aio_errno;
323 ssize_t aio_ret, written;
324
325 aio_errno = aio_error(cblock);
326 if (aio_errno == EINPROGRESS)
327 return 0;
328
329 written = aio_ret = aio_return(cblock);
330 if (aio_ret < 0) {
331 if (aio_errno != EINTR)
332 pr_err("failed to write perf data, error: %m\n");
333 written = 0;
334 }
335
336 rem_size = cblock->aio_nbytes - written;
337
338 if (rem_size == 0) {
339 cblock->aio_fildes = -1;
340 /*
341 * md->refcount is incremented in record__aio_pushfn() for
342 * every aio write request started in record__aio_push() so
343 * decrement it because the request is now complete.
344 */
345 perf_mmap__put(&md->core);
346 rc = 1;
347 } else {
348 /*
349 * The aio write request may need to be restarted with the
350 * remainder if the kernel didn't write the whole
351 * chunk at once.
352 */
353 rem_off = cblock->aio_offset + written;
354 rem_buf = (void *)(cblock->aio_buf + written);
355 record__aio_write(cblock, cblock->aio_fildes,
356 rem_buf, rem_size, rem_off);
357 rc = 0;
358 }
359
360 return rc;
361 }
362
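/*
 * Wait for in-flight aio writes on this mmap. With sync_all == false, return
 * the index of the first control block that is free or has just completed, so
 * the caller can reuse its aio.data[] buffer. With sync_all == true, keep
 * suspending until every outstanding request has completed, then return -1.
 */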
363 static int record__aio_sync(struct mmap *md, bool sync_all)
364 {
365 struct aiocb **aiocb = md->aio.aiocb;
366 struct aiocb *cblocks = md->aio.cblocks;
367 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
368 int i, do_suspend;
369
370 do {
371 do_suspend = 0;
372 for (i = 0; i < md->aio.nr_cblocks; ++i) {
373 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
374 if (sync_all)
375 aiocb[i] = NULL;
376 else
377 return i;
378 } else {
379 /*
380 * A started aio write is not complete yet,
381 * so it has to be waited on before the
382 * buffer can be allocated again.
383 */
384 aiocb[i] = &cblocks[i];
385 do_suspend = 1;
386 }
387 }
388 if (!do_suspend)
389 return -1;
390
391 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
392 if (!(errno == EAGAIN || errno == EINTR))
393 pr_err("failed to sync perf data, error: %m\n");
394 }
395 } while (1);
396 }
397
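/*
 * Context passed to record__aio_pushfn() by perf_mmap__push(): the destination
 * aio.data[] buffer and the number of bytes staged into it so far.
 */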
398 struct record_aio {
399 struct record *rec;
400 void *data;
401 size_t size;
402 };
403
404 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
405 {
406 struct record_aio *aio = to;
407
408 /*
409 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
410 * buffer to release space in the kernel buffer as fast as possible, by calling
411 * perf_mmap__consume() from the perf_mmap__push() function.
412 *
413 * That lets the kernel proceed with storing more profiling data into
414 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
415 *
416 * Copying can be done in two steps in case the chunk of profiling data
417 * crosses the upper bound of the kernel buffer. In this case we first move
418 * the part of the data from map->start to the upper bound and then the remainder
419 * from the beginning of the kernel buffer to the end of the data chunk.
420 */
421
422 if (record__comp_enabled(aio->rec)) {
423 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
424 mmap__mmap_len(map) - aio->size,
425 buf, size);
426 if (compressed < 0)
427 return (int)compressed;
428
429 size = compressed;
430 } else {
431 memcpy(aio->data + aio->size, buf, size);
432 }
433
434 if (!aio->size) {
435 /*
436 * Increment map->refcount to guard the map->aio.data[] buffer
437 * from premature deallocation, because the map object can be
438 * released before the aio write request started on the
439 * map->aio.data[] buffer completes.
440 *
441 * perf_mmap__put() is done in record__aio_complete()
442 * once the started aio request completes, or in record__aio_push()
443 * if the request failed to start.
444 */
445 perf_mmap__get(&map->core);
446 }
447
448 aio->size += size;
449
450 return size;
451 }
452
453 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
454 {
455 int ret, idx;
456 int trace_fd = rec->session->data->file.fd;
457 struct record_aio aio = { .rec = rec, .size = 0 };
458
459 /*
460 * Call record__aio_sync() to wait until a map->aio.data[] buffer
461 * becomes available after the previous aio write operation.
462 */
463
464 idx = record__aio_sync(map, false);
465 aio.data = map->aio.data[idx];
466 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
467 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
468 return ret;
469
470 rec->samples++;
471 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
472 if (!ret) {
473 *off += aio.size;
474 rec->bytes_written += aio.size;
475 if (switch_output_size(rec))
476 trigger_hit(&switch_output_trigger);
477 } else {
478 /*
479 * Decrement map->refcount incremented in record__aio_pushfn()
480 * back if record__aio_write() operation failed to start, otherwise
481 * map->refcount is decremented in record__aio_complete() after
482 * aio write operation finishes successfully.
483 */
484 perf_mmap__put(&map->core);
485 }
486
487 return ret;
488 }
489
490 static off_t record__aio_get_pos(int trace_fd)
491 {
492 return lseek(trace_fd, 0, SEEK_CUR);
493 }
494
495 static void record__aio_set_pos(int trace_fd, off_t pos)
496 {
497 lseek(trace_fd, pos, SEEK_SET);
498 }
499
500 static void record__aio_mmap_read_sync(struct record *rec)
501 {
502 int i;
503 struct evlist *evlist = rec->evlist;
504 struct mmap *maps = evlist->mmap;
505
506 if (!record__aio_enabled(rec))
507 return;
508
509 for (i = 0; i < evlist->core.nr_mmaps; i++) {
510 struct mmap *map = &maps[i];
511
512 if (map->core.base)
513 record__aio_sync(map, true);
514 }
515 }
516
517 static int nr_cblocks_default = 1;
518 static int nr_cblocks_max = 4;
519
520 static int record__aio_parse(const struct option *opt,
521 const char *str,
522 int unset)
523 {
524 struct record_opts *opts = (struct record_opts *)opt->value;
525
526 if (unset) {
527 opts->nr_cblocks = 0;
528 } else {
529 if (str)
530 opts->nr_cblocks = strtol(str, NULL, 0);
531 if (!opts->nr_cblocks)
532 opts->nr_cblocks = nr_cblocks_default;
533 }
534
535 return 0;
536 }
537 #else /* HAVE_AIO_SUPPORT */
538 static int nr_cblocks_max = 0;
539
540 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
541 off_t *off __maybe_unused)
542 {
543 return -1;
544 }
545
546 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
547 {
548 return -1;
549 }
550
551 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
552 {
553 }
554
555 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
556 {
557 }
558 #endif
559
560 static int record__aio_enabled(struct record *rec)
561 {
562 return rec->opts.nr_cblocks > 0;
563 }
564
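/*
 * Parse --mmap-flush: accepts a plain byte count or a B/K/M/G suffixed value,
 * e.g. "--mmap-flush=512K". The value defaults to MMAP_FLUSH_DEFAULT (1 byte,
 * i.e. flush as soon as any data is available) and is capped at a quarter of
 * the mmap buffer size.
 */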
565 #define MMAP_FLUSH_DEFAULT 1
566 static int record__mmap_flush_parse(const struct option *opt,
567 const char *str,
568 int unset)
569 {
570 int flush_max;
571 struct record_opts *opts = (struct record_opts *)opt->value;
572 static struct parse_tag tags[] = {
573 { .tag = 'B', .mult = 1 },
574 { .tag = 'K', .mult = 1 << 10 },
575 { .tag = 'M', .mult = 1 << 20 },
576 { .tag = 'G', .mult = 1 << 30 },
577 { .tag = 0 },
578 };
579
580 if (unset)
581 return 0;
582
583 if (str) {
584 opts->mmap_flush = parse_tag_value(str, tags);
585 if (opts->mmap_flush == (int)-1)
586 opts->mmap_flush = strtol(str, NULL, 0);
587 }
588
589 if (!opts->mmap_flush)
590 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
591
592 flush_max = evlist__mmap_size(opts->mmap_pages);
593 flush_max /= 4;
594 if (opts->mmap_flush > flush_max)
595 opts->mmap_flush = flush_max;
596
597 return 0;
598 }
599
600 #ifdef HAVE_ZSTD_SUPPORT
601 static unsigned int comp_level_default = 1;
602
603 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
604 {
605 struct record_opts *opts = opt->value;
606
607 if (unset) {
608 opts->comp_level = 0;
609 } else {
610 if (str)
611 opts->comp_level = strtol(str, NULL, 0);
612 if (!opts->comp_level)
613 opts->comp_level = comp_level_default;
614 }
615
616 return 0;
617 }
618 #endif
619 static unsigned int comp_level_max = 22;
620
621 static int record__comp_enabled(struct record *rec)
622 {
623 return rec->opts.comp_level > 0;
624 }
625
626 static int process_synthesized_event(const struct perf_tool *tool,
627 union perf_event *event,
628 struct perf_sample *sample __maybe_unused,
629 struct machine *machine __maybe_unused)
630 {
631 struct record *rec = container_of(tool, struct record, tool);
632 return record__write(rec, NULL, event, event->header.size);
633 }
634
635 static struct mutex synth_lock;
636
637 static int process_locked_synthesized_event(const struct perf_tool *tool,
638 union perf_event *event,
639 struct perf_sample *sample __maybe_unused,
640 struct machine *machine __maybe_unused)
641 {
642 int ret;
643
644 mutex_lock(&synth_lock);
645 ret = process_synthesized_event(tool, event, sample, machine);
646 mutex_unlock(&synth_lock);
647 return ret;
648 }
649
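/*
 * perf_mmap__push() callback for the synchronous (non-AIO) path. With -z the
 * data is first compressed into a PERF_RECORD_COMPRESSED2 record and padded
 * to 8-byte alignment; otherwise the raw bytes are written as-is.
 */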
650 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
651 {
652 struct record *rec = to;
653
654 if (record__comp_enabled(rec)) {
655 struct perf_record_compressed2 *event = map->data;
656 size_t padding = 0;
657 u8 pad[8] = {0};
658 ssize_t compressed = zstd_compress(rec->session, map, map->data,
659 mmap__mmap_len(map), bf, size);
660
661 if (compressed < 0)
662 return (int)compressed;
663
664 bf = event;
665 thread->samples++;
666
667 /*
668 * The record from zstd_compress() is not 8-byte aligned, which would cause an
669 * ASan error. Pad it to 8-byte alignment here.
670 */
671 event->data_size = compressed - sizeof(struct perf_record_compressed2);
672 event->header.size = PERF_ALIGN(compressed, sizeof(u64));
673 padding = event->header.size - compressed;
674 return record__write(rec, map, bf, compressed) ||
675 record__write(rec, map, &pad, padding);
676 }
677
678 thread->samples++;
679 return record__write(rec, map, bf, size);
680 }
681
682 static volatile sig_atomic_t signr = -1;
683 static volatile sig_atomic_t child_finished;
684 #ifdef HAVE_EVENTFD_SUPPORT
685 static volatile sig_atomic_t done_fd = -1;
686 #endif
687
688 static void sig_handler(int sig)
689 {
690 if (sig == SIGCHLD)
691 child_finished = 1;
692 else
693 signr = sig;
694
695 done = 1;
696 #ifdef HAVE_EVENTFD_SUPPORT
697 if (done_fd >= 0) {
698 u64 tmp = 1;
699 int orig_errno = errno;
700
701 /*
702 * It is possible for this signal handler to run after done is
703 * checked in the main loop, but before the perf counter fds are
704 * polled. If this happens, the poll() will continue to wait
705 * even though done is set, and will only break out if either
706 * another signal is received, or the counters are ready for
707 * read. To ensure the poll() doesn't sleep when done is set,
708 * use an eventfd (done_fd) to wake up the poll().
709 */
710 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
711 pr_err("failed to signal wakeup fd, error: %m\n");
712
713 errno = orig_errno;
714 }
715 #endif // HAVE_EVENTFD_SUPPORT
716 }
717
718 static void sigsegv_handler(int sig)
719 {
720 perf_hooks__recover();
721 sighandler_dump_stack(sig);
722 }
723
724 static void record__sig_exit(void)
725 {
726 if (signr == -1)
727 return;
728
729 signal(signr, SIG_DFL);
730 raise(signr);
731 }
732
733 static int record__process_auxtrace(const struct perf_tool *tool,
734 struct mmap *map,
735 union perf_event *event, void *data1,
736 size_t len1, void *data2, size_t len2)
737 {
738 struct record *rec = container_of(tool, struct record, tool);
739 struct perf_data *data = &rec->data;
740 size_t padding;
741 u8 pad[8] = {0};
742
743 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
744 off_t file_offset;
745 int fd = perf_data__fd(data);
746 int err;
747
748 file_offset = lseek(fd, 0, SEEK_CUR);
749 if (file_offset == -1)
750 return -1;
751 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
752 event, file_offset);
753 if (err)
754 return err;
755 }
756
757 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
758 padding = (len1 + len2) & 7;
759 if (padding)
760 padding = 8 - padding;
761
762 record__write(rec, map, event, event->header.size);
763 record__write(rec, map, data1, len1);
764 if (len2)
765 record__write(rec, map, data2, len2);
766 record__write(rec, map, &pad, padding);
767
768 return 0;
769 }
770
771 static int record__auxtrace_mmap_read(struct record *rec,
772 struct mmap *map)
773 {
774 int ret;
775
776 ret = auxtrace_mmap__read(map, rec->itr,
777 perf_session__env(rec->session),
778 &rec->tool,
779 record__process_auxtrace);
780 if (ret < 0)
781 return ret;
782
783 if (ret)
784 rec->samples++;
785
786 return 0;
787 }
788
789 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
790 struct mmap *map)
791 {
792 int ret;
793
794 ret = auxtrace_mmap__read_snapshot(map, rec->itr,
795 perf_session__env(rec->session),
796 &rec->tool,
797 record__process_auxtrace,
798 rec->opts.auxtrace_snapshot_size);
799 if (ret < 0)
800 return ret;
801
802 if (ret)
803 rec->samples++;
804
805 return 0;
806 }
807
808 static int record__auxtrace_read_snapshot_all(struct record *rec)
809 {
810 int i;
811 int rc = 0;
812
813 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
814 struct mmap *map = &rec->evlist->mmap[i];
815
816 if (!map->auxtrace_mmap.base)
817 continue;
818
819 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
820 rc = -1;
821 goto out;
822 }
823 }
824 out:
825 return rc;
826 }
827
828 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
829 {
830 pr_debug("Recording AUX area tracing snapshot\n");
831 if (record__auxtrace_read_snapshot_all(rec) < 0) {
832 trigger_error(&auxtrace_snapshot_trigger);
833 } else {
834 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
835 trigger_error(&auxtrace_snapshot_trigger);
836 else
837 trigger_ready(&auxtrace_snapshot_trigger);
838 }
839 }
840
841 static int record__auxtrace_snapshot_exit(struct record *rec)
842 {
843 if (trigger_is_error(&auxtrace_snapshot_trigger))
844 return 0;
845
846 if (!auxtrace_record__snapshot_started &&
847 auxtrace_record__snapshot_start(rec->itr))
848 return -1;
849
850 record__read_auxtrace_snapshot(rec, true);
851 if (trigger_is_error(&auxtrace_snapshot_trigger))
852 return -1;
853
854 return 0;
855 }
856
857 static int record__auxtrace_init(struct record *rec)
858 {
859 int err;
860
861 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
862 && record__threads_enabled(rec)) {
863 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
864 return -EINVAL;
865 }
866
867 if (!rec->itr) {
868 rec->itr = auxtrace_record__init(rec->evlist, &err);
869 if (err)
870 return err;
871 }
872
873 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
874 rec->opts.auxtrace_snapshot_opts);
875 if (err)
876 return err;
877
878 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
879 rec->opts.auxtrace_sample_opts);
880 if (err)
881 return err;
882
883 err = auxtrace_parse_aux_action(rec->evlist);
884 if (err)
885 return err;
886
887 return auxtrace_parse_filters(rec->evlist);
888 }
889
890 static int record__config_text_poke(struct evlist *evlist)
891 {
892 struct evsel *evsel;
893
894 /* Nothing to do if text poke is already configured */
895 evlist__for_each_entry(evlist, evsel) {
896 if (evsel->core.attr.text_poke)
897 return 0;
898 }
899
900 evsel = evlist__add_dummy_on_all_cpus(evlist);
901 if (!evsel)
902 return -ENOMEM;
903
904 evsel->core.attr.text_poke = 1;
905 evsel->core.attr.ksymbol = 1;
906 evsel->immediate = true;
907 evsel__set_sample_bit(evsel, TIME);
908
909 return 0;
910 }
911
912 static int record__config_off_cpu(struct record *rec)
913 {
914 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
915 }
916
917 static bool record__tracking_system_wide(struct record *rec)
918 {
919 struct evlist *evlist = rec->evlist;
920 struct evsel *evsel;
921
922 /*
923 * If a non-dummy evsel exists, system_wide sideband is needed to
924 * help parse sample information.
925 * For example, PERF_RECORD_MMAP events help resolve symbols,
926 * and PERF_RECORD_COMM events provide the task executable name.
927 */
928 evlist__for_each_entry(evlist, evsel) {
929 if (!evsel__is_dummy_event(evsel))
930 return true;
931 }
932
933 return false;
934 }
935
936 static int record__config_tracking_events(struct record *rec)
937 {
938 struct record_opts *opts = &rec->opts;
939 struct evlist *evlist = rec->evlist;
940 bool system_wide = false;
941 struct evsel *evsel;
942
943 /*
944 * For initial_delay, system wide or a hybrid system, we need to add a
945 * tracking event so that PERF_RECORD_MMAP events are still captured
946 * during the waiting delay or during event synthesis.
947 */
948 if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
949 perf_pmus__num_core_pmus() > 1) {
950 /*
951 * User space tasks can migrate between CPUs, so when tracing
952 * selected CPUs, sideband for all CPUs is still needed.
953 */
954 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
955 system_wide = true;
956
957 evsel = evlist__findnew_tracking_event(evlist, system_wide);
958 if (!evsel)
959 return -ENOMEM;
960
961 /*
962 * For initial_delay, enable the tracking event when the process is
963 * forked; for system wide, enable it immediately.
964 */
965 if (opts->target.initial_delay && !evsel->immediate &&
966 !target__has_cpu(&opts->target))
967 evsel->core.attr.enable_on_exec = 1;
968 else
969 evsel->immediate = 1;
970 }
971
972 return 0;
973 }
974
975 static bool record__kcore_readable(struct machine *machine)
976 {
977 char kcore[PATH_MAX];
978 int fd;
979
980 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
981
982 fd = open(kcore, O_RDONLY);
983 if (fd < 0)
984 return false;
985
986 close(fd);
987
988 return true;
989 }
990
991 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
992 {
993 char from_dir[PATH_MAX];
994 char kcore_dir[PATH_MAX];
995 int ret;
996
997 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
998
999 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1000 if (ret)
1001 return ret;
1002
1003 return kcore_copy(from_dir, kcore_dir);
1004 }
1005
1006 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1007 {
1008 thread_data->pipes.msg[0] = -1;
1009 thread_data->pipes.msg[1] = -1;
1010 thread_data->pipes.ack[0] = -1;
1011 thread_data->pipes.ack[1] = -1;
1012 }
1013
1014 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1015 {
1016 if (pipe(thread_data->pipes.msg))
1017 return -EINVAL;
1018
1019 if (pipe(thread_data->pipes.ack)) {
1020 close(thread_data->pipes.msg[0]);
1021 thread_data->pipes.msg[0] = -1;
1022 close(thread_data->pipes.msg[1]);
1023 thread_data->pipes.msg[1] = -1;
1024 return -EINVAL;
1025 }
1026
1027 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1028 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1029 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1030
1031 return 0;
1032 }
1033
1034 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1035 {
1036 if (thread_data->pipes.msg[0] != -1) {
1037 close(thread_data->pipes.msg[0]);
1038 thread_data->pipes.msg[0] = -1;
1039 }
1040 if (thread_data->pipes.msg[1] != -1) {
1041 close(thread_data->pipes.msg[1]);
1042 thread_data->pipes.msg[1] = -1;
1043 }
1044 if (thread_data->pipes.ack[0] != -1) {
1045 close(thread_data->pipes.ack[0]);
1046 thread_data->pipes.ack[0] = -1;
1047 }
1048 if (thread_data->pipes.ack[1] != -1) {
1049 close(thread_data->pipes.ack[1]);
1050 thread_data->pipes.ack[1] = -1;
1051 }
1052 }
1053
1054 static bool evlist__per_thread(struct evlist *evlist)
1055 {
1056 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1057 }
1058
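/*
 * Assign evlist mmaps to this recording thread. In per-thread mode (dummy CPU
 * map) the single thread gets all of them; otherwise the thread gets exactly
 * the mmaps whose CPU is set in its mask->maps bitmap.
 */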
1059 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1060 {
1061 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1062 struct mmap *mmap = evlist->mmap;
1063 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1064 struct perf_cpu_map *cpus = evlist->core.all_cpus;
1065 bool per_thread = evlist__per_thread(evlist);
1066
1067 if (per_thread)
1068 thread_data->nr_mmaps = nr_mmaps;
1069 else
1070 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1071 thread_data->mask->maps.nbits);
1072 if (mmap) {
1073 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1074 if (!thread_data->maps)
1075 return -ENOMEM;
1076 }
1077 if (overwrite_mmap) {
1078 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1079 if (!thread_data->overwrite_maps) {
1080 zfree(&thread_data->maps);
1081 return -ENOMEM;
1082 }
1083 }
1084 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1085 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1086
1087 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1088 if (per_thread ||
1089 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1090 if (thread_data->maps) {
1091 thread_data->maps[tm] = &mmap[m];
1092 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1093 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1094 }
1095 if (thread_data->overwrite_maps) {
1096 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1097 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1098 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1099 }
1100 tm++;
1101 }
1102 }
1103
1104 return 0;
1105 }
1106
1107 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1108 {
1109 int f, tm, pos;
1110 struct mmap *map, *overwrite_map;
1111
1112 fdarray__init(&thread_data->pollfd, 64);
1113
1114 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1115 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1116 overwrite_map = thread_data->overwrite_maps ?
1117 thread_data->overwrite_maps[tm] : NULL;
1118
1119 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1120 void *ptr = evlist->core.pollfd.priv[f].ptr;
1121
1122 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1123 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1124 &evlist->core.pollfd);
1125 if (pos < 0)
1126 return pos;
1127 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1128 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1129 }
1130 }
1131 }
1132
1133 return 0;
1134 }
1135
1136 static void record__free_thread_data(struct record *rec)
1137 {
1138 int t;
1139 struct record_thread *thread_data = rec->thread_data;
1140
1141 if (thread_data == NULL)
1142 return;
1143
1144 for (t = 0; t < rec->nr_threads; t++) {
1145 record__thread_data_close_pipes(&thread_data[t]);
1146 zfree(&thread_data[t].maps);
1147 zfree(&thread_data[t].overwrite_maps);
1148 fdarray__exit(&thread_data[t].pollfd);
1149 }
1150
1151 zfree(&rec->thread_data);
1152 }
1153
1154 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1155 int evlist_pollfd_index,
1156 int thread_pollfd_index)
1157 {
1158 size_t x = rec->index_map_cnt;
1159
1160 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1161 return -ENOMEM;
1162 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1163 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1164 rec->index_map_cnt += 1;
1165 return 0;
1166 }
1167
1168 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1169 struct evlist *evlist,
1170 struct record_thread *thread_data)
1171 {
1172 struct pollfd *e_entries = evlist->core.pollfd.entries;
1173 struct pollfd *t_entries = thread_data->pollfd.entries;
1174 int err = 0;
1175 size_t i;
1176
1177 for (i = 0; i < rec->index_map_cnt; i++) {
1178 int e_pos = rec->index_map[i].evlist_pollfd_index;
1179 int t_pos = rec->index_map[i].thread_pollfd_index;
1180
1181 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1182 e_entries[e_pos].events != t_entries[t_pos].events) {
1183 pr_err("Thread and evlist pollfd index mismatch\n");
1184 err = -EINVAL;
1185 continue;
1186 }
1187 e_entries[e_pos].revents = t_entries[t_pos].revents;
1188 }
1189 return err;
1190 }
1191
1192 static int record__dup_non_perf_events(struct record *rec,
1193 struct evlist *evlist,
1194 struct record_thread *thread_data)
1195 {
1196 struct fdarray *fda = &evlist->core.pollfd;
1197 int i, ret;
1198
1199 for (i = 0; i < fda->nr; i++) {
1200 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1201 continue;
1202 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1203 if (ret < 0) {
1204 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1205 return ret;
1206 }
1207 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1208 thread_data, ret, fda->entries[i].fd);
1209 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1210 if (ret < 0) {
1211 pr_err("Failed to map thread and evlist pollfd indexes\n");
1212 return ret;
1213 }
1214 }
1215 return 0;
1216 }
1217
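/*
 * Allocate one record_thread per thread mask. Entry 0 describes the main
 * thread itself (it also duplicates the non-perf-event descriptors into its
 * pollfd set), while the remaining entries get msg/ack pipes and a pollfd
 * slot for the message pipe so the main thread can signal termination.
 */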
1218 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1219 {
1220 int t, ret;
1221 struct record_thread *thread_data;
1222
1223 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1224 if (!rec->thread_data) {
1225 pr_err("Failed to allocate thread data\n");
1226 return -ENOMEM;
1227 }
1228 thread_data = rec->thread_data;
1229
1230 for (t = 0; t < rec->nr_threads; t++)
1231 record__thread_data_init_pipes(&thread_data[t]);
1232
1233 for (t = 0; t < rec->nr_threads; t++) {
1234 thread_data[t].rec = rec;
1235 thread_data[t].mask = &rec->thread_masks[t];
1236 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1237 if (ret) {
1238 pr_err("Failed to initialize thread[%d] maps\n", t);
1239 goto out_free;
1240 }
1241 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1242 if (ret) {
1243 pr_err("Failed to initialize thread[%d] pollfd\n", t);
1244 goto out_free;
1245 }
1246 if (t) {
1247 thread_data[t].tid = -1;
1248 ret = record__thread_data_open_pipes(&thread_data[t]);
1249 if (ret) {
1250 pr_err("Failed to open thread[%d] communication pipes\n", t);
1251 goto out_free;
1252 }
1253 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1254 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1255 if (ret < 0) {
1256 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1257 goto out_free;
1258 }
1259 thread_data[t].ctlfd_pos = ret;
1260 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1261 thread_data, thread_data[t].ctlfd_pos,
1262 thread_data[t].pipes.msg[0]);
1263 } else {
1264 thread_data[t].tid = gettid();
1265
1266 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1267 if (ret < 0)
1268 goto out_free;
1269
1270 thread_data[t].ctlfd_pos = -1; /* Not used */
1271 }
1272 }
1273
1274 return 0;
1275
1276 out_free:
1277 record__free_thread_data(rec);
1278
1279 return ret;
1280 }
1281
1282 static int record__mmap_evlist(struct record *rec,
1283 struct evlist *evlist)
1284 {
1285 int i, ret;
1286 struct record_opts *opts = &rec->opts;
1287 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1288 opts->auxtrace_sample_mode;
1289 char msg[512];
1290
1291 if (opts->affinity != PERF_AFFINITY_SYS)
1292 cpu__setup_cpunode_map();
1293
1294 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1295 opts->auxtrace_mmap_pages,
1296 auxtrace_overwrite,
1297 opts->nr_cblocks, opts->affinity,
1298 opts->mmap_flush, opts->comp_level) < 0) {
1299 if (errno == EPERM) {
1300 pr_err("Permission error mapping pages.\n"
1301 "Consider increasing "
1302 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1303 "or try again with a smaller value of -m/--mmap_pages.\n"
1304 "(current value: %u,%u)\n",
1305 opts->mmap_pages, opts->auxtrace_mmap_pages);
1306 return -errno;
1307 } else {
1308 pr_err("failed to mmap with %d (%s)\n", errno,
1309 str_error_r(errno, msg, sizeof(msg)));
1310 if (errno)
1311 return -errno;
1312 else
1313 return -EINVAL;
1314 }
1315 }
1316
1317 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1318 return -1;
1319
1320 ret = record__alloc_thread_data(rec, evlist);
1321 if (ret)
1322 return ret;
1323
1324 if (record__threads_enabled(rec)) {
1325 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1326 if (ret) {
1327 pr_err("Failed to create data directory: %s\n", strerror(-ret));
1328 return ret;
1329 }
1330 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1331 if (evlist->mmap)
1332 evlist->mmap[i].file = &rec->data.dir.files[i];
1333 if (evlist->overwrite_mmap)
1334 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1335 }
1336 }
1337
1338 return 0;
1339 }
1340
1341 static int record__mmap(struct record *rec)
1342 {
1343 return record__mmap_evlist(rec, rec->evlist);
1344 }
1345
1346 static int record__open(struct record *rec)
1347 {
1348 char msg[BUFSIZ];
1349 struct evsel *pos;
1350 struct evlist *evlist = rec->evlist;
1351 struct perf_session *session = rec->session;
1352 struct record_opts *opts = &rec->opts;
1353 int rc = 0;
1354 bool skipped = false;
1355 bool removed_tracking = false;
1356
1357 evlist__for_each_entry(evlist, pos) {
1358 if (removed_tracking) {
1359 /*
1360 * Normally the head of the list has tracking enabled
1361 * for sideband data like mmaps. If this event is
1362 * removed, make sure to add tracking to the next
1363 * processed event.
1364 */
1365 if (!pos->tracking) {
1366 pos->tracking = true;
1367 evsel__config(pos, opts, &callchain_param);
1368 }
1369 removed_tracking = false;
1370 }
1371 try_again:
1372 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1373 bool report_error = true;
1374
1375 if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1376 if (verbose > 0)
1377 ui__warning("%s\n", msg);
1378 goto try_again;
1379 }
1380 if ((errno == EINVAL || errno == EBADF) &&
1381 pos->core.leader != &pos->core &&
1382 pos->weak_group) {
1383 pos = evlist__reset_weak_group(evlist, pos, true);
1384 goto try_again;
1385 }
1386 #if defined(__aarch64__) || defined(__arm__)
1387 if (strstr(evsel__name(pos), "cycles")) {
1388 struct evsel *pos2;
1389 /*
1390 * Unfortunately ARM has many events named
1391 * "cycles" on PMUs like the system-level (L3)
1392 * cache which don't support sampling. Only
1393 * display such failures to open when there is
1394 * only 1 cycles event or verbose is enabled.
1395 */
1396 evlist__for_each_entry(evlist, pos2) {
1397 if (pos2 == pos)
1398 continue;
1399 if (strstr(evsel__name(pos2), "cycles")) {
1400 report_error = false;
1401 break;
1402 }
1403 }
1404 }
1405 #endif
1406 if (report_error || verbose > 0) {
1407 ui__error("Failure to open event '%s' on PMU '%s' which will be "
1408 "removed.\n%s\n",
1409 evsel__name(pos), evsel__pmu_name(pos), msg);
1410 }
1411 if (pos->tracking)
1412 removed_tracking = true;
1413 pos->skippable = true;
1414 skipped = true;
1415 }
1416 }
1417
1418 if (skipped) {
1419 struct evsel *tmp;
1420 int idx = 0;
1421 bool evlist_empty = true;
1422
1423 /* Remove evsels that failed to open and update indices. */
1424 evlist__for_each_entry_safe(evlist, tmp, pos) {
1425 if (pos->skippable) {
1426 evlist__remove(evlist, pos);
1427 continue;
1428 }
1429
1430 /*
1431 * Note, dummy events may be parsed from the command line or
1432 * added by the tool. We care about supporting `perf
1433 * record -e dummy`, which may be used as a permission
1434 * check. For the sake of code simplicity, dummy events that
1435 * are added on the command line and opened along with other
1436 * events that fail will still fail, as if they were
1437 * tool-added events.
1438 */
1439 if (!evsel__is_dummy_event(pos))
1440 evlist_empty = false;
1441 }
1442 evlist__for_each_entry(evlist, pos) {
1443 pos->core.idx = idx++;
1444 }
1445 /* If list is empty then fail. */
1446 if (evlist_empty) {
1447 ui__error("Failure to open any events for recording.\n");
1448 rc = -1;
1449 goto out;
1450 }
1451 }
1452 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1453 pr_warning(
1454 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1455 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1456 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1457 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1458 "Samples in kernel modules won't be resolved at all.\n\n"
1459 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1460 "even with a suitable vmlinux or kallsyms file.\n\n");
1461 }
1462
1463 if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1464 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1465 pos->filter ?: "BPF", evsel__name(pos), errno,
1466 str_error_r(errno, msg, sizeof(msg)));
1467 rc = -1;
1468 goto out;
1469 }
1470
1471 rc = record__mmap(rec);
1472 if (rc)
1473 goto out;
1474
1475 session->evlist = evlist;
1476 perf_session__set_id_hdr_size(session);
1477 out:
1478 return rc;
1479 }
1480
1481 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1482 {
1483 if (rec->evlist->first_sample_time == 0)
1484 rec->evlist->first_sample_time = sample_time;
1485
1486 if (sample_time)
1487 rec->evlist->last_sample_time = sample_time;
1488 }
1489
1490 static int process_sample_event(const struct perf_tool *tool,
1491 union perf_event *event,
1492 struct perf_sample *sample,
1493 struct evsel *evsel,
1494 struct machine *machine)
1495 {
1496 struct record *rec = container_of(tool, struct record, tool);
1497
1498 set_timestamp_boundary(rec, sample->time);
1499
1500 if (rec->buildid_all)
1501 return 0;
1502
1503 rec->samples++;
1504 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1505 }
1506
1507 static int process_buildids(struct record *rec)
1508 {
1509 struct perf_session *session = rec->session;
1510
1511 if (perf_data__size(&rec->data) == 0)
1512 return 0;
1513
1514 /*
1515 * During this process, it'll load the kernel map and replace
1516 * dso->long_name with the real pathname it found. In this case
1517 * we prefer a vmlinux path like
1518 * /lib/modules/3.16.4/build/vmlinux
1519 *
1520 * rather than the build-id path (in the debug directory),
1521 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1522 */
1523 symbol_conf.ignore_vmlinux_buildid = true;
1524
1525 /*
1526 * If --buildid-all is given, it marks all DSOs regardless of hits,
1527 * so there is no need to process samples. But if timestamp_boundary is
1528 * enabled, it still needs to walk all samples to get the timestamps of
1529 * the first/last samples.
1530 */
1531 if (rec->buildid_all && !rec->timestamp_boundary)
1532 rec->tool.sample = process_event_sample_stub;
1533
1534 return perf_session__process_events(session);
1535 }
1536
1537 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1538 {
1539 int err;
1540 struct perf_tool *tool = data;
1541 /*
1542 * For the guest kernel, when processing the record & report subcommands,
1543 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1544 * a dso preload, because by default guest module symbols are loaded
1545 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1546 * method avoids missing symbols when the first address is
1547 * in a module instead of in the guest kernel.
1548 */
1549 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1550 machine);
1551 if (err < 0)
1552 pr_err("Couldn't record guest kernel [%d]'s reference"
1553 " relocation symbol.\n", machine->pid);
1554
1555 /*
1556 * We use _stext for the guest kernel because the guest kernel's
1557 * /proc/kallsyms sometimes has no _text.
1558 */
1559 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1560 machine);
1561 if (err < 0)
1562 pr_err("Couldn't record guest kernel [%d]'s reference"
1563 " relocation symbol.\n", machine->pid);
1564 }
1565
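/*
 * Header-only synthetic records written into the stream: FINISHED_ROUND marks
 * the end of a flush round (used on the report side to bound event
 * reordering), and FINISHED_INIT is written once the initial synthesized
 * events have been emitted (see write_finished_init()).
 */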
1566 static struct perf_event_header finished_round_event = {
1567 .size = sizeof(struct perf_event_header),
1568 .type = PERF_RECORD_FINISHED_ROUND,
1569 };
1570
1571 static struct perf_event_header finished_init_event = {
1572 .size = sizeof(struct perf_event_header),
1573 .type = PERF_RECORD_FINISHED_INIT,
1574 };
1575
1576 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1577 {
1578 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1579 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1580 thread->mask->affinity.nbits)) {
1581 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1582 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1583 map->affinity_mask.bits, thread->mask->affinity.nbits);
1584 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1585 (cpu_set_t *)thread->mask->affinity.bits);
1586 if (verbose == 2) {
1587 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1588 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1589 }
1590 }
1591 }
1592
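/*
 * Zstd compression (-z/--compression-level). zstd_compress_stream_to_records()
 * splits the compressed stream into PERF_RECORD_COMPRESSED2 records no larger
 * than max_record_size, calling process_comp_header() to initialize and then
 * grow each record header. In parallel mode the per-map zstd state and the
 * per-thread byte counters are used instead of the session-wide ones.
 */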
1593 static size_t process_comp_header(void *record, size_t increment)
1594 {
1595 struct perf_record_compressed2 *event = record;
1596 size_t size = sizeof(*event);
1597
1598 if (increment) {
1599 event->header.size += increment;
1600 return increment;
1601 }
1602
1603 event->header.type = PERF_RECORD_COMPRESSED2;
1604 event->header.size = size;
1605
1606 return size;
1607 }
1608
1609 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1610 void *dst, size_t dst_size, void *src, size_t src_size)
1611 {
1612 ssize_t compressed;
1613 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1614 struct zstd_data *zstd_data = &session->zstd_data;
1615
1616 if (map && map->file)
1617 zstd_data = &map->zstd_data;
1618
1619 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1620 max_record_size, process_comp_header);
1621 if (compressed < 0)
1622 return compressed;
1623
1624 if (map && map->file) {
1625 thread->bytes_transferred += src_size;
1626 thread->bytes_compressed += compressed;
1627 } else {
1628 session->bytes_transferred += src_size;
1629 session->bytes_compressed += compressed;
1630 }
1631
1632 return compressed;
1633 }
1634
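/*
 * Drain this thread's mmaps once: push each map through the AIO or
 * synchronous write path, read AUX area data unless snapshot/sample mode
 * handles it elsewhere, and (in non-threaded mode) emit a FINISHED_ROUND
 * record if anything was written. With synch == true every map is flushed
 * completely regardless of its flush threshold.
 */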
1635 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1636 bool overwrite, bool synch)
1637 {
1638 u64 bytes_written = rec->bytes_written;
1639 int i;
1640 int rc = 0;
1641 int nr_mmaps;
1642 struct mmap **maps;
1643 int trace_fd = rec->data.file.fd;
1644 off_t off = 0;
1645
1646 if (!evlist)
1647 return 0;
1648
1649 nr_mmaps = thread->nr_mmaps;
1650 maps = overwrite ? thread->overwrite_maps : thread->maps;
1651
1652 if (!maps)
1653 return 0;
1654
1655 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1656 return 0;
1657
1658 if (record__aio_enabled(rec))
1659 off = record__aio_get_pos(trace_fd);
1660
1661 for (i = 0; i < nr_mmaps; i++) {
1662 u64 flush = 0;
1663 struct mmap *map = maps[i];
1664
1665 if (map->core.base) {
1666 record__adjust_affinity(rec, map);
1667 if (synch) {
1668 flush = map->core.flush;
1669 map->core.flush = 1;
1670 }
1671 if (!record__aio_enabled(rec)) {
1672 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1673 if (synch)
1674 map->core.flush = flush;
1675 rc = -1;
1676 goto out;
1677 }
1678 } else {
1679 if (record__aio_push(rec, map, &off) < 0) {
1680 record__aio_set_pos(trace_fd, off);
1681 if (synch)
1682 map->core.flush = flush;
1683 rc = -1;
1684 goto out;
1685 }
1686 }
1687 if (synch)
1688 map->core.flush = flush;
1689 }
1690
1691 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1692 !rec->opts.auxtrace_sample_mode &&
1693 record__auxtrace_mmap_read(rec, map) != 0) {
1694 rc = -1;
1695 goto out;
1696 }
1697 }
1698
1699 if (record__aio_enabled(rec))
1700 record__aio_set_pos(trace_fd, off);
1701
1702 /*
1703 * Mark the round finished in case we wrote
1704 * at least one event.
1705 *
1706 * No need for round events in directory mode,
1707 * because the per-cpu maps and files contain data
1708 * already sorted by the kernel.
1709 */
1710 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1711 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1712
1713 if (overwrite)
1714 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1715 out:
1716 return rc;
1717 }
1718
1719 static int record__mmap_read_all(struct record *rec, bool synch)
1720 {
1721 int err;
1722
1723 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1724 if (err)
1725 return err;
1726
1727 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1728 }
1729
1730 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1731 void *arg __maybe_unused)
1732 {
1733 struct perf_mmap *map = fda->priv[fd].ptr;
1734
1735 if (map)
1736 perf_mmap__put(map);
1737 }
1738
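/*
 * Body of a --threads worker. It reports READY over the ack pipe, then loops:
 * drain the owned mmaps, and poll when no new samples arrived. POLLHUP on the
 * message pipe from the main thread requests termination, after which one
 * final synchronous drain is done and READY is sent again on exit.
 */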
1739 static void *record__thread(void *arg)
1740 {
1741 enum thread_msg msg = THREAD_MSG__READY;
1742 bool terminate = false;
1743 struct fdarray *pollfd;
1744 int err, ctlfd_pos;
1745
1746 thread = arg;
1747 thread->tid = gettid();
1748
1749 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1750 if (err == -1)
1751 pr_warning("threads[%d]: failed to notify on start: %s\n",
1752 thread->tid, strerror(errno));
1753
1754 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1755
1756 pollfd = &thread->pollfd;
1757 ctlfd_pos = thread->ctlfd_pos;
1758
1759 for (;;) {
1760 unsigned long long hits = thread->samples;
1761
1762 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1763 break;
1764
1765 if (hits == thread->samples) {
1766
1767 err = fdarray__poll(pollfd, -1);
1768 /*
1769 * Propagate the error only if there is one. Ignore a positive
1770 * number of returned events and interrupt errors.
1771 */
1772 if (err > 0 || (err < 0 && errno == EINTR))
1773 err = 0;
1774 thread->waking++;
1775
1776 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1777 record__thread_munmap_filtered, NULL) == 0)
1778 break;
1779 }
1780
1781 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1782 terminate = true;
1783 close(thread->pipes.msg[0]);
1784 thread->pipes.msg[0] = -1;
1785 pollfd->entries[ctlfd_pos].fd = -1;
1786 pollfd->entries[ctlfd_pos].events = 0;
1787 }
1788
1789 pollfd->entries[ctlfd_pos].revents = 0;
1790 }
1791 record__mmap_read_all(thread->rec, true);
1792
1793 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1794 if (err == -1)
1795 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1796 thread->tid, strerror(errno));
1797
1798 return NULL;
1799 }
1800
1801 static void record__init_features(struct record *rec)
1802 {
1803 struct perf_session *session = rec->session;
1804 int feat;
1805
1806 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1807 perf_header__set_feat(&session->header, feat);
1808
1809 if (rec->no_buildid)
1810 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1811
1812 if (!have_tracepoints(&rec->evlist->core.entries))
1813 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1814
1815 if (!rec->opts.branch_stack)
1816 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1817
1818 if (!rec->opts.full_auxtrace)
1819 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1820
1821 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1822 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1823
1824 if (!rec->opts.use_clockid)
1825 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1826
1827 if (!record__threads_enabled(rec))
1828 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1829
1830 if (!record__comp_enabled(rec))
1831 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1832
1833 perf_header__clear_feat(&session->header, HEADER_STAT);
1834 }
1835
1836 static void
1837 record__finish_output(struct record *rec)
1838 {
1839 int i;
1840 struct perf_data *data = &rec->data;
1841 int fd = perf_data__fd(data);
1842
1843 if (data->is_pipe) {
1844 /* Just to display approx. size */
1845 data->file.size = rec->bytes_written;
1846 return;
1847 }
1848
1849 rec->session->header.data_size += rec->bytes_written;
1850 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1851 if (record__threads_enabled(rec)) {
1852 for (i = 0; i < data->dir.nr; i++)
1853 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1854 }
1855
1856 /* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1857 if (!rec->no_buildid || !rec->no_buildid_cache) {
1858 process_buildids(rec);
1859
1860 if (rec->buildid_all)
1861 perf_session__dsos_hit_all(rec->session);
1862 }
1863 perf_session__write_header(rec->session, rec->evlist, fd, true);
1864 perf_session__cache_build_ids(rec->session);
1865 }
1866
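/*
 * Synthesize thread map events for the forked workload (a single tid),
 * respecting --tail-synthesize so this runs either before or after the
 * workload, but not both.
 */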
1867 static int record__synthesize_workload(struct record *rec, bool tail)
1868 {
1869 int err;
1870 struct perf_thread_map *thread_map;
1871 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1872
1873 if (rec->opts.tail_synthesize != tail)
1874 return 0;
1875
1876 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1877 if (thread_map == NULL)
1878 return -1;
1879
1880 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1881 process_synthesized_event,
1882 &rec->session->machines.host,
1883 needs_mmap,
1884 rec->opts.sample_address);
1885 perf_thread_map__put(thread_map);
1886 return err;
1887 }
1888
1889 static int write_finished_init(struct record *rec, bool tail)
1890 {
1891 if (rec->opts.tail_synthesize != tail)
1892 return 0;
1893
1894 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1895 }
1896
1897 static int record__synthesize(struct record *rec, bool tail);
1898
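/*
 * Rotate the output: flush and finish the current perf.data, then switch
 * to a new timestamped file. When --switch-max-files is set, old files
 * are recycled in a ring. Tracking (non-sample) events are re-synthesized
 * into the new file unless we are exiting.
 */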
1899 static int
1900 record__switch_output(struct record *rec, bool at_exit)
1901 {
1902 struct perf_data *data = &rec->data;
1903 char *new_filename = NULL;
1904 int fd, err;
1905
1906 /* Same Size: "2015122520103046"*/
1907 char timestamp[] = "InvalidTimestamp";
1908
1909 record__aio_mmap_read_sync(rec);
1910
1911 write_finished_init(rec, true);
1912
1913 record__synthesize(rec, true);
1914 if (target__none(&rec->opts.target))
1915 record__synthesize_workload(rec, true);
1916
1917 rec->samples = 0;
1918 record__finish_output(rec);
1919 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1920 if (err) {
1921 pr_err("Failed to get current timestamp\n");
1922 return -EINVAL;
1923 }
1924
1925 fd = perf_data__switch(data, timestamp,
1926 rec->session->header.data_offset,
1927 at_exit, &new_filename);
1928 if (fd >= 0 && !at_exit) {
1929 rec->bytes_written = 0;
1930 rec->session->header.data_size = 0;
1931 }
1932
1933 if (!quiet) {
1934 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1935 data->path, timestamp);
1936 }
1937
1938 if (rec->switch_output.num_files) {
1939 int n = rec->switch_output.cur_file + 1;
1940
1941 if (n >= rec->switch_output.num_files)
1942 n = 0;
1943 rec->switch_output.cur_file = n;
1944 if (rec->switch_output.filenames[n]) {
1945 remove(rec->switch_output.filenames[n]);
1946 zfree(&rec->switch_output.filenames[n]);
1947 }
1948 rec->switch_output.filenames[n] = new_filename;
1949 } else {
1950 free(new_filename);
1951 }
1952
1953 /* Output tracking events */
1954 if (!at_exit) {
1955 record__synthesize(rec, false);
1956
1957 /*
1958 * In 'perf record --switch-output' without -a,
1959 * record__synthesize() in record__switch_output() won't
1960 * generate tracking events because there's no thread_map
1961 			 * in evlist, which causes the newly created perf.data to
1962 			 * lack map and comm information.
1963 * Create a fake thread_map and directly call
1964 * perf_event__synthesize_thread_map() for those events.
1965 */
1966 if (target__none(&rec->opts.target))
1967 record__synthesize_workload(rec, false);
1968 write_finished_init(rec, false);
1969 }
1970 return fd;
1971 }
1972
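/*
 * Emit a synthesized PERF_RECORD_LOST_SAMPLES event for one counter
 * instance (cpu_idx, thread_idx), attaching an id sample so the loss
 * can be attributed to the right event.
 */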
1973 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1974 struct perf_record_lost_samples *lost,
1975 int cpu_idx, int thread_idx, u64 lost_count,
1976 u16 misc_flag)
1977 {
1978 struct perf_sample_id *sid;
1979 struct perf_sample sample;
1980 int id_hdr_size;
1981
1982 perf_sample__init(&sample, /*all=*/true);
1983 lost->lost = lost_count;
1984 if (evsel->core.ids) {
1985 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1986 sample.id = sid->id;
1987 }
1988
1989 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1990 evsel->core.attr.sample_type, &sample);
1991 lost->header.size = sizeof(*lost) + id_hdr_size;
1992 lost->header.misc = misc_flag;
1993 record__write(rec, NULL, lost, lost->header.size);
1994 perf_sample__exit(&sample);
1995 }
1996
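/*
 * At the end of the session, read the per-fd lost counts from the kernel
 * (and the BPF filter drop count) and write them out as
 * PERF_RECORD_LOST_SAMPLES events.
 */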
1997 static void record__read_lost_samples(struct record *rec)
1998 {
1999 struct perf_session *session = rec->session;
2000 struct perf_record_lost_samples_and_ids lost;
2001 struct evsel *evsel;
2002
2003 /* there was an error during record__open */
2004 if (session->evlist == NULL)
2005 return;
2006
2007 evlist__for_each_entry(session->evlist, evsel) {
2008 struct xyarray *xy = evsel->core.sample_id;
2009 u64 lost_count;
2010
2011 if (xy == NULL || evsel->core.fd == NULL)
2012 continue;
2013 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
2014 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
2015 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
2016 continue;
2017 }
2018
2019 for (int x = 0; x < xyarray__max_x(xy); x++) {
2020 for (int y = 0; y < xyarray__max_y(xy); y++) {
2021 struct perf_counts_values count;
2022
2023 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
2024 pr_debug("read LOST count failed\n");
2025 return;
2026 }
2027
2028 if (count.lost) {
2029 memset(&lost, 0, sizeof(lost));
2030 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2031 __record__save_lost_samples(rec, evsel, &lost.lost,
2032 x, y, count.lost, 0);
2033 }
2034 }
2035 }
2036
2037 lost_count = perf_bpf_filter__lost_count(evsel);
2038 if (lost_count) {
2039 memset(&lost, 0, sizeof(lost));
2040 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2041 __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2042 PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2043 }
2044 }
2045 }
2046
2047 static volatile sig_atomic_t workload_exec_errno;
2048
2049 /*
2050 * evlist__prepare_workload will send a SIGUSR1
2051 * if the fork fails, since we asked by setting its
2052 * want_signal to true.
2053 */
2054 static void workload_exec_failed_signal(int signo __maybe_unused,
2055 siginfo_t *info,
2056 void *ucontext __maybe_unused)
2057 {
2058 workload_exec_errno = info->si_value.sival_int;
2059 done = 1;
2060 child_finished = 1;
2061 }
2062
2063 static void snapshot_sig_handler(int sig);
2064 static void alarm_sig_handler(int sig);
2065
2066 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2067 {
2068 if (evlist) {
2069 if (evlist->mmap && evlist->mmap[0].core.base)
2070 return evlist->mmap[0].core.base;
2071 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2072 return evlist->overwrite_mmap[0].core.base;
2073 }
2074 return NULL;
2075 }
2076
2077 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2078 {
2079 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2080 if (pc)
2081 return pc;
2082 return NULL;
2083 }
2084
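/*
 * Synthesize the non-sample metadata for the session: time conversion,
 * id index, auxtrace info, kernel and module mmaps, extra attrs, thread
 * and cpu maps, BPF and cgroup events, and finally the existing threads
 * (possibly using multiple synthesis threads).
 */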
2085 static int record__synthesize(struct record *rec, bool tail)
2086 {
2087 struct perf_session *session = rec->session;
2088 struct machine *machine = &session->machines.host;
2089 struct perf_data *data = &rec->data;
2090 struct record_opts *opts = &rec->opts;
2091 struct perf_tool *tool = &rec->tool;
2092 int err = 0;
2093 event_op f = process_synthesized_event;
2094
2095 if (rec->opts.tail_synthesize != tail)
2096 return 0;
2097
2098 if (data->is_pipe) {
2099 err = perf_event__synthesize_for_pipe(tool, session, data,
2100 process_synthesized_event);
2101 if (err < 0)
2102 goto out;
2103
2104 rec->bytes_written += err;
2105 }
2106
2107 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2108 process_synthesized_event, machine);
2109 if (err)
2110 goto out;
2111
2112 /* Synthesize id_index before auxtrace_info */
2113 err = perf_event__synthesize_id_index(tool,
2114 process_synthesized_event,
2115 session->evlist, machine);
2116 if (err)
2117 goto out;
2118
2119 if (rec->opts.full_auxtrace) {
2120 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2121 session, process_synthesized_event);
2122 if (err)
2123 goto out;
2124 }
2125
2126 if (!evlist__exclude_kernel(rec->evlist)) {
2127 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2128 machine);
2129 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2130 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2131 "Check /proc/kallsyms permission or run as root.\n");
2132
2133 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2134 machine);
2135 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2136 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2137 "Check /proc/modules permission or run as root.\n");
2138 }
2139
2140 if (perf_guest) {
2141 machines__process_guests(&session->machines,
2142 perf_event__synthesize_guest_os, tool);
2143 }
2144
2145 err = perf_event__synthesize_extra_attr(&rec->tool,
2146 rec->evlist,
2147 process_synthesized_event,
2148 data->is_pipe);
2149 if (err)
2150 goto out;
2151
2152 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2153 process_synthesized_event,
2154 NULL);
2155 if (err < 0) {
2156 pr_err("Couldn't synthesize thread map.\n");
2157 return err;
2158 }
2159
2160 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2161 process_synthesized_event, NULL);
2162 if (err < 0) {
2163 pr_err("Couldn't synthesize cpu map.\n");
2164 return err;
2165 }
2166
2167 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2168 machine, opts);
2169 if (err < 0) {
2170 pr_warning("Couldn't synthesize bpf events.\n");
2171 err = 0;
2172 }
2173
2174 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2175 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2176 machine);
2177 if (err < 0) {
2178 pr_warning("Couldn't synthesize cgroup events.\n");
2179 err = 0;
2180 }
2181 }
2182
2183 if (rec->opts.nr_threads_synthesize > 1) {
2184 mutex_init(&synth_lock);
2185 perf_set_multithreaded();
2186 f = process_locked_synthesized_event;
2187 }
2188
2189 if (rec->opts.synth & PERF_SYNTH_TASK) {
2190 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2191
2192 err = __machine__synthesize_threads(machine, tool, &opts->target,
2193 rec->evlist->core.threads,
2194 f, needs_mmap, opts->sample_address,
2195 rec->opts.nr_threads_synthesize);
2196 }
2197
2198 if (rec->opts.nr_threads_synthesize > 1) {
2199 perf_set_singlethreaded();
2200 mutex_destroy(&synth_lock);
2201 }
2202
2203 out:
2204 return err;
2205 }
2206
2207 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2208 {
2209 #ifdef HAVE_LIBBPF_SUPPORT
2210 perf_event__synthesize_final_bpf_metadata(rec->session,
2211 process_synthesized_event);
2212 #endif
2213 }
2214
2215 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2216 {
2217 struct record *rec = data;
2218 pthread_kill(rec->thread_id, SIGUSR2);
2219 return 0;
2220 }
2221
2222 static int record__setup_sb_evlist(struct record *rec)
2223 {
2224 struct record_opts *opts = &rec->opts;
2225
2226 if (rec->sb_evlist != NULL) {
2227 /*
2228 * We get here if --switch-output-event populated the
2229 * sb_evlist, so associate a callback that will send a SIGUSR2
2230 * to the main thread.
2231 */
2232 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2233 rec->thread_id = pthread_self();
2234 }
2235 #ifdef HAVE_LIBBPF_SUPPORT
2236 if (!opts->no_bpf_event) {
2237 if (rec->sb_evlist == NULL) {
2238 rec->sb_evlist = evlist__new();
2239
2240 if (rec->sb_evlist == NULL) {
2241 				pr_err("Couldn't create side band evlist.\n");
2242 return -1;
2243 }
2244 }
2245
2246 if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2247 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2248 return -1;
2249 }
2250 }
2251 #endif
2252 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2253 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2254 opts->no_bpf_event = true;
2255 }
2256
2257 return 0;
2258 }
2259
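/*
 * When -k/--clockid is used, store the clockid and a reference pair of
 * (gettimeofday, clock_gettime) timestamps in the session env so the
 * perf clock can later be correlated with wall-clock time.
 */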
2260 static int record__init_clock(struct record *rec)
2261 {
2262 struct perf_session *session = rec->session;
2263 struct timespec ref_clockid;
2264 struct timeval ref_tod;
2265 struct perf_env *env = perf_session__env(session);
2266 u64 ref;
2267
2268 if (!rec->opts.use_clockid)
2269 return 0;
2270
2271 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2272 env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2273
2274 env->clock.clockid = rec->opts.clockid;
2275
2276 if (gettimeofday(&ref_tod, NULL) != 0) {
2277 pr_err("gettimeofday failed, cannot set reference time.\n");
2278 return -1;
2279 }
2280
2281 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2282 pr_err("clock_gettime failed, cannot set reference time.\n");
2283 return -1;
2284 }
2285
2286 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2287 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2288
2289 env->clock.tod_ns = ref;
2290
2291 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2292 (u64) ref_clockid.tv_nsec;
2293
2294 env->clock.clockid_ns = ref;
2295 return 0;
2296 }
2297
2298 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2299 {
2300 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2301 trigger_hit(&auxtrace_snapshot_trigger);
2302 auxtrace_record__snapshot_started = 1;
2303 if (auxtrace_record__snapshot_start(rec->itr))
2304 trigger_error(&auxtrace_snapshot_trigger);
2305 }
2306 }
2307
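/*
 * Ask a reader thread to stop by closing the write end of its msg pipe
 * (seen as POLLHUP in record__thread()) and wait for the final ack.
 */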
2308 static int record__terminate_thread(struct record_thread *thread_data)
2309 {
2310 int err;
2311 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2312 pid_t tid = thread_data->tid;
2313
2314 close(thread_data->pipes.msg[1]);
2315 thread_data->pipes.msg[1] = -1;
2316 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2317 if (err > 0)
2318 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2319 else
2320 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2321 thread->tid, tid);
2322
2323 return 0;
2324 }
2325
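/*
 * Spawn the parallel reader threads (detached, signals blocked, affinity
 * set from their masks where supported) and wait for each one's READY ack
 * before pinning the main thread to its own affinity mask.
 */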
2326 static int record__start_threads(struct record *rec)
2327 {
2328 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2329 struct record_thread *thread_data = rec->thread_data;
2330 sigset_t full, mask;
2331 pthread_t handle;
2332 pthread_attr_t attrs;
2333
2334 thread = &thread_data[0];
2335
2336 if (!record__threads_enabled(rec))
2337 return 0;
2338
2339 sigfillset(&full);
2340 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2341 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2342 return -1;
2343 }
2344
2345 pthread_attr_init(&attrs);
2346 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2347
2348 for (t = 1; t < nr_threads; t++) {
2349 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2350
2351 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2352 pthread_attr_setaffinity_np(&attrs,
2353 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2354 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2355 #endif
2356 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2357 for (tt = 1; tt < t; tt++)
2358 				record__terminate_thread(&thread_data[tt]);
2359 pr_err("Failed to start threads: %s\n", strerror(errno));
2360 ret = -1;
2361 goto out_err;
2362 }
2363
2364 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2365 if (err > 0)
2366 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2367 thread_msg_tags[msg]);
2368 else
2369 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2370 thread->tid, rec->thread_data[t].tid);
2371 }
2372
2373 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2374 (cpu_set_t *)thread->mask->affinity.bits);
2375
2376 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2377
2378 out_err:
2379 pthread_attr_destroy(&attrs);
2380
2381 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2382 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2383 ret = -1;
2384 }
2385
2386 return ret;
2387 }
2388
2389 static int record__stop_threads(struct record *rec)
2390 {
2391 int t;
2392 struct record_thread *thread_data = rec->thread_data;
2393
2394 for (t = 1; t < rec->nr_threads; t++)
2395 record__terminate_thread(&thread_data[t]);
2396
2397 for (t = 0; t < rec->nr_threads; t++) {
2398 rec->samples += thread_data[t].samples;
2399 if (!record__threads_enabled(rec))
2400 continue;
2401 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2402 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2403 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2404 thread_data[t].samples, thread_data[t].waking);
2405 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2406 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2407 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2408 else
2409 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2410 }
2411
2412 return 0;
2413 }
2414
2415 static unsigned long record__waking(struct record *rec)
2416 {
2417 int t;
2418 unsigned long waking = 0;
2419 struct record_thread *thread_data = rec->thread_data;
2420
2421 for (t = 0; t < rec->nr_threads; t++)
2422 waking += thread_data[t].waking;
2423
2424 return waking;
2425 }
2426
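/*
 * Main body of 'perf record': set up the session, open and mmap the
 * events, optionally fork the workload, then loop reading the ring
 * buffers until done, and finally finish (or switch) the output file.
 */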
2427 static int __cmd_record(struct record *rec, int argc, const char **argv)
2428 {
2429 int err;
2430 int status = 0;
2431 const bool forks = argc > 0;
2432 struct perf_tool *tool = &rec->tool;
2433 struct record_opts *opts = &rec->opts;
2434 struct perf_data *data = &rec->data;
2435 struct perf_session *session;
2436 bool disabled = false, draining = false;
2437 int fd;
2438 float ratio = 0;
2439 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2440 struct perf_env *env;
2441
2442 atexit(record__sig_exit);
2443 signal(SIGCHLD, sig_handler);
2444 signal(SIGINT, sig_handler);
2445 signal(SIGTERM, sig_handler);
2446 signal(SIGSEGV, sigsegv_handler);
2447
2448 if (rec->opts.record_cgroup) {
2449 #ifndef HAVE_FILE_HANDLE
2450 pr_err("cgroup tracking is not supported\n");
2451 return -1;
2452 #endif
2453 }
2454
2455 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2456 signal(SIGUSR2, snapshot_sig_handler);
2457 if (rec->opts.auxtrace_snapshot_mode)
2458 trigger_on(&auxtrace_snapshot_trigger);
2459 if (rec->switch_output.enabled)
2460 trigger_on(&switch_output_trigger);
2461 } else {
2462 signal(SIGUSR2, SIG_IGN);
2463 }
2464
2465 perf_tool__init(tool, /*ordered_events=*/true);
2466 tool->sample = process_sample_event;
2467 tool->fork = perf_event__process_fork;
2468 tool->exit = perf_event__process_exit;
2469 tool->comm = perf_event__process_comm;
2470 tool->namespaces = perf_event__process_namespaces;
2471 tool->mmap = build_id__process_mmap;
2472 tool->mmap2 = build_id__process_mmap2;
2473 tool->itrace_start = process_timestamp_boundary;
2474 tool->aux = process_timestamp_boundary;
2475 tool->namespace_events = rec->opts.record_namespaces;
2476 tool->cgroup_events = rec->opts.record_cgroup;
2477 session = perf_session__new(data, tool);
2478 if (IS_ERR(session)) {
2479 pr_err("Perf session creation failed.\n");
2480 return PTR_ERR(session);
2481 }
2482 env = perf_session__env(session);
2483 if (record__threads_enabled(rec)) {
2484 if (perf_data__is_pipe(&rec->data)) {
2485 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2486 return -1;
2487 }
2488 if (rec->opts.full_auxtrace) {
2489 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2490 return -1;
2491 }
2492 }
2493
2494 fd = perf_data__fd(data);
2495 rec->session = session;
2496
2497 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2498 pr_err("Compression initialization failed.\n");
2499 return -1;
2500 }
2501 #ifdef HAVE_EVENTFD_SUPPORT
2502 done_fd = eventfd(0, EFD_NONBLOCK);
2503 if (done_fd < 0) {
2504 pr_err("Failed to create wakeup eventfd, error: %m\n");
2505 status = -1;
2506 goto out_delete_session;
2507 }
2508 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2509 if (err < 0) {
2510 pr_err("Failed to add wakeup eventfd to poll list\n");
2511 status = err;
2512 goto out_delete_session;
2513 }
2514 #endif // HAVE_EVENTFD_SUPPORT
2515
2516 env->comp_type = PERF_COMP_ZSTD;
2517 env->comp_level = rec->opts.comp_level;
2518
2519 if (rec->opts.kcore &&
2520 !record__kcore_readable(&session->machines.host)) {
2521 pr_err("ERROR: kcore is not readable.\n");
2522 return -1;
2523 }
2524
2525 if (record__init_clock(rec))
2526 return -1;
2527
2528 record__init_features(rec);
2529
2530 if (forks) {
2531 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2532 workload_exec_failed_signal);
2533 if (err < 0) {
2534 pr_err("Couldn't run the workload!\n");
2535 status = err;
2536 goto out_delete_session;
2537 }
2538 }
2539
2540 /*
2541 	 * If we have just a single event and are sending data
2542 	 * through a pipe, we need to force the id allocation,
2543 	 * because we synthesize the event name through the pipe
2544 	 * and need the id for that.
2545 */
2546 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2547 rec->opts.sample_id = true;
2548
2549 if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2550 rec->timestamp_filename = false;
2551 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2552 }
2553
2554 /*
2555 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2556 * and hybrid_merge is false.
2557 */
2558 evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2559
2560 evlist__config(rec->evlist, opts, &callchain_param);
2561
2562 /* Debug message used by test scripts */
2563 pr_debug3("perf record opening and mmapping events\n");
2564 if (record__open(rec) != 0) {
2565 err = -1;
2566 goto out_free_threads;
2567 }
2568 /* Debug message used by test scripts */
2569 pr_debug3("perf record done opening and mmapping events\n");
2570 env->comp_mmap_len = session->evlist->core.mmap_len;
2571
2572 if (rec->opts.kcore) {
2573 err = record__kcore_copy(&session->machines.host, data);
2574 if (err) {
2575 pr_err("ERROR: Failed to copy kcore\n");
2576 goto out_free_threads;
2577 }
2578 }
2579
2580 /*
2581 * Normally perf_session__new would do this, but it doesn't have the
2582 * evlist.
2583 */
2584 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2585 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2586 rec->tool.ordered_events = false;
2587 }
2588
2589 if (evlist__nr_groups(rec->evlist) == 0)
2590 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2591
2592 if (data->is_pipe) {
2593 err = perf_header__write_pipe(fd);
2594 if (err < 0)
2595 goto out_free_threads;
2596 } else {
2597 err = perf_session__write_header(session, rec->evlist, fd, false);
2598 if (err < 0)
2599 goto out_free_threads;
2600 }
2601
2602 err = -1;
2603 if (!rec->no_buildid
2604 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2605 pr_err("Couldn't generate buildids. "
2606 "Use --no-buildid to profile anyway.\n");
2607 goto out_free_threads;
2608 }
2609
2610 if (!evlist__needs_bpf_sb_event(rec->evlist))
2611 opts->no_bpf_event = true;
2612
2613 err = record__setup_sb_evlist(rec);
2614 if (err)
2615 goto out_free_threads;
2616
2617 err = record__synthesize(rec, false);
2618 if (err < 0)
2619 goto out_free_threads;
2620
2621 if (rec->realtime_prio) {
2622 struct sched_param param;
2623
2624 param.sched_priority = rec->realtime_prio;
2625 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2626 pr_err("Could not set realtime priority.\n");
2627 err = -1;
2628 goto out_free_threads;
2629 }
2630 }
2631
2632 if (record__start_threads(rec))
2633 goto out_free_threads;
2634
2635 /*
2636 * When perf is starting the traced process, all the events
2637 * (apart from group members) have enable_on_exec=1 set,
2638 * so don't spoil it by prematurely enabling them.
2639 */
2640 if (!target__none(&opts->target) && !opts->target.initial_delay)
2641 evlist__enable(rec->evlist);
2642
2643 /*
2644 	 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2645 * when recording a workload, do it manually
2646 */
2647 if (rec->off_cpu)
2648 evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2649
2650 /*
2651 * Let the child rip
2652 */
2653 if (forks) {
2654 struct machine *machine = &session->machines.host;
2655 union perf_event *event;
2656 pid_t tgid;
2657
2658 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2659 if (event == NULL) {
2660 err = -ENOMEM;
2661 goto out_child;
2662 }
2663
2664 /*
2665 * Some H/W events are generated before COMM event
2666 * which is emitted during exec(), so perf script
2667 * cannot see a correct process name for those events.
2668 * Synthesize COMM event to prevent it.
2669 */
2670 tgid = perf_event__synthesize_comm(tool, event,
2671 rec->evlist->workload.pid,
2672 process_synthesized_event,
2673 machine);
2674 free(event);
2675
2676 if (tgid == -1)
2677 goto out_child;
2678
2679 event = malloc(sizeof(event->namespaces) +
2680 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2681 machine->id_hdr_size);
2682 if (event == NULL) {
2683 err = -ENOMEM;
2684 goto out_child;
2685 }
2686
2687 /*
2688 * Synthesize NAMESPACES event for the command specified.
2689 */
2690 perf_event__synthesize_namespaces(tool, event,
2691 rec->evlist->workload.pid,
2692 tgid, process_synthesized_event,
2693 machine);
2694 free(event);
2695
2696 evlist__start_workload(rec->evlist);
2697 }
2698
2699 if (opts->target.initial_delay) {
2700 pr_info(EVLIST_DISABLED_MSG);
2701 if (opts->target.initial_delay > 0) {
2702 usleep(opts->target.initial_delay * USEC_PER_MSEC);
2703 evlist__enable(rec->evlist);
2704 pr_info(EVLIST_ENABLED_MSG);
2705 }
2706 }
2707
2708 err = event_enable_timer__start(rec->evlist->eet);
2709 if (err)
2710 goto out_child;
2711
2712 /* Debug message used by test scripts */
2713 pr_debug3("perf record has started\n");
2714 fflush(stderr);
2715
2716 trigger_ready(&auxtrace_snapshot_trigger);
2717 trigger_ready(&switch_output_trigger);
2718 perf_hooks__invoke_record_start();
2719
2720 /*
2721 * Must write FINISHED_INIT so it will be seen after all other
2722 * synthesized user events, but before any regular events.
2723 */
2724 err = write_finished_init(rec, false);
2725 if (err < 0)
2726 goto out_child;
2727
2728 for (;;) {
2729 unsigned long long hits = thread->samples;
2730
2731 /*
2732 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2733 		 * here: when done == true and hits != rec->samples
2734 		 * in the previous round.
2735 		 *
2736 		 * evlist__toggle_bkw_mmap ensures we never
2737 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2738 */
2739 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2740 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2741
2742 if (record__mmap_read_all(rec, false) < 0) {
2743 trigger_error(&auxtrace_snapshot_trigger);
2744 trigger_error(&switch_output_trigger);
2745 err = -1;
2746 goto out_child;
2747 }
2748
2749 if (auxtrace_record__snapshot_started) {
2750 auxtrace_record__snapshot_started = 0;
2751 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2752 record__read_auxtrace_snapshot(rec, false);
2753 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2754 pr_err("AUX area tracing snapshot failed\n");
2755 err = -1;
2756 goto out_child;
2757 }
2758 }
2759
2760 if (trigger_is_hit(&switch_output_trigger)) {
2761 /*
2762 * If switch_output_trigger is hit, the data in
2763 * overwritable ring buffer should have been collected,
2764 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2765 *
2766 		 * If SIGUSR2 is raised after or during record__mmap_read_all(),
2767 		 * record__mmap_read_all() didn't collect data from the
2768 		 * overwritable ring buffer. Read again.
2769 */
2770 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2771 continue;
2772 trigger_ready(&switch_output_trigger);
2773
2774 /*
2775 * Reenable events in overwrite ring buffer after
2776 * record__mmap_read_all(): we should have collected
2777 * data from it.
2778 */
2779 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2780
2781 if (!quiet)
2782 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2783 record__waking(rec));
2784 thread->waking = 0;
2785 fd = record__switch_output(rec, false);
2786 if (fd < 0) {
2787 pr_err("Failed to switch to new file\n");
2788 trigger_error(&switch_output_trigger);
2789 err = fd;
2790 goto out_child;
2791 }
2792
2793 /* re-arm the alarm */
2794 if (rec->switch_output.time)
2795 alarm(rec->switch_output.time);
2796 }
2797
2798 if (hits == thread->samples) {
2799 if (done || draining)
2800 break;
2801 err = fdarray__poll(&thread->pollfd, -1);
2802 /*
2803 			 * Propagate the error only if there is one. A positive
2804 			 * number of returned events and EINTR are ignored.
2805 */
2806 if (err > 0 || (err < 0 && errno == EINTR))
2807 err = 0;
2808 thread->waking++;
2809
2810 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2811 record__thread_munmap_filtered, NULL) == 0)
2812 draining = true;
2813
2814 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2815 if (err)
2816 goto out_child;
2817 }
2818
2819 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2820 switch (cmd) {
2821 case EVLIST_CTL_CMD_SNAPSHOT:
2822 hit_auxtrace_snapshot_trigger(rec);
2823 evlist__ctlfd_ack(rec->evlist);
2824 break;
2825 case EVLIST_CTL_CMD_STOP:
2826 done = 1;
2827 break;
2828 case EVLIST_CTL_CMD_ACK:
2829 case EVLIST_CTL_CMD_UNSUPPORTED:
2830 case EVLIST_CTL_CMD_ENABLE:
2831 case EVLIST_CTL_CMD_DISABLE:
2832 case EVLIST_CTL_CMD_EVLIST:
2833 case EVLIST_CTL_CMD_PING:
2834 default:
2835 break;
2836 }
2837 }
2838
2839 err = event_enable_timer__process(rec->evlist->eet);
2840 if (err < 0)
2841 goto out_child;
2842 if (err) {
2843 err = 0;
2844 done = 1;
2845 }
2846
2847 /*
2848 * When perf is starting the traced process, at the end events
2849 * die with the process and we wait for that. Thus no need to
2850 * disable events in this case.
2851 */
2852 if (done && !disabled && !target__none(&opts->target)) {
2853 trigger_off(&auxtrace_snapshot_trigger);
2854 evlist__disable(rec->evlist);
2855 disabled = true;
2856 }
2857 }
2858
2859 trigger_off(&auxtrace_snapshot_trigger);
2860 trigger_off(&switch_output_trigger);
2861
2862 record__synthesize_final_bpf_metadata(rec);
2863
2864 if (opts->auxtrace_snapshot_on_exit)
2865 record__auxtrace_snapshot_exit(rec);
2866
2867 if (forks && workload_exec_errno) {
2868 char msg[STRERR_BUFSIZE];
2869 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2870 struct strbuf sb = STRBUF_INIT;
2871
2872 evlist__format_evsels(rec->evlist, &sb, 2048);
2873
2874 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2875 sb.buf, argv[0], emsg);
2876 strbuf_release(&sb);
2877 err = -1;
2878 goto out_child;
2879 }
2880
2881 if (!quiet)
2882 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2883 record__waking(rec));
2884
2885 write_finished_init(rec, true);
2886
2887 if (target__none(&rec->opts.target))
2888 record__synthesize_workload(rec, true);
2889
2890 out_child:
2891 record__stop_threads(rec);
2892 record__mmap_read_all(rec, true);
2893 out_free_threads:
2894 record__free_thread_data(rec);
2895 evlist__finalize_ctlfd(rec->evlist);
2896 record__aio_mmap_read_sync(rec);
2897
2898 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2899 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2900 env->comp_ratio = ratio + 0.5;
2901 }
2902
2903 if (forks) {
2904 int exit_status;
2905
2906 if (!child_finished)
2907 kill(rec->evlist->workload.pid, SIGTERM);
2908
2909 wait(&exit_status);
2910
2911 if (err < 0)
2912 status = err;
2913 else if (WIFEXITED(exit_status))
2914 status = WEXITSTATUS(exit_status);
2915 else if (WIFSIGNALED(exit_status))
2916 signr = WTERMSIG(exit_status);
2917 } else
2918 status = err;
2919
2920 if (rec->off_cpu)
2921 rec->bytes_written += off_cpu_write(rec->session);
2922
2923 record__read_lost_samples(rec);
2924 /* this will be recalculated during process_buildids() */
2925 rec->samples = 0;
2926
2927 if (!err) {
2928 record__synthesize(rec, true);
2929 if (!rec->timestamp_filename) {
2930 record__finish_output(rec);
2931 } else {
2932 fd = record__switch_output(rec, true);
2933 if (fd < 0) {
2934 status = fd;
2935 goto out_delete_session;
2936 }
2937 }
2938 }
2939
2940 perf_hooks__invoke_record_end();
2941
2942 if (!err && !quiet) {
2943 char samples[128];
2944 const char *postfix = rec->timestamp_filename ?
2945 ".<timestamp>" : "";
2946
2947 if (rec->samples && !rec->opts.full_auxtrace)
2948 scnprintf(samples, sizeof(samples),
2949 " (%" PRIu64 " samples)", rec->samples);
2950 else
2951 samples[0] = '\0';
2952
2953 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2954 perf_data__size(data) / 1024.0 / 1024.0,
2955 data->path, postfix, samples);
2956 if (ratio) {
2957 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2958 rec->session->bytes_transferred / 1024.0 / 1024.0,
2959 ratio);
2960 }
2961 fprintf(stderr, " ]\n");
2962 }
2963
2964 out_delete_session:
2965 #ifdef HAVE_EVENTFD_SUPPORT
2966 if (done_fd >= 0) {
2967 fd = done_fd;
2968 done_fd = -1;
2969
2970 close(fd);
2971 }
2972 #endif
2973 zstd_fini(&session->zstd_data);
2974 if (!opts->no_bpf_event)
2975 evlist__stop_sb_thread(rec->sb_evlist);
2976
2977 perf_session__delete(session);
2978 return status;
2979 }
2980
2981 static void callchain_debug(struct callchain_param *callchain)
2982 {
2983 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2984
2985 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2986
2987 if (callchain->record_mode == CALLCHAIN_DWARF)
2988 pr_debug("callchain: stack dump size %d\n",
2989 callchain->dump_size);
2990 }
2991
2992 int record_opts__parse_callchain(struct record_opts *record,
2993 struct callchain_param *callchain,
2994 const char *arg, bool unset)
2995 {
2996 int ret;
2997 callchain->enabled = !unset;
2998
2999 /* --no-call-graph */
3000 if (unset) {
3001 callchain->record_mode = CALLCHAIN_NONE;
3002 pr_debug("callchain: disabled\n");
3003 return 0;
3004 }
3005
3006 ret = parse_callchain_record_opt(arg, callchain);
3007 if (!ret) {
3008 /* Enable data address sampling for DWARF unwind. */
3009 if (callchain->record_mode == CALLCHAIN_DWARF)
3010 record->sample_address = true;
3011 callchain_debug(callchain);
3012 }
3013
3014 return ret;
3015 }
3016
3017 int record_parse_callchain_opt(const struct option *opt,
3018 const char *arg,
3019 int unset)
3020 {
3021 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
3022 }
3023
3024 int record_callchain_opt(const struct option *opt,
3025 const char *arg __maybe_unused,
3026 int unset __maybe_unused)
3027 {
3028 struct callchain_param *callchain = opt->value;
3029
3030 callchain->enabled = true;
3031
3032 if (callchain->record_mode == CALLCHAIN_NONE)
3033 callchain->record_mode = CALLCHAIN_FP;
3034
3035 callchain_debug(callchain);
3036 return 0;
3037 }
3038
3039 static int perf_record_config(const char *var, const char *value, void *cb)
3040 {
3041 struct record *rec = cb;
3042
3043 if (!strcmp(var, "record.build-id")) {
3044 if (!strcmp(value, "cache"))
3045 rec->no_buildid_cache = false;
3046 else if (!strcmp(value, "no-cache"))
3047 rec->no_buildid_cache = true;
3048 else if (!strcmp(value, "skip"))
3049 rec->no_buildid = rec->no_buildid_cache = true;
3050 else if (!strcmp(value, "mmap"))
3051 rec->buildid_mmap = true;
3052 else if (!strcmp(value, "no-mmap"))
3053 rec->buildid_mmap = false;
3054 else
3055 return -1;
3056 return 0;
3057 }
3058 if (!strcmp(var, "record.call-graph")) {
3059 var = "call-graph.record-mode";
3060 return perf_default_config(var, value, cb);
3061 }
3062 #ifdef HAVE_AIO_SUPPORT
3063 if (!strcmp(var, "record.aio")) {
3064 rec->opts.nr_cblocks = strtol(value, NULL, 0);
3065 if (!rec->opts.nr_cblocks)
3066 rec->opts.nr_cblocks = nr_cblocks_default;
3067 }
3068 #endif
3069 if (!strcmp(var, "record.debuginfod")) {
3070 rec->debuginfod.urls = strdup(value);
3071 if (!rec->debuginfod.urls)
3072 return -ENOMEM;
3073 rec->debuginfod.set = true;
3074 }
3075
3076 return 0;
3077 }
3078
3079 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3080 {
3081 struct record *rec = (struct record *)opt->value;
3082
3083 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3084 }
3085
3086 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3087 {
3088 struct record_opts *opts = (struct record_opts *)opt->value;
3089
3090 if (unset || !str)
3091 return 0;
3092
3093 if (!strcasecmp(str, "node"))
3094 opts->affinity = PERF_AFFINITY_NODE;
3095 else if (!strcasecmp(str, "cpu"))
3096 opts->affinity = PERF_AFFINITY_CPU;
3097
3098 return 0;
3099 }
3100
3101 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3102 {
3103 mask->nbits = nr_bits;
3104 mask->bits = bitmap_zalloc(mask->nbits);
3105 if (!mask->bits)
3106 return -ENOMEM;
3107
3108 return 0;
3109 }
3110
3111 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3112 {
3113 bitmap_free(mask->bits);
3114 mask->nbits = 0;
3115 }
3116
3117 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3118 {
3119 int ret;
3120
3121 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3122 if (ret) {
3123 mask->affinity.bits = NULL;
3124 return ret;
3125 }
3126
3127 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3128 if (ret) {
3129 record__mmap_cpu_mask_free(&mask->maps);
3130 mask->maps.bits = NULL;
3131 }
3132
3133 return ret;
3134 }
3135
3136 static void record__thread_mask_free(struct thread_mask *mask)
3137 {
3138 record__mmap_cpu_mask_free(&mask->maps);
3139 record__mmap_cpu_mask_free(&mask->affinity);
3140 }
3141
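/*
 * Parse the --threads option: an empty spec means one thread per CPU,
 * otherwise match one of the predefined specs (cpu/core/package/numa) or
 * treat the string as a user-provided mask/CPU list.
 */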
3142 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3143 {
3144 int s;
3145 struct record_opts *opts = opt->value;
3146
3147 if (unset || !str || !strlen(str)) {
3148 opts->threads_spec = THREAD_SPEC__CPU;
3149 } else {
3150 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3151 if (s == THREAD_SPEC__USER) {
3152 opts->threads_user_spec = strdup(str);
3153 if (!opts->threads_user_spec)
3154 return -ENOMEM;
3155 opts->threads_spec = THREAD_SPEC__USER;
3156 break;
3157 }
3158 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3159 opts->threads_spec = s;
3160 break;
3161 }
3162 }
3163 }
3164
3165 if (opts->threads_spec == THREAD_SPEC__USER)
3166 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3167 else
3168 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3169
3170 return 0;
3171 }
3172
3173 static int parse_output_max_size(const struct option *opt,
3174 const char *str, int unset)
3175 {
3176 unsigned long *s = (unsigned long *)opt->value;
3177 static struct parse_tag tags_size[] = {
3178 { .tag = 'B', .mult = 1 },
3179 { .tag = 'K', .mult = 1 << 10 },
3180 { .tag = 'M', .mult = 1 << 20 },
3181 { .tag = 'G', .mult = 1 << 30 },
3182 { .tag = 0 },
3183 };
3184 unsigned long val;
3185
3186 if (unset) {
3187 *s = 0;
3188 return 0;
3189 }
3190
3191 val = parse_tag_value(str, tags_size);
3192 if (val != (unsigned long) -1) {
3193 *s = val;
3194 return 0;
3195 }
3196
3197 return -1;
3198 }
3199
3200 static int record__parse_mmap_pages(const struct option *opt,
3201 const char *str,
3202 int unset __maybe_unused)
3203 {
3204 struct record_opts *opts = opt->value;
3205 char *s, *p;
3206 unsigned int mmap_pages;
3207 int ret;
3208
3209 if (!str)
3210 return -EINVAL;
3211
3212 s = strdup(str);
3213 if (!s)
3214 return -ENOMEM;
3215
3216 p = strchr(s, ',');
3217 if (p)
3218 *p = '\0';
3219
3220 if (*s) {
3221 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3222 if (ret)
3223 goto out_free;
3224 opts->mmap_pages = mmap_pages;
3225 }
3226
3227 if (!p) {
3228 ret = 0;
3229 goto out_free;
3230 }
3231
3232 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3233 if (ret)
3234 goto out_free;
3235
3236 opts->auxtrace_mmap_pages = mmap_pages;
3237
3238 out_free:
3239 free(s);
3240 return ret;
3241 }
3242
3243 static int record__parse_off_cpu_thresh(const struct option *opt,
3244 const char *str,
3245 int unset __maybe_unused)
3246 {
3247 struct record_opts *opts = opt->value;
3248 char *endptr;
3249 u64 off_cpu_thresh_ms;
3250
3251 if (!str)
3252 return -EINVAL;
3253
3254 off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3255
3256 	/* strtoull() returned 0 but the string isn't "0": parsing failed */
3257 if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3258 return -EINVAL;
3259 else
3260 opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3261
3262 return 0;
3263 }
3264
3265 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3266 {
3267 }
3268
3269 static int parse_control_option(const struct option *opt,
3270 const char *str,
3271 int unset __maybe_unused)
3272 {
3273 struct record_opts *opts = opt->value;
3274
3275 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3276 }
3277
3278 static void switch_output_size_warn(struct record *rec)
3279 {
3280 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3281 struct switch_output *s = &rec->switch_output;
3282
3283 wakeup_size /= 2;
3284
3285 if (s->size < wakeup_size) {
3286 char buf[100];
3287
3288 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3289 pr_warning("WARNING: switch-output data size lower than "
3290 "wakeup kernel buffer size (%s) "
3291 "expect bigger perf.data sizes\n", buf);
3292 }
3293 }
3294
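/*
 * Configure --switch-output: "signal", a size threshold (B/K/M/G) or a
 * time threshold (s/m/h/d). Implied as "signal" by --switch-output-event,
 * and not available in parallel streaming mode. Enabling it also turns on
 * timestamped output file names.
 */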
3295 static int switch_output_setup(struct record *rec)
3296 {
3297 struct switch_output *s = &rec->switch_output;
3298 static struct parse_tag tags_size[] = {
3299 { .tag = 'B', .mult = 1 },
3300 { .tag = 'K', .mult = 1 << 10 },
3301 { .tag = 'M', .mult = 1 << 20 },
3302 { .tag = 'G', .mult = 1 << 30 },
3303 { .tag = 0 },
3304 };
3305 static struct parse_tag tags_time[] = {
3306 { .tag = 's', .mult = 1 },
3307 { .tag = 'm', .mult = 60 },
3308 { .tag = 'h', .mult = 60*60 },
3309 { .tag = 'd', .mult = 60*60*24 },
3310 { .tag = 0 },
3311 };
3312 unsigned long val;
3313
3314 /*
3315 	 * If we're using --switch-output-events, then that implies
3316 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3317 * thread to its parent.
3318 */
3319 if (rec->switch_output_event_set) {
3320 if (record__threads_enabled(rec)) {
3321 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3322 return 0;
3323 }
3324 goto do_signal;
3325 }
3326
3327 if (!s->set)
3328 return 0;
3329
3330 if (record__threads_enabled(rec)) {
3331 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3332 return 0;
3333 }
3334
3335 if (!strcmp(s->str, "signal")) {
3336 do_signal:
3337 s->signal = true;
3338 pr_debug("switch-output with SIGUSR2 signal\n");
3339 goto enabled;
3340 }
3341
3342 val = parse_tag_value(s->str, tags_size);
3343 if (val != (unsigned long) -1) {
3344 s->size = val;
3345 pr_debug("switch-output with %s size threshold\n", s->str);
3346 goto enabled;
3347 }
3348
3349 val = parse_tag_value(s->str, tags_time);
3350 if (val != (unsigned long) -1) {
3351 s->time = val;
3352 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3353 s->str, s->time);
3354 goto enabled;
3355 }
3356
3357 return -1;
3358
3359 enabled:
3360 rec->timestamp_filename = true;
3361 s->enabled = true;
3362
3363 if (s->size && !rec->opts.no_buffering)
3364 switch_output_size_warn(rec);
3365
3366 return 0;
3367 }
3368
3369 static const char * const __record_usage[] = {
3370 "perf record [<options>] [<command>]",
3371 "perf record [<options>] -- <command> [<options>]",
3372 NULL
3373 };
3374 const char * const *record_usage = __record_usage;
3375
3376 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3377 struct perf_sample *sample, struct machine *machine)
3378 {
3379 /*
3380 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3381 * no need to add them twice.
3382 */
3383 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3384 return 0;
3385 return perf_event__process_mmap(tool, event, sample, machine);
3386 }
3387
3388 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3389 struct perf_sample *sample, struct machine *machine)
3390 {
3391 /*
3392 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3393 * no need to add them twice.
3394 */
3395 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3396 return 0;
3397
3398 return perf_event__process_mmap2(tool, event, sample, machine);
3399 }
3400
3401 static int process_timestamp_boundary(const struct perf_tool *tool,
3402 union perf_event *event __maybe_unused,
3403 struct perf_sample *sample,
3404 struct machine *machine __maybe_unused)
3405 {
3406 struct record *rec = container_of(tool, struct record, tool);
3407
3408 set_timestamp_boundary(rec, sample->time);
3409 return 0;
3410 }
3411
3412 static int parse_record_synth_option(const struct option *opt,
3413 const char *str,
3414 int unset __maybe_unused)
3415 {
3416 struct record_opts *opts = opt->value;
3417 char *p = strdup(str);
3418
3419 if (p == NULL)
3420 return -1;
3421
3422 opts->synth = parse_synth_opt(p);
3423 free(p);
3424
3425 if (opts->synth < 0) {
3426 pr_err("Invalid synth option: %s\n", str);
3427 return -1;
3428 }
3429 return 0;
3430 }
3431
3432 /*
3433 * XXX Ideally would be local to cmd_record() and passed to a record__new
3434 * because we need to have access to it in record__exit, that is called
3435 * after cmd_record() exits, but since record_options need to be accessible to
3436 * builtin-script, leave it here.
3437 *
3438 * At least we don't ouch it in all the other functions here directly.
3439 *
3440 * Just say no to tons of global variables, sigh.
3441 */
3442 static struct record record = {
3443 .opts = {
3444 .sample_time = true,
3445 .mmap_pages = UINT_MAX,
3446 .user_freq = UINT_MAX,
3447 .user_interval = ULLONG_MAX,
3448 .freq = 4000,
3449 .target = {
3450 .uses_mmap = true,
3451 .default_per_cpu = true,
3452 },
3453 .mmap_flush = MMAP_FLUSH_DEFAULT,
3454 .nr_threads_synthesize = 1,
3455 .ctl_fd = -1,
3456 .ctl_fd_ack = -1,
3457 .synth = PERF_SYNTH_ALL,
3458 .off_cpu_thresh_ns = OFFCPU_THRESH,
3459 },
3460 .buildid_mmap = true,
3461 };
3462
3463 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3464 "\n\t\t\t\tDefault: fp";
3465
3466 static bool dry_run;
3467
3468 static struct parse_events_option_args parse_events_option_args = {
3469 .evlistp = &record.evlist,
3470 };
3471
3472 static struct parse_events_option_args switch_output_parse_events_option_args = {
3473 .evlistp = &record.sb_evlist,
3474 };
3475
3476 /*
3477 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3478 * with it and switch to use the library functions in perf_evlist that came
3479 * from builtin-record.c, i.e. use record_opts,
3480 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3481 * using pipes, etc.
3482 */
3483 static struct option __record_options[] = {
3484 OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3485 "event selector. use 'perf list' to list available events",
3486 parse_events_option),
3487 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3488 "event filter", parse_filter),
3489 OPT_BOOLEAN(0, "latency", &record.latency,
3490 "Enable data collection for latency profiling.\n"
3491 "\t\t\t Use perf report --latency for latency-centric profile."),
3492 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3493 NULL, "don't record events from perf itself",
3494 exclude_perf),
3495 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3496 "record events on existing process id"),
3497 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3498 "record events on existing thread id"),
3499 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3500 "collect data with this RT SCHED_FIFO priority"),
3501 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3502 "collect data without buffering"),
3503 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3504 "collect raw sample records from all opened counters"),
3505 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3506 "system-wide collection from all CPUs"),
3507 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3508 "list of cpus to monitor"),
3509 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3510 OPT_STRING('o', "output", &record.data.path, "file",
3511 "output file name"),
3512 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3513 &record.opts.no_inherit_set,
3514 "child tasks do not inherit counters"),
3515 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3516 "synthesize non-sample events at the end of output"),
3517 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3518 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3519 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3520 "Fail if the specified frequency can't be used"),
3521 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3522 "profile at this frequency",
3523 record__parse_freq),
3524 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3525 "number of mmap data pages and AUX area tracing mmap pages",
3526 record__parse_mmap_pages),
3527 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3528 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3529 record__mmap_flush_parse),
3530 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3531 NULL, "enables call-graph recording" ,
3532 &record_callchain_opt),
3533 OPT_CALLBACK(0, "call-graph", &record.opts,
3534 "record_mode[,record_size]", record_callchain_help,
3535 &record_parse_callchain_opt),
3536 OPT_INCR('v', "verbose", &verbose,
3537 "be more verbose (show counter open errors, etc)"),
3538 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3539 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3540 "per thread counts"),
3541 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3542 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3543 "Record the sample physical addresses"),
3544 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3545 "Record the sampled data address data page size"),
3546 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3547 "Record the sampled code address (ip) page size"),
3548 OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3549 "Record the data source for memory operations"),
3550 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3551 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3552 "Record the sample identifier"),
3553 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3554 &record.opts.sample_time_set,
3555 "Record the sample timestamps"),
3556 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3557 "Record the sample period"),
3558 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3559 "don't sample"),
3560 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3561 &record.no_buildid_cache_set,
3562 "do not update the buildid cache"),
3563 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3564 &record.no_buildid_set,
3565 "do not collect buildids in perf.data"),
3566 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3567 "monitor event in cgroup name only",
3568 parse_cgroups),
3569 OPT_CALLBACK('D', "delay", &record, "ms",
3570 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3571 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3572 record__parse_event_enable_time),
3573 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3574 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3575
3576 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3577 "branch any", "sample any taken branches",
3578 parse_branch_stack),
3579
3580 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3581 "branch filter mask", "branch stack filter modes",
3582 parse_branch_stack),
3583 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3584 "sample by weight (on special events only)"),
3585 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3586 "sample transaction flags (special events only)"),
3587 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3588 "use per-thread mmaps"),
3589 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3590 "sample selected machine registers on interrupt,"
3591 " use '-I?' to list register names", parse_intr_regs),
3592 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3593 "sample selected machine registers in user space,"
3594 " use '--user-regs=?' to list register names", parse_user_regs),
3595 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3596 "Record running/enabled time of read (:S) events"),
3597 OPT_CALLBACK('k', "clockid", &record.opts,
3598 "clockid", "clockid to use for events, see clock_gettime()",
3599 parse_clockid),
3600 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3601 "opts", "AUX area tracing Snapshot Mode", ""),
3602 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3603 "opts", "sample AUX area", ""),
3604 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3605 "per thread proc mmap processing timeout in ms"),
3606 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3607 "Record namespaces events"),
3608 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3609 "Record cgroup events"),
3610 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3611 &record.opts.record_switch_events_set,
3612 "Record context switch events"),
3613 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3614 "Configure all used events to run in kernel space.",
3615 PARSE_OPT_EXCLUSIVE),
3616 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3617 "Configure all used events to run in user space.",
3618 PARSE_OPT_EXCLUSIVE),
3619 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3620 "collect kernel callchains"),
3621 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3622 "collect user callchains"),
3623 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3624 "file", "vmlinux pathname"),
3625 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3626 "Record build-id of all DSOs regardless of hits"),
3627 OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
3628 "Record build-id in mmap events and skip build-id processing."),
3629 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3630 "append timestamp to output filename"),
3631 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3632 "Record timestamp boundary (time of first/last samples)"),
3633 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3634 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3635 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3636 "signal"),
3637 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3638 &record.switch_output_event_set, "switch output event",
3639 "switch output event selector. use 'perf list' to list available events",
3640 parse_events_option_new_evlist),
3641 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3642 "Limit number of switch output generated files"),
3643 OPT_BOOLEAN(0, "dry-run", &dry_run,
3644 "Parse options then exit"),
3645 #ifdef HAVE_AIO_SUPPORT
3646 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3647 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3648 record__aio_parse),
3649 #endif
3650 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3651 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3652 record__parse_affinity),
3653 #ifdef HAVE_ZSTD_SUPPORT
3654 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3655 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3656 record__parse_comp_level),
3657 #endif
3658 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3659 "size", "Limit the maximum size of the output file", parse_output_max_size),
3660 OPT_UINTEGER(0, "num-thread-synthesize",
3661 &record.opts.nr_threads_synthesize,
3662 "number of threads to run for event synthesis"),
3663 #ifdef HAVE_LIBPFM
3664 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3665 "libpfm4 event selector. use 'perf list' to list available events",
3666 parse_libpfm_events_option),
3667 #endif
3668 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3669 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3670 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3671 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3672 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3673 parse_control_option),
3674 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3675 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3676 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3677 &record.debuginfod.set, "debuginfod urls",
3678 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3679 "system"),
3680 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3681 "write collected trace data into several data files using parallel threads",
3682 record__parse_threads),
3683 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3684 OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3685 "BPF filter action"),
3686 OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3687 "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3688 record__parse_off_cpu_thresh),
3689 OPT_END()
3690 };
3691
3692 struct option *record_options = __record_options;
3693
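/*
 * Set a bit in @mask for every CPU in @cpus. A dummy CPU map (e.g. for
 * per-thread targets) leaves the mask empty; a CPU number beyond the
 * mask size is reported as -ENODEV.
 */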
3694 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3695 {
3696 struct perf_cpu cpu;
3697 int idx;
3698
3699 if (cpu_map__is_dummy(cpus))
3700 return 0;
3701
3702 perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3703 /* Return -ENODEV if the input cpu is greater than the max cpu */
3704 if ((unsigned long)cpu.cpu > mask->nbits)
3705 return -ENODEV;
3706 __set_bit(cpu.cpu, mask->bits);
3707 }
3708
3709 return 0;
3710 }
3711
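/*
 * Fill @mask from a CPU list specification string (e.g. "0-3,7"). The
 * temporary CPU map is dropped once the bitmap has been filled.
 */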
3712 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3713 {
3714 struct perf_cpu_map *cpus;
3715
3716 cpus = perf_cpu_map__new(mask_spec);
3717 if (!cpus)
3718 return -ENOMEM;
3719
3720 bitmap_zero(mask->bits, mask->nbits);
3721 if (record__mmap_cpu_mask_init(mask, cpus)) {
perf_cpu_map__put(cpus); /* do not leak the CPU map on the error path */
3722 return -ENODEV;
}
3723
3724 perf_cpu_map__put(cpus);
3725
3726 return 0;
3727 }
3728
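/* Free the maps/affinity bitmaps of each thread mask and the thread_masks array itself. */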
3729 static void record__free_thread_masks(struct record *rec, int nr_threads)
3730 {
3731 int t;
3732
3733 if (rec->thread_masks)
3734 for (t = 0; t < nr_threads; t++)
3735 record__thread_mask_free(&rec->thread_masks[t]);
3736
3737 zfree(&rec->thread_masks);
3738 }
3739
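/*
 * Allocate @nr_threads thread masks, each with maps and affinity bitmaps
 * of @nr_bits bits. On failure, everything allocated so far is released.
 */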
3740 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3741 {
3742 int t, ret;
3743
3744 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3745 if (!rec->thread_masks) {
3746 pr_err("Failed to allocate thread masks\n");
3747 return -ENOMEM;
3748 }
3749
3750 for (t = 0; t < nr_threads; t++) {
3751 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3752 if (ret) {
3753 pr_err("Failed to allocate thread masks[%d]\n", t);
3754 goto out_free;
3755 }
3756 }
3757
3758 return 0;
3759
3760 out_free:
3761 record__free_thread_masks(rec, nr_threads);
3762
3763 return ret;
3764 }
3765
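/*
 * --threads=cpu: one data streaming thread per CPU of the evlist CPU map,
 * with both the mmaps it reads and its affinity limited to that CPU.
 */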
3766 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3767 {
3768 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3769
3770 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3771 if (ret)
3772 return ret;
3773
3774 rec->nr_threads = nr_cpus;
3775 pr_debug("nr_threads: %d\n", rec->nr_threads);
3776
3777 for (t = 0; t < rec->nr_threads; t++) {
3778 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3779 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3780 if (verbose > 0) {
3781 pr_debug("thread_masks[%d]: ", t);
3782 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3783 pr_debug("thread_masks[%d]: ", t);
3784 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3785 }
3786 }
3787
3788 return 0;
3789 }
3790
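/*
 * Build rec->thread_masks[] from parallel arrays of maps/affinity CPU list
 * specs. CPUs outside the evlist CPU map are ignored, but a spec that ends
 * up empty or that overlaps a previously accepted one is rejected.
 */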
3791 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3792 const char **maps_spec, const char **affinity_spec,
3793 u32 nr_spec)
3794 {
3795 u32 s;
3796 int ret = 0, t = 0;
3797 struct mmap_cpu_mask cpus_mask;
3798 struct thread_mask thread_mask, full_mask, *thread_masks;
3799
3800 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3801 if (ret) {
3802 pr_err("Failed to allocate CPUs mask\n");
3803 return ret;
3804 }
3805
3806 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3807 if (ret) {
3808 pr_err("Failed to init cpu mask\n");
3809 goto out_free_cpu_mask;
3810 }
3811
3812 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3813 if (ret) {
3814 pr_err("Failed to allocate full mask\n");
3815 goto out_free_cpu_mask;
3816 }
3817
3818 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3819 if (ret) {
3820 pr_err("Failed to allocate thread mask\n");
3821 goto out_free_full_and_cpu_masks;
3822 }
3823
3824 for (s = 0; s < nr_spec; s++) {
3825 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3826 if (ret) {
3827 pr_err("Failed to initialize maps thread mask\n");
3828 goto out_free;
3829 }
3830 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3831 if (ret) {
3832 pr_err("Failed to initialize affinity thread mask\n");
3833 goto out_free;
3834 }
3835
3836 /* ignore invalid CPUs but do not allow empty masks */
3837 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3838 cpus_mask.bits, thread_mask.maps.nbits)) {
3839 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3840 ret = -EINVAL;
3841 goto out_free;
3842 }
3843 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3844 cpus_mask.bits, thread_mask.affinity.nbits)) {
3845 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3846 ret = -EINVAL;
3847 goto out_free;
3848 }
3849
3850 /* do not allow intersection with other masks (full_mask) */
3851 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3852 thread_mask.maps.nbits)) {
3853 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3854 ret = -EINVAL;
3855 goto out_free;
3856 }
3857 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3858 thread_mask.affinity.nbits)) {
3859 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3860 ret = -EINVAL;
3861 goto out_free;
3862 }
3863
3864 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3865 thread_mask.maps.bits, full_mask.maps.nbits);
3866 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3867 thread_mask.affinity.bits, full_mask.affinity.nbits);
3868
3869 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3870 if (!thread_masks) {
3871 pr_err("Failed to reallocate thread masks\n");
3872 ret = -ENOMEM;
3873 goto out_free;
3874 }
3875 rec->thread_masks = thread_masks;
3876 rec->thread_masks[t] = thread_mask;
3877 if (verbose > 0) {
3878 pr_debug("thread_masks[%d]: ", t);
3879 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3880 pr_debug("thread_masks[%d]: ", t);
3881 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3882 }
3883 t++;
3884 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3885 if (ret) {
3886 pr_err("Failed to allocate thread mask\n");
3887 goto out_free_full_and_cpu_masks;
3888 }
3889 }
3890 rec->nr_threads = t;
3891 pr_debug("nr_threads: %d\n", rec->nr_threads);
3892 if (!rec->nr_threads)
3893 ret = -EINVAL;
3894
3895 out_free:
3896 record__thread_mask_free(&thread_mask);
3897 out_free_full_and_cpu_masks:
3898 record__thread_mask_free(&full_mask);
3899 out_free_cpu_mask:
3900 record__mmap_cpu_mask_free(&cpus_mask);
3901
3902 return ret;
3903 }
3904
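/* --threads=core: one data streaming thread per core, based on the core CPU lists of the CPU topology. */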
3905 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3906 {
3907 int ret;
3908 struct cpu_topology *topo;
3909
3910 topo = cpu_topology__new();
3911 if (!topo) {
3912 pr_err("Failed to allocate CPU topology\n");
3913 return -ENOMEM;
3914 }
3915
3916 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3917 topo->core_cpus_list, topo->core_cpus_lists);
3918 cpu_topology__delete(topo);
3919
3920 return ret;
3921 }
3922
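/* --threads=package: one data streaming thread per package (socket), based on the package CPU lists. */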
3923 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3924 {
3925 int ret;
3926 struct cpu_topology *topo;
3927
3928 topo = cpu_topology__new();
3929 if (!topo) {
3930 pr_err("Failed to allocate CPU topology\n");
3931 return -ENOMEM;
3932 }
3933
3934 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3935 topo->package_cpus_list, topo->package_cpus_lists);
3936 cpu_topology__delete(topo);
3937
3938 return ret;
3939 }
3940
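/* --threads=numa: one data streaming thread per NUMA node, using each node's CPU list for both maps and affinity. */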
3941 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3942 {
3943 u32 s;
3944 int ret;
3945 const char **spec;
3946 struct numa_topology *topo;
3947
3948 topo = numa_topology__new();
3949 if (!topo) {
3950 pr_err("Failed to allocate NUMA topology\n");
3951 return -ENOMEM;
3952 }
3953
3954 spec = zalloc(topo->nr * sizeof(char *));
3955 if (!spec) {
3956 pr_err("Failed to allocate NUMA spec\n");
3957 ret = -ENOMEM;
3958 goto out_delete_topo;
3959 }
3960 for (s = 0; s < topo->nr; s++)
3961 spec[s] = topo->nodes[s].cpus;
3962
3963 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3964
3965 zfree(&spec);
3966
3967 out_delete_topo:
3968 numa_topology__delete(topo);
3969
3970 return ret;
3971 }
3972
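/*
 * --threads=<maps>/<affinity>[:<maps>/<affinity>...]: user defined masks,
 * e.g. "0-3/0-3:4-7/4-7" creates two threads, each reading the mmaps of and
 * pinned to its own CPU range. Specs are split on ':', then on '/'.
 */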
3973 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3974 {
3975 int t, ret;
3976 u32 s, nr_spec = 0;
3977 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3978 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3979
3980 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3981 spec = strtok_r(user_spec, ":", &spec_ptr);
3982 if (spec == NULL)
3983 break;
3984 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3985 mask = strtok_r(spec, "/", &mask_ptr);
3986 if (mask == NULL)
3987 break;
3988 pr_debug2(" maps mask: %s\n", mask);
3989 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3990 if (!tmp_spec) {
3991 pr_err("Failed to reallocate maps spec\n");
3992 ret = -ENOMEM;
3993 goto out_free;
3994 }
3995 maps_spec = tmp_spec;
3996 maps_spec[nr_spec] = dup_mask = strdup(mask);
3997 if (!maps_spec[nr_spec]) {
3998 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3999 ret = -ENOMEM;
4000 goto out_free;
4001 }
4002 mask = strtok_r(NULL, "/", &mask_ptr);
4003 if (mask == NULL) {
4004 pr_err("Invalid thread maps or affinity specs\n");
4005 ret = -EINVAL;
4006 goto out_free;
4007 }
4008 pr_debug2(" affinity mask: %s\n", mask);
4009 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
4010 if (!tmp_spec) {
4011 pr_err("Failed to reallocate affinity spec\n");
4012 ret = -ENOMEM;
4013 goto out_free;
4014 }
4015 affinity_spec = tmp_spec;
4016 affinity_spec[nr_spec] = strdup(mask);
4017 if (!affinity_spec[nr_spec]) {
4018 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
4019 ret = -ENOMEM;
4020 goto out_free;
4021 }
4022 dup_mask = NULL;
4023 nr_spec++;
4024 }
4025
4026 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
4027 (const char **)affinity_spec, nr_spec);
4028
4029 out_free:
4030 free(dup_mask);
4031 for (s = 0; s < nr_spec; s++) {
4032 if (maps_spec)
4033 free(maps_spec[s]);
4034 if (affinity_spec)
4035 free(affinity_spec[s]);
4036 }
4037 free(affinity_spec);
4038 free(maps_spec);
4039
4040 return ret;
4041 }
4042
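/* Default (single threaded) mode: one mask covering all CPUs of the evlist. */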
4043 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4044 {
4045 int ret;
4046
4047 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4048 if (ret)
4049 return ret;
4050
4051 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus)) {
record__free_thread_masks(rec, 1); /* do not leak the mask allocated above */
4052 return -ENODEV;
}
4053
4054 rec->nr_threads = 1;
4055
4056 return 0;
4057 }
4058
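/*
 * Set up rec->thread_masks according to the --threads specification.
 * Parallel streaming is CPU based and is rejected for --per-thread mmaps.
 */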
4059 static int record__init_thread_masks(struct record *rec)
4060 {
4061 int ret = 0;
4062 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4063
4064 if (!record__threads_enabled(rec))
4065 return record__init_thread_default_masks(rec, cpus);
4066
4067 if (evlist__per_thread(rec->evlist)) {
4068 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4069 return -EINVAL;
4070 }
4071
4072 switch (rec->opts.threads_spec) {
4073 case THREAD_SPEC__CPU:
4074 ret = record__init_thread_cpu_masks(rec, cpus);
4075 break;
4076 case THREAD_SPEC__CORE:
4077 ret = record__init_thread_core_masks(rec, cpus);
4078 break;
4079 case THREAD_SPEC__PACKAGE:
4080 ret = record__init_thread_package_masks(rec, cpus);
4081 break;
4082 case THREAD_SPEC__NUMA:
4083 ret = record__init_thread_numa_masks(rec, cpus);
4084 break;
4085 case THREAD_SPEC__USER:
4086 ret = record__init_thread_user_masks(rec, cpus);
4087 break;
4088 default:
4089 break;
4090 }
4091
4092 return ret;
4093 }
4094
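/* Entry point of 'perf record': parse and validate options, set up the event list and thread masks, then run __cmd_record(). */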
4095 int cmd_record(int argc, const char **argv)
4096 {
4097 int err;
4098 struct record *rec = &record;
4099 char errbuf[BUFSIZ];
4100
4101 setlocale(LC_ALL, "");
4102
4103 #ifndef HAVE_BPF_SKEL
4104 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4105 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4106 # undef set_nobuild
4107 #endif
4108
4109 /* Disable eager loading of kernel symbols that adds overhead to perf record. */
4110 symbol_conf.lazy_load_kernel_maps = true;
4111 rec->opts.affinity = PERF_AFFINITY_SYS;
4112
4113 rec->evlist = evlist__new();
4114 if (rec->evlist == NULL)
4115 return -ENOMEM;
4116
4117 err = perf_config(perf_record_config, rec);
4118 if (err)
4119 return err;
4120
4121 argc = parse_options(argc, argv, record_options, record_usage,
4122 PARSE_OPT_STOP_AT_NON_OPTION);
4123 if (quiet)
4124 perf_quiet_option();
4125
4126 err = symbol__validate_sym_arguments();
4127 if (err)
4128 return err;
4129
4130 perf_debuginfod_setup(&record.debuginfod);
4131
4132 /* Make system wide (-a) the default target. */
4133 if (!argc && target__none(&rec->opts.target))
4134 rec->opts.target.system_wide = true;
4135
4136 if (nr_cgroups && !rec->opts.target.system_wide) {
4137 usage_with_options_msg(record_usage, record_options,
4138 "cgroup monitoring only available in system-wide mode");
4140 }
4141
4142 if (record.latency) {
4143 /*
4144 * There is no fundamental reason why latency profiling
4145 * can't work for system-wide mode, but exact semantics
4146 * and details are to be defined.
4147 * See the following thread for details:
4148 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4149 */
4150 if (record.opts.target.system_wide) {
4151 pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4152 err = -EINVAL;
4153 goto out_opts;
4154 }
4155 record.opts.record_switch_events = true;
4156 }
4157
4158 if (rec->buildid_mmap && !perf_can_record_build_id()) {
4159 pr_warning("Missing support for build id in kernel mmap events.\n"
4160 "Disable this warning with --no-buildid-mmap\n");
4161 rec->buildid_mmap = false;
4162 }
4163
4164 if (rec->buildid_mmap) {
4165 /* Enable perf_event_attr::build_id bit. */
4166 rec->opts.build_id = true;
4167 /* Disable build-ID table in the header. */
4168 rec->no_buildid = true;
4169 } else {
4170 pr_debug("Disabling build id in synthesized mmap2 events.\n");
4171 symbol_conf.no_buildid_mmap2 = true;
4172 }
4173
4174 if (rec->no_buildid_set && rec->no_buildid) {
4175 /* -B implies -N for historic reasons. */
4176 rec->no_buildid_cache = true;
4177 }
4178
4179 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4180 pr_err("Kernel has no cgroup sampling support.\n");
4181 err = -EINVAL;
4182 goto out_opts;
4183 }
4184
4185 if (rec->opts.kcore)
4186 rec->opts.text_poke = true;
4187
4188 if (rec->opts.kcore || record__threads_enabled(rec))
4189 rec->data.is_dir = true;
4190
4191 if (record__threads_enabled(rec)) {
4192 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4193 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
err = -EINVAL; /* report the option conflict instead of exiting with success */
4194 goto out_opts;
4195 }
4196 if (record__aio_enabled(rec)) {
4197 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
err = -EINVAL; /* report the option conflict instead of exiting with success */
4198 goto out_opts;
4199 }
4200 }
4201
4202 if (rec->opts.comp_level != 0) {
4203 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4204 rec->no_buildid = true;
4205 }
4206
4207 if (rec->opts.record_switch_events &&
4208 !perf_can_record_switch_events()) {
4209 ui__error("kernel does not support recording context switch events\n");
4210 parse_options_usage(record_usage, record_options, "switch-events", 0);
4211 err = -EINVAL;
4212 goto out_opts;
4213 }
4214
4215 if (switch_output_setup(rec)) {
4216 parse_options_usage(record_usage, record_options, "switch-output", 0);
4217 err = -EINVAL;
4218 goto out_opts;
4219 }
4220
4221 if (rec->switch_output.time) {
4222 signal(SIGALRM, alarm_sig_handler);
4223 alarm(rec->switch_output.time);
4224 }
4225
4226 if (rec->switch_output.num_files) {
4227 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4228 sizeof(char *));
4229 if (!rec->switch_output.filenames) {
4230 err = -EINVAL;
4231 goto out_opts;
4232 }
4233 }
4234
4235 if (rec->timestamp_filename && record__threads_enabled(rec)) {
4236 rec->timestamp_filename = false;
4237 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4238 }
4239
4240 if (rec->filter_action) {
4241 if (!strcmp(rec->filter_action, "pin"))
4242 err = perf_bpf_filter__pin();
4243 else if (!strcmp(rec->filter_action, "unpin"))
4244 err = perf_bpf_filter__unpin();
4245 else {
4246 pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4247 err = -EINVAL;
4248 }
4249 goto out_opts;
4250 }
4251
4252 /* For backward compatibility, -d implies --mem-info */
4253 if (rec->opts.sample_address)
4254 rec->opts.sample_data_src = true;
4255
4256 /*
4257 * Allow aliases to facilitate the lookup of symbols for address
4258 * filters. Refer to auxtrace_parse_filters().
4259 */
4260 symbol_conf.allow_aliases = true;
4261
4262 symbol__init(NULL);
4263
4264 err = record__auxtrace_init(rec);
4265 if (err)
4266 goto out;
4267
4268 if (dry_run)
4269 goto out;
4270
4271 err = -ENOMEM;
4272
4273 if (rec->no_buildid_cache) {
4274 disable_buildid_cache();
4275 } else if (rec->switch_output.enabled) {
4276 /*
4277 * In 'perf record --switch-output', disable buildid
4278 * generation by default to reduce data file switching
4279 * overhead. Still generate buildid if they are required
4280 * explicitly using
4281 *
4282 * perf record --switch-output --no-no-buildid \
4283 * --no-no-buildid-cache
4284 *
4285 * Following code equals to:
4286 *
4287 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4288 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4289 * disable_buildid_cache();
4290 */
4291 bool disable = true;
4292
4293 if (rec->no_buildid_set && !rec->no_buildid)
4294 disable = false;
4295 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4296 disable = false;
4297 if (disable) {
4298 rec->no_buildid = true;
4299 rec->no_buildid_cache = true;
4300 disable_buildid_cache();
4301 }
4302 }
4303
4304 if (record.opts.overwrite)
4305 record.opts.tail_synthesize = true;
4306
4307 if (rec->evlist->core.nr_entries == 0) {
4308 struct evlist *def_evlist = evlist__new_default();
4309
4310 if (!def_evlist)
4311 goto out;
4312
4313 evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries);
4314 evlist__delete(def_evlist);
4315 }
4316
4317 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4318 rec->opts.no_inherit = true;
4319
4320 err = target__validate(&rec->opts.target);
4321 if (err) {
4322 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4323 ui__warning("%s\n", errbuf);
4324 }
4325
4326 if (rec->uid_str) {
4327 uid_t uid = parse_uid(rec->uid_str);
4328
4329 if (uid == UINT_MAX) {
4330 ui__error("Invalid User: %s", rec->uid_str);
4331 err = -EINVAL;
4332 goto out;
4333 }
4334 err = parse_uid_filter(rec->evlist, uid);
4335 if (err)
4336 goto out;
4337
4338 /* User ID filtering implies system wide. */
4339 rec->opts.target.system_wide = true;
4340 }
4341
4342 /* Enable ignoring missing threads when -p option is defined. */
4343 rec->opts.ignore_missing_thread = rec->opts.target.pid;
4344
4345 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4346
4347 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4348 arch__add_leaf_frame_record_opts(&rec->opts);
4349
4350 err = -ENOMEM;
4351 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4352 if (rec->opts.target.pid != NULL) {
4353 pr_err("Couldn't create thread/CPU maps: %s\n",
4354 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4355 goto out;
4356 } else
4358 usage_with_options(record_usage, record_options);
4359 }
4360
4361 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4362 if (err)
4363 goto out;
4364
4365 /*
4366 * We take all buildids when the file contains
4367 * AUX area tracing data because we do not decode the
4368 * trace because it would take too long.
4369 */
4370 if (rec->opts.full_auxtrace)
4371 rec->buildid_all = true;
4372
4373 if (rec->opts.text_poke) {
4374 err = record__config_text_poke(rec->evlist);
4375 if (err) {
4376 pr_err("record__config_text_poke failed, error %d\n", err);
4377 goto out;
4378 }
4379 }
4380
4381 if (rec->off_cpu) {
4382 err = record__config_off_cpu(rec);
4383 if (err) {
4384 pr_err("record__config_off_cpu failed, error %d\n", err);
4385 goto out;
4386 }
4387 }
4388
4389 if (record_opts__config(&rec->opts)) {
4390 err = -EINVAL;
4391 goto out;
4392 }
4393
4394 err = record__config_tracking_events(rec);
4395 if (err) {
4396 pr_err("record__config_tracking_events failed, error %d\n", err);
4397 goto out;
4398 }
4399
4400 err = record__init_thread_masks(rec);
4401 if (err) {
4402 pr_err("Failed to initialize parallel data streaming masks\n");
4403 goto out;
4404 }
4405
4406 if (rec->opts.nr_cblocks > nr_cblocks_max)
4407 rec->opts.nr_cblocks = nr_cblocks_max;
4408 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4409
4410 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4411 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4412
4413 if (rec->opts.comp_level > comp_level_max)
4414 rec->opts.comp_level = comp_level_max;
4415 pr_debug("comp level: %d\n", rec->opts.comp_level);
4416
4417 err = __cmd_record(&record, argc, argv);
4418 out:
4419 record__free_thread_masks(rec, rec->nr_threads);
4420 rec->nr_threads = 0;
4421 symbol__exit();
4422 auxtrace_record__free(rec->itr);
4423 out_opts:
4424 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4425 evlist__delete(rec->evlist);
4426 return err;
4427 }
4428
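/* SIGUSR2 handler: take an AUX area tracing snapshot and/or switch the output file when --switch-output=signal is used. */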
4429 static void snapshot_sig_handler(int sig __maybe_unused)
4430 {
4431 struct record *rec = &record;
4432
4433 hit_auxtrace_snapshot_trigger(rec);
4434
4435 if (switch_output_signal(rec))
4436 trigger_hit(&switch_output_trigger);
4437 }
4438
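/* SIGALRM handler armed for --switch-output=time[smhd]: switch the output file when the timer fires. */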
4439 static void alarm_sig_handler(int sig __maybe_unused)
4440 {
4441 struct record *rec = &record;
4442
4443 if (switch_output_time(rec))
4444 trigger_hit(&switch_output_trigger);
4445 }
4446