1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85
86 struct switch_output {
87 bool enabled;
88 bool signal;
89 unsigned long size;
90 unsigned long time;
91 const char *str;
92 bool set;
93 char **filenames;
94 int num_files;
95 int cur_file;
96 };
97
98 struct thread_mask {
99 struct mmap_cpu_mask maps;
100 struct mmap_cpu_mask affinity;
101 };
102
103 struct record_thread {
104 pid_t tid;
105 struct thread_mask *mask;
106 struct {
107 int msg[2];
108 int ack[2];
109 } pipes;
110 struct fdarray pollfd;
111 int ctlfd_pos;
112 int nr_mmaps;
113 struct mmap **maps;
114 struct mmap **overwrite_maps;
115 struct record *rec;
116 unsigned long long samples;
117 unsigned long waking;
118 u64 bytes_written;
119 u64 bytes_transferred;
120 u64 bytes_compressed;
121 };
122
123 static __thread struct record_thread *thread;
124
125 enum thread_msg {
126 THREAD_MSG__UNDEFINED = 0,
127 THREAD_MSG__READY,
128 THREAD_MSG__MAX,
129 };
130
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 "UNDEFINED", "READY"
133 };
134
135 enum thread_spec {
136 THREAD_SPEC__UNDEFINED = 0,
137 THREAD_SPEC__CPU,
138 THREAD_SPEC__CORE,
139 THREAD_SPEC__PACKAGE,
140 THREAD_SPEC__NUMA,
141 THREAD_SPEC__USER,
142 THREAD_SPEC__MAX,
143 };
144
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 "undefined", "cpu", "core", "package", "numa", "user"
147 };
148
149 struct pollfd_index_map {
150 int evlist_pollfd_index;
151 int thread_pollfd_index;
152 };
153
154 struct record {
155 struct perf_tool tool;
156 struct record_opts opts;
157 u64 bytes_written;
158 u64 thread_bytes_written;
159 struct perf_data data;
160 struct auxtrace_record *itr;
161 struct evlist *evlist;
162 struct perf_session *session;
163 struct evlist *sb_evlist;
164 pthread_t thread_id;
165 int realtime_prio;
166 bool latency;
167 bool switch_output_event_set;
168 bool no_buildid;
169 bool no_buildid_set;
170 bool no_buildid_cache;
171 bool no_buildid_cache_set;
172 bool buildid_all;
173 bool buildid_mmap;
174 bool buildid_mmap_set;
175 bool timestamp_filename;
176 bool timestamp_boundary;
177 bool off_cpu;
178 const char *filter_action;
179 const char *uid_str;
180 struct switch_output switch_output;
181 unsigned long long samples;
182 unsigned long output_max_size; /* = 0: unlimited */
183 struct perf_debuginfod debuginfod;
184 int nr_threads;
185 struct thread_mask *thread_masks;
186 struct record_thread *thread_data;
187 struct pollfd_index_map *index_map;
188 size_t index_map_sz;
189 size_t index_map_cnt;
190 };
191
192 static volatile int done;
193
194 static volatile int auxtrace_record__snapshot_started;
195 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
196 static DEFINE_TRIGGER(switch_output_trigger);
197
198 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
199 "SYS", "NODE", "CPU"
200 };
201
202 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
203 struct perf_sample *sample, struct machine *machine);
204 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
205 struct perf_sample *sample, struct machine *machine);
206 static int process_timestamp_boundary(const struct perf_tool *tool,
207 union perf_event *event,
208 struct perf_sample *sample,
209 struct machine *machine);
210
211 #ifndef HAVE_GETTID
212 static inline pid_t gettid(void)
213 {
214 return (pid_t)syscall(__NR_gettid);
215 }
216 #endif
217
218 static int record__threads_enabled(struct record *rec)
219 {
220 return rec->opts.threads_spec;
221 }
222
223 static bool switch_output_signal(struct record *rec)
224 {
225 return rec->switch_output.signal &&
226 trigger_is_ready(&switch_output_trigger);
227 }
228
229 static bool switch_output_size(struct record *rec)
230 {
231 return rec->switch_output.size &&
232 trigger_is_ready(&switch_output_trigger) &&
233 (rec->bytes_written >= rec->switch_output.size);
234 }
235
236 static bool switch_output_time(struct record *rec)
237 {
238 return rec->switch_output.time &&
239 trigger_is_ready(&switch_output_trigger);
240 }
241
242 static u64 record__bytes_written(struct record *rec)
243 {
244 return rec->bytes_written + rec->thread_bytes_written;
245 }
246
247 static bool record__output_max_size_exceeded(struct record *rec)
248 {
249 return rec->output_max_size &&
250 (record__bytes_written(rec) >= rec->output_max_size);
251 }
252
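/*
 * Write a block of data to the trace output: to the per-mmap file when the
 * map has its own file (threaded/directory mode), otherwise to the session's
 * perf.data file. Byte counters are updated, the session is stopped once the
 * --max-size limit is exceeded, and the switch-output-on-size trigger is hit.
 */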
253 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
254 void *bf, size_t size)
255 {
256 struct perf_data_file *file = &rec->session->data->file;
257
258 if (map && map->file)
259 file = map->file;
260
261 if (perf_data_file__write(file, bf, size) < 0) {
262 pr_err("failed to write perf data, error: %m\n");
263 return -1;
264 }
265
266 if (map && map->file) {
267 thread->bytes_written += size;
268 rec->thread_bytes_written += size;
269 } else {
270 rec->bytes_written += size;
271 }
272
273 if (record__output_max_size_exceeded(rec) && !done) {
274 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
275 " stopping session ]\n",
276 record__bytes_written(rec) >> 10);
277 done = 1;
278 }
279
280 if (switch_output_size(rec))
281 trigger_hit(&switch_output_trigger);
282
283 return 0;
284 }
285
286 static int record__aio_enabled(struct record *rec);
287 static int record__comp_enabled(struct record *rec);
288 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
289 void *dst, size_t dst_size, void *src, size_t src_size);
290
291 #ifdef HAVE_AIO_SUPPORT
292 static int record__aio_write(struct aiocb *cblock, int trace_fd,
293 void *buf, size_t size, off_t off)
294 {
295 int rc;
296
297 cblock->aio_fildes = trace_fd;
298 cblock->aio_buf = buf;
299 cblock->aio_nbytes = size;
300 cblock->aio_offset = off;
301 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
302
303 do {
304 rc = aio_write(cblock);
305 if (rc == 0) {
306 break;
307 } else if (errno != EAGAIN) {
308 cblock->aio_fildes = -1;
309 pr_err("failed to queue perf data, error: %m\n");
310 break;
311 }
312 } while (1);
313
314 return rc;
315 }
316
317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
318 {
319 void *rem_buf;
320 off_t rem_off;
321 size_t rem_size;
322 int rc, aio_errno;
323 ssize_t aio_ret, written;
324
325 aio_errno = aio_error(cblock);
326 if (aio_errno == EINPROGRESS)
327 return 0;
328
329 written = aio_ret = aio_return(cblock);
330 if (aio_ret < 0) {
331 if (aio_errno != EINTR)
332 pr_err("failed to write perf data, error: %m\n");
333 written = 0;
334 }
335
336 rem_size = cblock->aio_nbytes - written;
337
338 if (rem_size == 0) {
339 cblock->aio_fildes = -1;
340 /*
341 * md->refcount is incremented in record__aio_pushfn() for
342 * every aio write request started in record__aio_push() so
343 * decrement it because the request is now complete.
344 */
345 perf_mmap__put(&md->core);
346 rc = 1;
347 } else {
348 /*
349 * The aio write request may require a restart with the
350 * remainder if the kernel didn't write the whole
351 * chunk at once.
352 */
353 rem_off = cblock->aio_offset + written;
354 rem_buf = (void *)(cblock->aio_buf + written);
355 record__aio_write(cblock, cblock->aio_fildes,
356 rem_buf, rem_size, rem_off);
357 rc = 0;
358 }
359
360 return rc;
361 }
362
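/*
 * Wait for in-flight aio writes on this mmap. With sync_all == false, return
 * the index of the first control block that is free for reuse; with
 * sync_all == true, keep suspending until every outstanding request has
 * completed and then return -1.
 */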
363 static int record__aio_sync(struct mmap *md, bool sync_all)
364 {
365 struct aiocb **aiocb = md->aio.aiocb;
366 struct aiocb *cblocks = md->aio.cblocks;
367 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
368 int i, do_suspend;
369
370 do {
371 do_suspend = 0;
372 for (i = 0; i < md->aio.nr_cblocks; ++i) {
373 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
374 if (sync_all)
375 aiocb[i] = NULL;
376 else
377 return i;
378 } else {
379 /*
380 * The started aio write is not complete yet,
381 * so it has to be waited on before the
382 * next allocation.
383 */
384 aiocb[i] = &cblocks[i];
385 do_suspend = 1;
386 }
387 }
388 if (!do_suspend)
389 return -1;
390
391 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
392 if (!(errno == EAGAIN || errno == EINTR))
393 pr_err("failed to sync perf data, error: %m\n");
394 }
395 } while (1);
396 }
397
398 struct record_aio {
399 struct record *rec;
400 void *data;
401 size_t size;
402 };
403
404 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
405 {
406 struct record_aio *aio = to;
407
408 /*
409 * The map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
410 * to release space in the kernel buffer as fast as possible, calling
411 * perf_mmap__consume() from perf_mmap__push() function.
412 *
413 * That lets the kernel proceed with storing more profiling data into
414 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
415 *
416 * Copying can be done in two steps in case the chunk of profiling data
417 * crosses the upper bound of the kernel buffer. In this case we first move
418 * part of data from map->start till the upper bound and then the remainder
419 * from the beginning of the kernel buffer till the end of the data chunk.
420 */
421
422 if (record__comp_enabled(aio->rec)) {
423 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
424 mmap__mmap_len(map) - aio->size,
425 buf, size);
426 if (compressed < 0)
427 return (int)compressed;
428
429 size = compressed;
430 } else {
431 memcpy(aio->data + aio->size, buf, size);
432 }
433
434 if (!aio->size) {
435 /*
436 * Increment map->refcount to guard map->aio.data[] buffer
437 * from premature deallocation because map object can be
438 * released earlier than aio write request started on
439 * map->aio.data[] buffer is complete.
440 *
441 * perf_mmap__put() is done at record__aio_complete()
442 * after started aio request completion or at record__aio_push()
443 * if the request failed to start.
444 */
445 perf_mmap__get(&map->core);
446 }
447
448 aio->size += size;
449
450 return size;
451 }
452
453 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
454 {
455 int ret, idx;
456 int trace_fd = rec->session->data->file.fd;
457 struct record_aio aio = { .rec = rec, .size = 0 };
458
459 /*
460 * Call record__aio_sync() to wait till map->aio.data[] buffer
461 * becomes available after previous aio write operation.
462 */
463
464 idx = record__aio_sync(map, false);
465 aio.data = map->aio.data[idx];
466 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
467 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
468 return ret;
469
470 rec->samples++;
471 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
472 if (!ret) {
473 *off += aio.size;
474 rec->bytes_written += aio.size;
475 if (switch_output_size(rec))
476 trigger_hit(&switch_output_trigger);
477 } else {
478 /*
479 * Decrement map->refcount incremented in record__aio_pushfn()
480 * back if record__aio_write() operation failed to start, otherwise
481 * map->refcount is decremented in record__aio_complete() after
482 * aio write operation finishes successfully.
483 */
484 perf_mmap__put(&map->core);
485 }
486
487 return ret;
488 }
489
490 static off_t record__aio_get_pos(int trace_fd)
491 {
492 return lseek(trace_fd, 0, SEEK_CUR);
493 }
494
495 static void record__aio_set_pos(int trace_fd, off_t pos)
496 {
497 lseek(trace_fd, pos, SEEK_SET);
498 }
499
500 static void record__aio_mmap_read_sync(struct record *rec)
501 {
502 int i;
503 struct evlist *evlist = rec->evlist;
504 struct mmap *maps = evlist->mmap;
505
506 if (!record__aio_enabled(rec))
507 return;
508
509 for (i = 0; i < evlist->core.nr_mmaps; i++) {
510 struct mmap *map = &maps[i];
511
512 if (map->core.base)
513 record__aio_sync(map, true);
514 }
515 }
516
517 static int nr_cblocks_default = 1;
518 static int nr_cblocks_max = 4;
519
520 static int record__aio_parse(const struct option *opt,
521 const char *str,
522 int unset)
523 {
524 struct record_opts *opts = (struct record_opts *)opt->value;
525
526 if (unset) {
527 opts->nr_cblocks = 0;
528 } else {
529 if (str)
530 opts->nr_cblocks = strtol(str, NULL, 0);
531 if (!opts->nr_cblocks)
532 opts->nr_cblocks = nr_cblocks_default;
533 }
534
535 return 0;
536 }
537 #else /* HAVE_AIO_SUPPORT */
538 static int nr_cblocks_max = 0;
539
540 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
541 off_t *off __maybe_unused)
542 {
543 return -1;
544 }
545
546 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
547 {
548 return -1;
549 }
550
551 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
552 {
553 }
554
555 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
556 {
557 }
558 #endif
559
560 static int record__aio_enabled(struct record *rec)
561 {
562 return rec->opts.nr_cblocks > 0;
563 }
564
565 #define MMAP_FLUSH_DEFAULT 1
566 static int record__mmap_flush_parse(const struct option *opt,
567 const char *str,
568 int unset)
569 {
570 int flush_max;
571 struct record_opts *opts = (struct record_opts *)opt->value;
572 static struct parse_tag tags[] = {
573 { .tag = 'B', .mult = 1 },
574 { .tag = 'K', .mult = 1 << 10 },
575 { .tag = 'M', .mult = 1 << 20 },
576 { .tag = 'G', .mult = 1 << 30 },
577 { .tag = 0 },
578 };
579
580 if (unset)
581 return 0;
582
583 if (str) {
584 opts->mmap_flush = parse_tag_value(str, tags);
585 if (opts->mmap_flush == (int)-1)
586 opts->mmap_flush = strtol(str, NULL, 0);
587 }
588
589 if (!opts->mmap_flush)
590 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
591
592 flush_max = evlist__mmap_size(opts->mmap_pages);
593 flush_max /= 4;
594 if (opts->mmap_flush > flush_max)
595 opts->mmap_flush = flush_max;
596
597 return 0;
598 }
599
600 #ifdef HAVE_ZSTD_SUPPORT
601 static unsigned int comp_level_default = 1;
602
603 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
604 {
605 struct record_opts *opts = opt->value;
606
607 if (unset) {
608 opts->comp_level = 0;
609 } else {
610 if (str)
611 opts->comp_level = strtol(str, NULL, 0);
612 if (!opts->comp_level)
613 opts->comp_level = comp_level_default;
614 }
615
616 return 0;
617 }
618 #endif
619 static unsigned int comp_level_max = 22;
620
621 static int record__comp_enabled(struct record *rec)
622 {
623 return rec->opts.comp_level > 0;
624 }
625
626 static int process_synthesized_event(const struct perf_tool *tool,
627 union perf_event *event,
628 struct perf_sample *sample __maybe_unused,
629 struct machine *machine __maybe_unused)
630 {
631 struct record *rec = container_of(tool, struct record, tool);
632 return record__write(rec, NULL, event, event->header.size);
633 }
634
635 static struct mutex synth_lock;
636
637 static int process_locked_synthesized_event(const struct perf_tool *tool,
638 union perf_event *event,
639 struct perf_sample *sample __maybe_unused,
640 struct machine *machine __maybe_unused)
641 {
642 int ret;
643
644 mutex_lock(&synth_lock);
645 ret = process_synthesized_event(tool, event, sample, machine);
646 mutex_unlock(&synth_lock);
647 return ret;
648 }
649
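/*
 * Push callback for perf_mmap__push(): when compression is enabled, the data
 * is zstd-compressed into a PERF_RECORD_COMPRESSED2 event whose header size
 * is padded to 8-byte alignment before being written out; otherwise the raw
 * chunk is written as-is.
 */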
650 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
651 {
652 struct record *rec = to;
653
654 if (record__comp_enabled(rec)) {
655 struct perf_record_compressed2 *event = map->data;
656 size_t padding = 0;
657 u8 pad[8] = {0};
658 ssize_t compressed = zstd_compress(rec->session, map, map->data,
659 mmap__mmap_len(map), bf, size);
660
661 if (compressed < 0)
662 return (int)compressed;
663
664 bf = event;
665 thread->samples++;
666
667 /*
668 * The record from `zstd_compress` is not 8-byte aligned, which would cause an ASan
669 * error. We make it aligned here.
670 */
671 event->data_size = compressed - sizeof(struct perf_record_compressed2);
672 event->header.size = PERF_ALIGN(compressed, sizeof(u64));
673 padding = event->header.size - compressed;
674 return record__write(rec, map, bf, compressed) ||
675 record__write(rec, map, &pad, padding);
676 }
677
678 thread->samples++;
679 return record__write(rec, map, bf, size);
680 }
681
682 static volatile sig_atomic_t signr = -1;
683 static volatile sig_atomic_t child_finished;
684 #ifdef HAVE_EVENTFD_SUPPORT
685 static volatile sig_atomic_t done_fd = -1;
686 #endif
687
688 static void sig_handler(int sig)
689 {
690 if (sig == SIGCHLD)
691 child_finished = 1;
692 else
693 signr = sig;
694
695 done = 1;
696 #ifdef HAVE_EVENTFD_SUPPORT
697 if (done_fd >= 0) {
698 u64 tmp = 1;
699 int orig_errno = errno;
700
701 /*
702 * It is possible for this signal handler to run after done is
703 * checked in the main loop, but before the perf counter fds are
704 * polled. If this happens, the poll() will continue to wait
705 * even though done is set, and will only break out if either
706 * another signal is received, or the counters are ready for
707 * read. To ensure the poll() doesn't sleep when done is set,
708 * use an eventfd (done_fd) to wake up the poll().
709 */
710 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
711 pr_err("failed to signal wakeup fd, error: %m\n");
712
713 errno = orig_errno;
714 }
715 #endif // HAVE_EVENTFD_SUPPORT
716 }
717
718 static void sigsegv_handler(int sig)
719 {
720 perf_hooks__recover();
721 sighandler_dump_stack(sig);
722 }
723
724 static void record__sig_exit(void)
725 {
726 if (signr == -1)
727 return;
728
729 signal(signr, SIG_DFL);
730 raise(signr);
731 }
732
733 static int record__process_auxtrace(const struct perf_tool *tool,
734 struct mmap *map,
735 union perf_event *event, void *data1,
736 size_t len1, void *data2, size_t len2)
737 {
738 struct record *rec = container_of(tool, struct record, tool);
739 struct perf_data *data = &rec->data;
740 size_t padding;
741 u8 pad[8] = {0};
742
743 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
744 off_t file_offset;
745 int fd = perf_data__fd(data);
746 int err;
747
748 file_offset = lseek(fd, 0, SEEK_CUR);
749 if (file_offset == -1)
750 return -1;
751 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
752 event, file_offset);
753 if (err)
754 return err;
755 }
756
757 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
758 padding = (len1 + len2) & 7;
759 if (padding)
760 padding = 8 - padding;
761
762 record__write(rec, map, event, event->header.size);
763 record__write(rec, map, data1, len1);
764 if (len2)
765 record__write(rec, map, data2, len2);
766 record__write(rec, map, &pad, padding);
767
768 return 0;
769 }
770
771 static int record__auxtrace_mmap_read(struct record *rec,
772 struct mmap *map)
773 {
774 int ret;
775
776 ret = auxtrace_mmap__read(map, rec->itr,
777 perf_session__env(rec->session),
778 &rec->tool,
779 record__process_auxtrace);
780 if (ret < 0)
781 return ret;
782
783 if (ret)
784 rec->samples++;
785
786 return 0;
787 }
788
789 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
790 struct mmap *map)
791 {
792 int ret;
793
794 ret = auxtrace_mmap__read_snapshot(map, rec->itr,
795 perf_session__env(rec->session),
796 &rec->tool,
797 record__process_auxtrace,
798 rec->opts.auxtrace_snapshot_size);
799 if (ret < 0)
800 return ret;
801
802 if (ret)
803 rec->samples++;
804
805 return 0;
806 }
807
808 static int record__auxtrace_read_snapshot_all(struct record *rec)
809 {
810 int i;
811 int rc = 0;
812
813 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
814 struct mmap *map = &rec->evlist->mmap[i];
815
816 if (!map->auxtrace_mmap.base)
817 continue;
818
819 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
820 rc = -1;
821 goto out;
822 }
823 }
824 out:
825 return rc;
826 }
827
828 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
829 {
830 pr_debug("Recording AUX area tracing snapshot\n");
831 if (record__auxtrace_read_snapshot_all(rec) < 0) {
832 trigger_error(&auxtrace_snapshot_trigger);
833 } else {
834 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
835 trigger_error(&auxtrace_snapshot_trigger);
836 else
837 trigger_ready(&auxtrace_snapshot_trigger);
838 }
839 }
840
841 static int record__auxtrace_snapshot_exit(struct record *rec)
842 {
843 if (trigger_is_error(&auxtrace_snapshot_trigger))
844 return 0;
845
846 if (!auxtrace_record__snapshot_started &&
847 auxtrace_record__snapshot_start(rec->itr))
848 return -1;
849
850 record__read_auxtrace_snapshot(rec, true);
851 if (trigger_is_error(&auxtrace_snapshot_trigger))
852 return -1;
853
854 return 0;
855 }
856
857 static int record__auxtrace_init(struct record *rec)
858 {
859 int err;
860
861 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
862 && record__threads_enabled(rec)) {
863 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
864 return -EINVAL;
865 }
866
867 if (!rec->itr) {
868 rec->itr = auxtrace_record__init(rec->evlist, &err);
869 if (err)
870 return err;
871 }
872
873 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
874 rec->opts.auxtrace_snapshot_opts);
875 if (err)
876 return err;
877
878 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
879 rec->opts.auxtrace_sample_opts);
880 if (err)
881 return err;
882
883 err = auxtrace_parse_aux_action(rec->evlist);
884 if (err)
885 return err;
886
887 return auxtrace_parse_filters(rec->evlist);
888 }
889
890 static int record__config_text_poke(struct evlist *evlist)
891 {
892 struct evsel *evsel;
893
894 /* Nothing to do if text poke is already configured */
895 evlist__for_each_entry(evlist, evsel) {
896 if (evsel->core.attr.text_poke)
897 return 0;
898 }
899
900 evsel = evlist__add_dummy_on_all_cpus(evlist);
901 if (!evsel)
902 return -ENOMEM;
903
904 evsel->core.attr.text_poke = 1;
905 evsel->core.attr.ksymbol = 1;
906 evsel->immediate = true;
907 evsel__set_sample_bit(evsel, TIME);
908
909 return 0;
910 }
911
912 static int record__config_off_cpu(struct record *rec)
913 {
914 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
915 }
916
917 static bool record__tracking_system_wide(struct record *rec)
918 {
919 struct evlist *evlist = rec->evlist;
920 struct evsel *evsel;
921
922 /*
923 * If a non-dummy evsel exists, system_wide sideband is needed to
924 * help parse sample information.
925 * For example, PERF_EVENT_MMAP event to help parse symbol,
926 * and PERF_EVENT_COMM event to help parse task executable name.
927 */
928 evlist__for_each_entry(evlist, evsel) {
929 if (!evsel__is_dummy_event(evsel))
930 return true;
931 }
932
933 return false;
934 }
935
936 static int record__config_tracking_events(struct record *rec)
937 {
938 struct record_opts *opts = &rec->opts;
939 struct evlist *evlist = rec->evlist;
940 bool system_wide = false;
941 struct evsel *evsel;
942
943 /*
944 * For initial_delay, system wide, or a hybrid system, we need to add a
945 * tracking event so that we can track PERF_RECORD_MMAP to cover the
946 * delay of waiting or event synthesis.
947 */
948 if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
949 perf_pmus__num_core_pmus() > 1) {
950 /*
951 * User space tasks can migrate between CPUs, so when tracing
952 * selected CPUs, sideband for all CPUs is still needed.
953 */
954 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
955 system_wide = true;
956
957 evsel = evlist__findnew_tracking_event(evlist, system_wide);
958 if (!evsel)
959 return -ENOMEM;
960
961 /*
962 * Enable the tracking event when the process is forked for
963 * initial_delay, immediately for system wide.
964 */
965 if (opts->target.initial_delay && !evsel->immediate &&
966 !target__has_cpu(&opts->target))
967 evsel->core.attr.enable_on_exec = 1;
968 else
969 evsel->immediate = 1;
970 }
971
972 return 0;
973 }
974
975 static bool record__kcore_readable(struct machine *machine)
976 {
977 char kcore[PATH_MAX];
978 int fd;
979
980 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
981
982 fd = open(kcore, O_RDONLY);
983 if (fd < 0)
984 return false;
985
986 close(fd);
987
988 return true;
989 }
990
991 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
992 {
993 char from_dir[PATH_MAX];
994 char kcore_dir[PATH_MAX];
995 int ret;
996
997 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
998
999 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1000 if (ret)
1001 return ret;
1002
1003 return kcore_copy(from_dir, kcore_dir);
1004 }
1005
1006 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1007 {
1008 thread_data->pipes.msg[0] = -1;
1009 thread_data->pipes.msg[1] = -1;
1010 thread_data->pipes.ack[0] = -1;
1011 thread_data->pipes.ack[1] = -1;
1012 }
1013
1014 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1015 {
1016 if (pipe(thread_data->pipes.msg))
1017 return -EINVAL;
1018
1019 if (pipe(thread_data->pipes.ack)) {
1020 close(thread_data->pipes.msg[0]);
1021 thread_data->pipes.msg[0] = -1;
1022 close(thread_data->pipes.msg[1]);
1023 thread_data->pipes.msg[1] = -1;
1024 return -EINVAL;
1025 }
1026
1027 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1028 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1029 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1030
1031 return 0;
1032 }
1033
1034 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1035 {
1036 if (thread_data->pipes.msg[0] != -1) {
1037 close(thread_data->pipes.msg[0]);
1038 thread_data->pipes.msg[0] = -1;
1039 }
1040 if (thread_data->pipes.msg[1] != -1) {
1041 close(thread_data->pipes.msg[1]);
1042 thread_data->pipes.msg[1] = -1;
1043 }
1044 if (thread_data->pipes.ack[0] != -1) {
1045 close(thread_data->pipes.ack[0]);
1046 thread_data->pipes.ack[0] = -1;
1047 }
1048 if (thread_data->pipes.ack[1] != -1) {
1049 close(thread_data->pipes.ack[1]);
1050 thread_data->pipes.ack[1] = -1;
1051 }
1052 }
1053
1054 static bool evlist__per_thread(struct evlist *evlist)
1055 {
1056 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1057 }
1058
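/*
 * Distribute the evlist mmaps to this thread: in per-thread mode the thread
 * gets all of them, otherwise only the mmaps whose CPU is set in the
 * thread's maps mask.
 */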
1059 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1060 {
1061 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1062 struct mmap *mmap = evlist->mmap;
1063 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1064 struct perf_cpu_map *cpus = evlist->core.all_cpus;
1065 bool per_thread = evlist__per_thread(evlist);
1066
1067 if (per_thread)
1068 thread_data->nr_mmaps = nr_mmaps;
1069 else
1070 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1071 thread_data->mask->maps.nbits);
1072 if (mmap) {
1073 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1074 if (!thread_data->maps)
1075 return -ENOMEM;
1076 }
1077 if (overwrite_mmap) {
1078 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1079 if (!thread_data->overwrite_maps) {
1080 zfree(&thread_data->maps);
1081 return -ENOMEM;
1082 }
1083 }
1084 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1085 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1086
1087 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1088 if (per_thread ||
1089 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1090 if (thread_data->maps) {
1091 thread_data->maps[tm] = &mmap[m];
1092 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1093 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1094 }
1095 if (thread_data->overwrite_maps) {
1096 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1097 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1098 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1099 }
1100 tm++;
1101 }
1102 }
1103
1104 return 0;
1105 }
1106
1107 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1108 {
1109 int f, tm, pos;
1110 struct mmap *map, *overwrite_map;
1111
1112 fdarray__init(&thread_data->pollfd, 64);
1113
1114 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1115 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1116 overwrite_map = thread_data->overwrite_maps ?
1117 thread_data->overwrite_maps[tm] : NULL;
1118
1119 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1120 void *ptr = evlist->core.pollfd.priv[f].ptr;
1121
1122 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1123 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1124 &evlist->core.pollfd);
1125 if (pos < 0)
1126 return pos;
1127 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1128 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1129 }
1130 }
1131 }
1132
1133 return 0;
1134 }
1135
1136 static void record__free_thread_data(struct record *rec)
1137 {
1138 int t;
1139 struct record_thread *thread_data = rec->thread_data;
1140
1141 if (thread_data == NULL)
1142 return;
1143
1144 for (t = 0; t < rec->nr_threads; t++) {
1145 record__thread_data_close_pipes(&thread_data[t]);
1146 zfree(&thread_data[t].maps);
1147 zfree(&thread_data[t].overwrite_maps);
1148 fdarray__exit(&thread_data[t].pollfd);
1149 }
1150
1151 zfree(&rec->thread_data);
1152 }
1153
1154 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1155 int evlist_pollfd_index,
1156 int thread_pollfd_index)
1157 {
1158 size_t x = rec->index_map_cnt;
1159
1160 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1161 return -ENOMEM;
1162 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1163 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1164 rec->index_map_cnt += 1;
1165 return 0;
1166 }
1167
1168 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1169 struct evlist *evlist,
1170 struct record_thread *thread_data)
1171 {
1172 struct pollfd *e_entries = evlist->core.pollfd.entries;
1173 struct pollfd *t_entries = thread_data->pollfd.entries;
1174 int err = 0;
1175 size_t i;
1176
1177 for (i = 0; i < rec->index_map_cnt; i++) {
1178 int e_pos = rec->index_map[i].evlist_pollfd_index;
1179 int t_pos = rec->index_map[i].thread_pollfd_index;
1180
1181 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1182 e_entries[e_pos].events != t_entries[t_pos].events) {
1183 pr_err("Thread and evlist pollfd index mismatch\n");
1184 err = -EINVAL;
1185 continue;
1186 }
1187 e_entries[e_pos].revents = t_entries[t_pos].revents;
1188 }
1189 return err;
1190 }
1191
1192 static int record__dup_non_perf_events(struct record *rec,
1193 struct evlist *evlist,
1194 struct record_thread *thread_data)
1195 {
1196 struct fdarray *fda = &evlist->core.pollfd;
1197 int i, ret;
1198
1199 for (i = 0; i < fda->nr; i++) {
1200 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1201 continue;
1202 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1203 if (ret < 0) {
1204 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1205 return ret;
1206 }
1207 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1208 thread_data, ret, fda->entries[i].fd);
1209 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1210 if (ret < 0) {
1211 pr_err("Failed to map thread and evlist pollfd indexes\n");
1212 return ret;
1213 }
1214 }
1215 return 0;
1216 }
1217
1218 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1219 {
1220 int t, ret;
1221 struct record_thread *thread_data;
1222
1223 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1224 if (!rec->thread_data) {
1225 pr_err("Failed to allocate thread data\n");
1226 return -ENOMEM;
1227 }
1228 thread_data = rec->thread_data;
1229
1230 for (t = 0; t < rec->nr_threads; t++)
1231 record__thread_data_init_pipes(&thread_data[t]);
1232
1233 for (t = 0; t < rec->nr_threads; t++) {
1234 thread_data[t].rec = rec;
1235 thread_data[t].mask = &rec->thread_masks[t];
1236 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1237 if (ret) {
1238 pr_err("Failed to initialize thread[%d] maps\n", t);
1239 goto out_free;
1240 }
1241 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1242 if (ret) {
1243 pr_err("Failed to initialize thread[%d] pollfd\n", t);
1244 goto out_free;
1245 }
1246 if (t) {
1247 thread_data[t].tid = -1;
1248 ret = record__thread_data_open_pipes(&thread_data[t]);
1249 if (ret) {
1250 pr_err("Failed to open thread[%d] communication pipes\n", t);
1251 goto out_free;
1252 }
1253 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1254 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1255 if (ret < 0) {
1256 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1257 goto out_free;
1258 }
1259 thread_data[t].ctlfd_pos = ret;
1260 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1261 thread_data, thread_data[t].ctlfd_pos,
1262 thread_data[t].pipes.msg[0]);
1263 } else {
1264 thread_data[t].tid = gettid();
1265
1266 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1267 if (ret < 0)
1268 goto out_free;
1269
1270 thread_data[t].ctlfd_pos = -1; /* Not used */
1271 }
1272 }
1273
1274 return 0;
1275
1276 out_free:
1277 record__free_thread_data(rec);
1278
1279 return ret;
1280 }
1281
1282 static int record__mmap_evlist(struct record *rec,
1283 struct evlist *evlist)
1284 {
1285 int i, ret;
1286 struct record_opts *opts = &rec->opts;
1287 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1288 opts->auxtrace_sample_mode;
1289
1290 if (opts->affinity != PERF_AFFINITY_SYS)
1291 cpu__setup_cpunode_map();
1292
1293 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1294 opts->auxtrace_mmap_pages,
1295 auxtrace_overwrite,
1296 opts->nr_cblocks, opts->affinity,
1297 opts->mmap_flush, opts->comp_level) < 0) {
1298 if (errno == EPERM) {
1299 pr_err("Permission error mapping pages.\n"
1300 "Consider increasing "
1301 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1302 "or try again with a smaller value of -m/--mmap_pages.\n"
1303 "(current value: %u,%u)\n",
1304 opts->mmap_pages, opts->auxtrace_mmap_pages);
1305 return -errno;
1306 } else {
1307 pr_err("failed to mmap: %m\n");
1308 if (errno)
1309 return -errno;
1310 else
1311 return -EINVAL;
1312 }
1313 }
1314
1315 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1316 return -1;
1317
1318 ret = record__alloc_thread_data(rec, evlist);
1319 if (ret)
1320 return ret;
1321
1322 if (record__threads_enabled(rec)) {
1323 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1324 if (ret) {
1325 errno = -ret;
1326 pr_err("Failed to create data directory: %m\n");
1327 return ret;
1328 }
1329 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1330 if (evlist->mmap)
1331 evlist->mmap[i].file = &rec->data.dir.files[i];
1332 if (evlist->overwrite_mmap)
1333 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1334 }
1335 }
1336
1337 return 0;
1338 }
1339
1340 static int record__mmap(struct record *rec)
1341 {
1342 return record__mmap_evlist(rec, rec->evlist);
1343 }
1344
1345 static int record__open(struct record *rec)
1346 {
1347 char msg[BUFSIZ];
1348 struct evsel *pos;
1349 struct evlist *evlist = rec->evlist;
1350 struct perf_session *session = rec->session;
1351 struct record_opts *opts = &rec->opts;
1352 int rc = 0;
1353 bool skipped = false;
1354 bool removed_tracking = false;
1355
1356 evlist__for_each_entry(evlist, pos) {
1357 if (removed_tracking) {
1358 /*
1359 * Normally the head of the list has tracking enabled
1360 * for sideband data like mmaps. If this event is
1361 * removed, make sure to add tracking to the next
1362 * processed event.
1363 */
1364 if (!pos->tracking) {
1365 pos->tracking = true;
1366 evsel__config(pos, opts, &callchain_param);
1367 }
1368 removed_tracking = false;
1369 }
1370 try_again:
1371 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1372 bool report_error = true;
1373
1374 if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1375 if (verbose > 0)
1376 ui__warning("%s\n", msg);
1377 goto try_again;
1378 }
1379 if ((errno == EINVAL || errno == EBADF) &&
1380 pos->core.leader != &pos->core &&
1381 pos->weak_group) {
1382 pos = evlist__reset_weak_group(evlist, pos, true);
1383 goto try_again;
1384 }
1385 #if defined(__aarch64__) || defined(__arm__)
1386 if (strstr(evsel__name(pos), "cycles")) {
1387 struct evsel *pos2;
1388 /*
1389 * Unfortunately ARM has many events named
1390 * "cycles" on PMUs like the system-level (L3)
1391 * cache which don't support sampling. Only
1392 * display such failures to open when there is
1393 * only 1 cycles event or verbose is enabled.
1394 */
1395 evlist__for_each_entry(evlist, pos2) {
1396 if (pos2 == pos)
1397 continue;
1398 if (strstr(evsel__name(pos2), "cycles")) {
1399 report_error = false;
1400 break;
1401 }
1402 }
1403 }
1404 #endif
1405 if (report_error || verbose > 0) {
1406 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1407 ui__error("Failure to open event '%s' on PMU '%s' which will be "
1408 "removed.\n%s\n",
1409 evsel__name(pos), evsel__pmu_name(pos), msg);
1410 }
1411 if (pos->tracking)
1412 removed_tracking = true;
1413 pos->skippable = true;
1414 skipped = true;
1415 }
1416 }
1417
1418 if (skipped) {
1419 struct evsel *tmp;
1420 int idx = 0;
1421 bool evlist_empty = true;
1422
1423 /* Remove evsels that failed to open and update indices. */
1424 evlist__for_each_entry_safe(evlist, tmp, pos) {
1425 if (pos->skippable) {
1426 evlist__remove(evlist, pos);
1427 continue;
1428 }
1429
1430 /*
1431 * Note, dummy events may be command line parsed or
1432 * added by the tool. We care about supporting `perf
1433 * record -e dummy` which may be used as a permission
1434 * check. Dummy events that are added to the command
1435 * line and opened along with other events that fail,
1436 * will still fail as if the dummy events were tool
1437 * added events for the sake of code simplicity.
1438 */
1439 if (!evsel__is_dummy_event(pos))
1440 evlist_empty = false;
1441 }
1442 evlist__for_each_entry(evlist, pos) {
1443 pos->core.idx = idx++;
1444 }
1445 /* If list is empty then fail. */
1446 if (evlist_empty) {
1447 ui__error("Failure to open any events for recording.\n");
1448 rc = -1;
1449 goto out;
1450 }
1451 }
1452 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1453 pr_warning(
1454 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1455 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1456 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1457 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1458 "Samples in kernel modules won't be resolved at all.\n\n"
1459 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1460 "even with a suitable vmlinux or kallsyms file.\n\n");
1461 }
1462
1463 if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1464 pr_err("failed to set filter \"%s\" on event %s: %m\n",
1465 pos->filter ?: "BPF", evsel__name(pos));
1466 rc = -1;
1467 goto out;
1468 }
1469
1470 rc = record__mmap(rec);
1471 if (rc)
1472 goto out;
1473
1474 session->evlist = evlist;
1475 perf_session__set_id_hdr_size(session);
1476 out:
1477 return rc;
1478 }
1479
1480 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1481 {
1482 if (rec->evlist->first_sample_time == 0)
1483 rec->evlist->first_sample_time = sample_time;
1484
1485 if (sample_time)
1486 rec->evlist->last_sample_time = sample_time;
1487 }
1488
1489 static int process_sample_event(const struct perf_tool *tool,
1490 union perf_event *event,
1491 struct perf_sample *sample,
1492 struct evsel *evsel,
1493 struct machine *machine)
1494 {
1495 struct record *rec = container_of(tool, struct record, tool);
1496
1497 set_timestamp_boundary(rec, sample->time);
1498
1499 if (rec->buildid_all)
1500 return 0;
1501
1502 rec->samples++;
1503 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1504 }
1505
1506 static int process_buildids(struct record *rec)
1507 {
1508 struct perf_session *session = rec->session;
1509
1510 if (perf_data__size(&rec->data) == 0)
1511 return 0;
1512
1513 /* A single DSO is needed and not all inline frames. */
1514 symbol_conf.inline_name = false;
1515 /*
1516 * During this process, it'll load kernel map and replace the
1517 * dso->long_name to a real pathname it found. In this case
1518 * we prefer the vmlinux path like
1519 * /lib/modules/3.16.4/build/vmlinux
1520 *
1521 * rather than build-id path (in debug directory).
1522 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1523 */
1524 symbol_conf.ignore_vmlinux_buildid = true;
1525 /*
1526 * If --buildid-all is given, it marks all DSOs regardless of hits,
1527 * so no need to process samples. But if timestamp_boundary is enabled,
1528 * it still needs to walk on all samples to get the timestamps of
1529 * first/last samples.
1530 */
1531 if (rec->buildid_all && !rec->timestamp_boundary)
1532 rec->tool.sample = process_event_sample_stub;
1533
1534 return perf_session__process_events(session);
1535 }
1536
1537 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1538 {
1539 int err;
1540 struct perf_tool *tool = data;
1541 /*
1542 * For the guest kernel, when processing the record & report subcommands,
1543 * we arrange the module mmap prior to the guest kernel mmap and trigger
1544 * a dso preload, because by default guest module symbols are loaded
1545 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1546 * method is used to avoid missing symbols when the first address is
1547 * in a module instead of in the guest kernel.
1548 */
1549 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1550 machine);
1551 if (err < 0)
1552 pr_err("Couldn't record guest kernel [%d]'s reference"
1553 " relocation symbol.\n", machine->pid);
1554
1555 /*
1556 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1557 * have no _text sometimes.
1558 */
1559 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1560 machine);
1561 if (err < 0)
1562 pr_err("Couldn't record guest kernel [%d]'s reference"
1563 " relocation symbol.\n", machine->pid);
1564 }
1565
1566 static struct perf_event_header finished_round_event = {
1567 .size = sizeof(struct perf_event_header),
1568 .type = PERF_RECORD_FINISHED_ROUND,
1569 };
1570
1571 static struct perf_event_header finished_init_event = {
1572 .size = sizeof(struct perf_event_header),
1573 .type = PERF_RECORD_FINISHED_INIT,
1574 };
1575
1576 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1577 {
1578 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1579 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1580 thread->mask->affinity.nbits)) {
1581 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1582 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1583 map->affinity_mask.bits, thread->mask->affinity.nbits);
1584 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1585 (cpu_set_t *)thread->mask->affinity.bits);
1586 if (verbose == 2) {
1587 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1588 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1589 }
1590 }
1591 }
1592
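/*
 * Header callback for zstd_compress_stream_to_records(): with increment == 0
 * it initializes a PERF_RECORD_COMPRESSED2 header and returns its size,
 * otherwise it grows header.size by the compressed increment just produced.
 */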
1593 static size_t process_comp_header(void *record, size_t increment)
1594 {
1595 struct perf_record_compressed2 *event = record;
1596 size_t size = sizeof(*event);
1597
1598 if (increment) {
1599 event->header.size += increment;
1600 return increment;
1601 }
1602
1603 event->header.type = PERF_RECORD_COMPRESSED2;
1604 event->header.size = size;
1605
1606 return size;
1607 }
1608
1609 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1610 void *dst, size_t dst_size, void *src, size_t src_size)
1611 {
1612 ssize_t compressed;
1613 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1614 struct zstd_data *zstd_data = &session->zstd_data;
1615
1616 if (map && map->file)
1617 zstd_data = &map->zstd_data;
1618
1619 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1620 max_record_size, process_comp_header);
1621 if (compressed < 0)
1622 return compressed;
1623
1624 if (map && map->file) {
1625 thread->bytes_transferred += src_size;
1626 thread->bytes_compressed += compressed;
1627 } else {
1628 session->bytes_transferred += src_size;
1629 session->bytes_compressed += compressed;
1630 }
1631
1632 return compressed;
1633 }
1634
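/*
 * Drain the (overwrite or regular) mmaps owned by the current thread, either
 * via aio or a synchronous push, read AUX area data where needed, and emit a
 * PERF_RECORD_FINISHED_ROUND event in non-threaded mode if anything was
 * written.
 */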
1635 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1636 bool overwrite, bool synch)
1637 {
1638 u64 bytes_written = rec->bytes_written;
1639 int i;
1640 int rc = 0;
1641 int nr_mmaps;
1642 struct mmap **maps;
1643 int trace_fd = rec->data.file.fd;
1644 off_t off = 0;
1645
1646 if (!evlist)
1647 return 0;
1648
1649 nr_mmaps = thread->nr_mmaps;
1650 maps = overwrite ? thread->overwrite_maps : thread->maps;
1651
1652 if (!maps)
1653 return 0;
1654
1655 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1656 return 0;
1657
1658 if (record__aio_enabled(rec))
1659 off = record__aio_get_pos(trace_fd);
1660
1661 for (i = 0; i < nr_mmaps; i++) {
1662 u64 flush = 0;
1663 struct mmap *map = maps[i];
1664
1665 if (map->core.base) {
1666 record__adjust_affinity(rec, map);
1667 if (synch) {
1668 flush = map->core.flush;
1669 map->core.flush = 1;
1670 }
1671 if (!record__aio_enabled(rec)) {
1672 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1673 if (synch)
1674 map->core.flush = flush;
1675 rc = -1;
1676 goto out;
1677 }
1678 } else {
1679 if (record__aio_push(rec, map, &off) < 0) {
1680 record__aio_set_pos(trace_fd, off);
1681 if (synch)
1682 map->core.flush = flush;
1683 rc = -1;
1684 goto out;
1685 }
1686 }
1687 if (synch)
1688 map->core.flush = flush;
1689 }
1690
1691 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1692 !rec->opts.auxtrace_sample_mode &&
1693 record__auxtrace_mmap_read(rec, map) != 0) {
1694 rc = -1;
1695 goto out;
1696 }
1697 }
1698
1699 if (record__aio_enabled(rec))
1700 record__aio_set_pos(trace_fd, off);
1701
1702 /*
1703 * Mark the round finished in case we wrote
1704 * at least one event.
1705 *
1706 * No need for round events in directory mode,
1707 * because per-cpu maps and files have data
1708 * sorted by kernel.
1709 */
1710 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1711 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1712
1713 if (overwrite)
1714 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1715 out:
1716 return rc;
1717 }
1718
1719 static int record__mmap_read_all(struct record *rec, bool synch)
1720 {
1721 int err;
1722
1723 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1724 if (err)
1725 return err;
1726
1727 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1728 }
1729
1730 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1731 void *arg __maybe_unused)
1732 {
1733 struct perf_mmap *map = fda->priv[fd].ptr;
1734
1735 if (map)
1736 perf_mmap__put(map);
1737 }
1738
1739 static void *record__thread(void *arg)
1740 {
1741 enum thread_msg msg = THREAD_MSG__READY;
1742 bool terminate = false;
1743 struct fdarray *pollfd;
1744 int err, ctlfd_pos;
1745
1746 thread = arg;
1747 thread->tid = gettid();
1748
1749 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1750 if (err == -1)
1751 pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid);
1752
1753 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1754
1755 pollfd = &thread->pollfd;
1756 ctlfd_pos = thread->ctlfd_pos;
1757
1758 for (;;) {
1759 unsigned long long hits = thread->samples;
1760
1761 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1762 break;
1763
1764 if (hits == thread->samples) {
1765
1766 err = fdarray__poll(pollfd, -1);
1767 /*
1768 * Propagate an error only if there is one. Ignore a positive
1769 * number of returned events and interrupt errors.
1770 */
1771 if (err > 0 || (err < 0 && errno == EINTR))
1772 err = 0;
1773 thread->waking++;
1774
1775 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1776 record__thread_munmap_filtered, NULL) == 0)
1777 break;
1778 }
1779
1780 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1781 terminate = true;
1782 close(thread->pipes.msg[0]);
1783 thread->pipes.msg[0] = -1;
1784 pollfd->entries[ctlfd_pos].fd = -1;
1785 pollfd->entries[ctlfd_pos].events = 0;
1786 }
1787
1788 pollfd->entries[ctlfd_pos].revents = 0;
1789 }
1790 record__mmap_read_all(thread->rec, true);
1791
1792 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1793 if (err == -1)
1794 pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid);
1795
1796 return NULL;
1797 }
1798
1799 static void record__init_features(struct record *rec)
1800 {
1801 struct perf_session *session = rec->session;
1802 int feat;
1803
1804 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1805 perf_header__set_feat(&session->header, feat);
1806
1807 if (rec->no_buildid)
1808 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1809
1810 if (!have_tracepoints(&rec->evlist->core.entries))
1811 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1812
1813 if (!rec->opts.branch_stack)
1814 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1815
1816 if (!rec->opts.full_auxtrace)
1817 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1818
1819 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1820 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1821
1822 if (!rec->opts.use_clockid)
1823 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1824
1825 if (!record__threads_enabled(rec))
1826 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1827
1828 if (!record__comp_enabled(rec))
1829 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1830
1831 perf_header__clear_feat(&session->header, HEADER_STAT);
1832 }
1833
1834 static void
1835 record__finish_output(struct record *rec)
1836 {
1837 int i;
1838 struct perf_data *data = &rec->data;
1839 int fd = perf_data__fd(data);
1840
1841 if (data->is_pipe) {
1842 /* Just to display approx. size */
1843 data->file.size = rec->bytes_written;
1844 return;
1845 }
1846
1847 rec->session->header.data_size += rec->bytes_written;
1848 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1849 if (record__threads_enabled(rec)) {
1850 for (i = 0; i < data->dir.nr; i++)
1851 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1852 }
1853
1854 /* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1855 if (!rec->no_buildid || !rec->no_buildid_cache) {
1856 process_buildids(rec);
1857
1858 if (rec->buildid_all)
1859 perf_session__dsos_hit_all(rec->session);
1860 }
1861 perf_session__write_header(rec->session, rec->evlist, fd, true);
1862 perf_session__cache_build_ids(rec->session);
1863 }
1864
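/*
 * Synthesize thread (and, if requested, mmap) events for just the forked
 * workload, either at startup or, with --tail-synthesize, at the end of
 * the run.
 */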
1865 static int record__synthesize_workload(struct record *rec, bool tail)
1866 {
1867 int err;
1868 struct perf_thread_map *thread_map;
1869 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1870
1871 if (rec->opts.tail_synthesize != tail)
1872 return 0;
1873
1874 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1875 if (thread_map == NULL)
1876 return -1;
1877
1878 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1879 process_synthesized_event,
1880 &rec->session->machines.host,
1881 needs_mmap,
1882 rec->opts.record_data_mmap);
1883 perf_thread_map__put(thread_map);
1884 return err;
1885 }
1886
1887 static int write_finished_init(struct record *rec, bool tail)
1888 {
1889 if (rec->opts.tail_synthesize != tail)
1890 return 0;
1891
1892 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1893 }
1894
1895 static int record__synthesize(struct record *rec, bool tail);
1896
1897 static int
1898 record__switch_output(struct record *rec, bool at_exit)
1899 {
1900 struct perf_data *data = &rec->data;
1901 char *new_filename = NULL;
1902 int fd, err;
1903
1904 /* Same size as a real timestamp, e.g. "2015122520103046" */
1905 char timestamp[] = "InvalidTimestamp";
1906
1907 record__aio_mmap_read_sync(rec);
1908
1909 write_finished_init(rec, true);
1910
1911 record__synthesize(rec, true);
1912 if (target__none(&rec->opts.target))
1913 record__synthesize_workload(rec, true);
1914
1915 rec->samples = 0;
1916 record__finish_output(rec);
1917 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1918 if (err) {
1919 pr_err("Failed to get current timestamp\n");
1920 return -EINVAL;
1921 }
1922
1923 fd = perf_data__switch(data, timestamp,
1924 rec->session->header.data_offset,
1925 at_exit, &new_filename);
1926 if (fd >= 0 && !at_exit) {
1927 rec->bytes_written = 0;
1928 rec->session->header.data_size = 0;
1929 }
1930
1931 if (!quiet) {
1932 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1933 data->path, timestamp);
1934 }
1935
1936 if (rec->switch_output.num_files) {
1937 int n = rec->switch_output.cur_file + 1;
1938
1939 if (n >= rec->switch_output.num_files)
1940 n = 0;
1941 rec->switch_output.cur_file = n;
1942 if (rec->switch_output.filenames[n]) {
1943 remove(rec->switch_output.filenames[n]);
1944 zfree(&rec->switch_output.filenames[n]);
1945 }
1946 rec->switch_output.filenames[n] = new_filename;
1947 } else {
1948 free(new_filename);
1949 }
1950
1951 /* Output tracking events */
1952 if (!at_exit) {
1953 record__synthesize(rec, false);
1954
1955 /*
1956 * In 'perf record --switch-output' without -a,
1957 * record__synthesize() in record__switch_output() won't
1958 * generate tracking events because there's no thread_map
1959 * in evlist, which causes the newly created perf.data to
1960 * lack map and comm information.
1961 * Create a fake thread_map and directly call
1962 * perf_event__synthesize_thread_map() for those events.
1963 */
1964 if (target__none(&rec->opts.target))
1965 record__synthesize_workload(rec, false);
1966 write_finished_init(rec, false);
1967 }
1968 return fd;
1969 }
1970
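/*
 * Build a PERF_RECORD_LOST_SAMPLES event for one (cpu, thread) slot of an
 * evsel, append an id sample so it can be attributed, and write it to the
 * output.
 */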
1971 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1972 struct perf_record_lost_samples *lost,
1973 int cpu_idx, int thread_idx, u64 lost_count,
1974 u16 misc_flag)
1975 {
1976 struct perf_sample_id *sid;
1977 struct perf_sample sample;
1978 int id_hdr_size;
1979
1980 perf_sample__init(&sample, /*all=*/true);
1981 lost->lost = lost_count;
1982 if (evsel->core.ids) {
1983 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1984 sample.id = sid->id;
1985 }
1986
1987 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1988 evsel->core.attr.sample_type, &sample);
1989 lost->header.size = sizeof(*lost) + id_hdr_size;
1990 lost->header.misc = misc_flag;
1991 record__write(rec, NULL, lost, lost->header.size);
1992 perf_sample__exit(&sample);
1993 }
1994
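/*
 * At the end of the session, read the LOST counts from every counter fd
 * and emit PERF_RECORD_LOST_SAMPLES records, both for kernel-side drops
 * and for samples dropped by BPF filters.
 */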
1995 static void record__read_lost_samples(struct record *rec)
1996 {
1997 struct perf_session *session = rec->session;
1998 struct perf_record_lost_samples_and_ids lost;
1999 struct evsel *evsel;
2000
2001 /* there was an error during record__open */
2002 if (session->evlist == NULL)
2003 return;
2004
2005 evlist__for_each_entry(session->evlist, evsel) {
2006 struct xyarray *xy = evsel->core.sample_id;
2007 u64 lost_count;
2008
2009 if (xy == NULL || evsel->core.fd == NULL)
2010 continue;
2011 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
2012 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
2013 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
2014 continue;
2015 }
2016
2017 for (int x = 0; x < xyarray__max_x(xy); x++) {
2018 for (int y = 0; y < xyarray__max_y(xy); y++) {
2019 struct perf_counts_values count;
2020
2021 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
2022 pr_debug("read LOST count failed\n");
2023 return;
2024 }
2025
2026 if (count.lost) {
2027 memset(&lost, 0, sizeof(lost));
2028 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2029 __record__save_lost_samples(rec, evsel, &lost.lost,
2030 x, y, count.lost, 0);
2031 }
2032 }
2033 }
2034
2035 lost_count = perf_bpf_filter__lost_count(evsel);
2036 if (lost_count) {
2037 memset(&lost, 0, sizeof(lost));
2038 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2039 __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2040 PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2041 }
2042 }
2043 }
2044
2045 static volatile sig_atomic_t workload_exec_errno;
2046
2047 /*
2048 * evlist__prepare_workload will send a SIGUSR1
2049 * if the fork fails, since we asked for it by setting its
2050 * want_signal to true.
2051 */
2052 static void workload_exec_failed_signal(int signo __maybe_unused,
2053 siginfo_t *info,
2054 void *ucontext __maybe_unused)
2055 {
2056 workload_exec_errno = info->si_value.sival_int;
2057 done = 1;
2058 child_finished = 1;
2059 }
2060
2061 static void snapshot_sig_handler(int sig);
2062 static void alarm_sig_handler(int sig);
2063
2064 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2065 {
2066 if (evlist) {
2067 if (evlist->mmap && evlist->mmap[0].core.base)
2068 return evlist->mmap[0].core.base;
2069 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2070 return evlist->overwrite_mmap[0].core.base;
2071 }
2072 return NULL;
2073 }
2074
2075 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2076 {
2077 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2078 if (pc)
2079 return pc;
2080 return NULL;
2081 }
2082
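/*
 * Emit the synthetic (non-sample) events that describe the system state:
 * time conversion data, id index, auxtrace info, kernel and module maps,
 * extra attributes, thread and cpu maps, BPF and cgroup events, and the
 * existing tasks' threads/mmaps, optionally using multiple threads.
 */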
2083 static int record__synthesize(struct record *rec, bool tail)
2084 {
2085 struct perf_session *session = rec->session;
2086 struct machine *machine = &session->machines.host;
2087 struct perf_data *data = &rec->data;
2088 struct record_opts *opts = &rec->opts;
2089 struct perf_tool *tool = &rec->tool;
2090 int err = 0;
2091 event_op f = process_synthesized_event;
2092
2093 if (rec->opts.tail_synthesize != tail)
2094 return 0;
2095
2096 if (data->is_pipe) {
2097 err = perf_event__synthesize_for_pipe(tool, session, data,
2098 process_synthesized_event);
2099 if (err < 0)
2100 goto out;
2101
2102 rec->bytes_written += err;
2103 }
2104
2105 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2106 process_synthesized_event, machine);
2107 if (err)
2108 goto out;
2109
2110 /* Synthesize id_index before auxtrace_info */
2111 err = perf_event__synthesize_id_index(tool,
2112 process_synthesized_event,
2113 session->evlist, machine);
2114 if (err)
2115 goto out;
2116
2117 if (rec->opts.full_auxtrace) {
2118 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2119 session, process_synthesized_event);
2120 if (err)
2121 goto out;
2122 }
2123
2124 if (!evlist__exclude_kernel(rec->evlist)) {
2125 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2126 machine);
2127 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2128 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2129 "Check /proc/kallsyms permission or run as root.\n");
2130
2131 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2132 machine);
2133 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2134 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2135 "Check /proc/modules permission or run as root.\n");
2136 }
2137
2138 if (perf_guest) {
2139 machines__process_guests(&session->machines,
2140 perf_event__synthesize_guest_os, tool);
2141 }
2142
2143 err = perf_event__synthesize_extra_attr(&rec->tool,
2144 rec->evlist,
2145 process_synthesized_event,
2146 data->is_pipe);
2147 if (err)
2148 goto out;
2149
2150 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2151 process_synthesized_event,
2152 NULL);
2153 if (err < 0) {
2154 pr_err("Couldn't synthesize thread map.\n");
2155 return err;
2156 }
2157
2158 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2159 process_synthesized_event, NULL);
2160 if (err < 0) {
2161 pr_err("Couldn't synthesize cpu map.\n");
2162 return err;
2163 }
2164
2165 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2166 machine, opts);
2167 if (err < 0) {
2168 pr_warning("Couldn't synthesize bpf events.\n");
2169 err = 0;
2170 }
2171
2172 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2173 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2174 machine);
2175 if (err < 0) {
2176 pr_warning("Couldn't synthesize cgroup events.\n");
2177 err = 0;
2178 }
2179 }
2180
2181 if (rec->opts.nr_threads_synthesize > 1) {
2182 mutex_init(&synth_lock);
2183 perf_set_multithreaded();
2184 f = process_locked_synthesized_event;
2185 }
2186
2187 if (rec->opts.synth & PERF_SYNTH_TASK) {
2188 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2189
2190 err = __machine__synthesize_threads(machine, tool, &opts->target,
2191 rec->evlist->core.threads,
2192 f, needs_mmap, opts->record_data_mmap,
2193 rec->opts.nr_threads_synthesize);
2194 }
2195
2196 if (rec->opts.nr_threads_synthesize > 1) {
2197 perf_set_singlethreaded();
2198 mutex_destroy(&synth_lock);
2199 }
2200
2201 out:
2202 return err;
2203 }
2204
2205 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2206 {
2207 #ifdef HAVE_LIBBPF_SUPPORT
2208 perf_event__synthesize_final_bpf_metadata(rec->session,
2209 process_synthesized_event);
2210 #endif
2211 }
2212
2213 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2214 {
2215 struct record *rec = data;
2216 pthread_kill(rec->thread_id, SIGUSR2);
2217 return 0;
2218 }
2219
2220 static int record__setup_sb_evlist(struct record *rec)
2221 {
2222 struct record_opts *opts = &rec->opts;
2223
2224 if (rec->sb_evlist != NULL) {
2225 /*
2226 * We get here if --switch-output-event populated the
2227 * sb_evlist, so associate a callback that will send a SIGUSR2
2228 * to the main thread.
2229 */
2230 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2231 rec->thread_id = pthread_self();
2232 }
2233 #ifdef HAVE_LIBBPF_SUPPORT
2234 if (!opts->no_bpf_event) {
2235 if (rec->sb_evlist == NULL) {
2236 rec->sb_evlist = evlist__new();
2237
2238 if (rec->sb_evlist == NULL) {
2239 pr_err("Couldn't create side band evlist.\n.");
2240 return -1;
2241 }
2242 }
2243
2244 if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2245 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2246 return -1;
2247 }
2248 }
2249 #endif
2250 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2251 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2252 opts->no_bpf_event = true;
2253 }
2254
2255 return 0;
2256 }
2257
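/*
 * With -k/--clockid, store the clockid plus a paired gettimeofday() and
 * clock_gettime() reference reading in the perf_env, so the session
 * carries the offset between the sampling clock and wall-clock time.
 */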
2258 static int record__init_clock(struct record *rec)
2259 {
2260 struct perf_session *session = rec->session;
2261 struct timespec ref_clockid;
2262 struct timeval ref_tod;
2263 struct perf_env *env = perf_session__env(session);
2264 u64 ref;
2265
2266 if (!rec->opts.use_clockid)
2267 return 0;
2268
2269 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2270 env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2271
2272 env->clock.clockid = rec->opts.clockid;
2273
2274 if (gettimeofday(&ref_tod, NULL) != 0) {
2275 pr_err("gettimeofday failed, cannot set reference time.\n");
2276 return -1;
2277 }
2278
2279 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2280 pr_err("clock_gettime failed, cannot set reference time.\n");
2281 return -1;
2282 }
2283
2284 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2285 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2286
2287 env->clock.tod_ns = ref;
2288
2289 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2290 (u64) ref_clockid.tv_nsec;
2291
2292 env->clock.clockid_ns = ref;
2293 return 0;
2294 }
2295
2296 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2297 {
2298 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2299 trigger_hit(&auxtrace_snapshot_trigger);
2300 auxtrace_record__snapshot_started = 1;
2301 if (auxtrace_record__snapshot_start(rec->itr))
2302 trigger_error(&auxtrace_snapshot_trigger);
2303 }
2304 }
2305
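/*
 * Ask a reader thread to stop by closing the write end of its message
 * pipe, then wait for the ack it sends back after its final flush.
 */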
2306 static int record__terminate_thread(struct record_thread *thread_data)
2307 {
2308 int err;
2309 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2310 pid_t tid = thread_data->tid;
2311
2312 close(thread_data->pipes.msg[1]);
2313 thread_data->pipes.msg[1] = -1;
2314 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2315 if (err > 0)
2316 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2317 else
2318 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2319 thread->tid, tid);
2320
2321 return 0;
2322 }
2323
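/*
 * Start the auxiliary reader threads: block all signals so the new
 * threads inherit an empty mask, create them detached with their
 * configured CPU affinity, wait for each thread's READY ack on its pipe,
 * and finally pin the main thread to its own affinity mask.
 */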
2324 static int record__start_threads(struct record *rec)
2325 {
2326 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2327 struct record_thread *thread_data = rec->thread_data;
2328 sigset_t full, mask;
2329 pthread_t handle;
2330 pthread_attr_t attrs;
2331
2332 thread = &thread_data[0];
2333
2334 if (!record__threads_enabled(rec))
2335 return 0;
2336
2337 sigfillset(&full);
2338 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2339 pr_err("Failed to block signals on threads start: %m\n");
2340 return -1;
2341 }
2342
2343 pthread_attr_init(&attrs);
2344 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2345
2346 for (t = 1; t < nr_threads; t++) {
2347 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2348
2349 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2350 pthread_attr_setaffinity_np(&attrs,
2351 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2352 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2353 #endif
2354 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2355 for (tt = 1; tt < t; tt++)
2356 record__terminate_thread(&thread_data[tt]);
2357 pr_err("Failed to start threads: %m\n");
2358 ret = -1;
2359 goto out_err;
2360 }
2361
2362 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2363 if (err > 0)
2364 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2365 thread_msg_tags[msg]);
2366 else
2367 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2368 thread->tid, rec->thread_data[t].tid);
2369 }
2370
2371 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2372 (cpu_set_t *)thread->mask->affinity.bits);
2373
2374 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2375
2376 out_err:
2377 pthread_attr_destroy(&attrs);
2378
2379 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2380 pr_err("Failed to unblock signals on threads start: %m\n");
2381 ret = -1;
2382 }
2383
2384 return ret;
2385 }
2386
2387 static int record__stop_threads(struct record *rec)
2388 {
2389 int t;
2390 struct record_thread *thread_data = rec->thread_data;
2391
2392 for (t = 1; t < rec->nr_threads; t++)
2393 record__terminate_thread(&thread_data[t]);
2394
2395 for (t = 0; t < rec->nr_threads; t++) {
2396 rec->samples += thread_data[t].samples;
2397 if (!record__threads_enabled(rec))
2398 continue;
2399 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2400 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2401 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2402 thread_data[t].samples, thread_data[t].waking);
2403 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2404 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2405 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2406 else
2407 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2408 }
2409
2410 return 0;
2411 }
2412
2413 static unsigned long record__waking(struct record *rec)
2414 {
2415 int t;
2416 unsigned long waking = 0;
2417 struct record_thread *thread_data = rec->thread_data;
2418
2419 for (t = 0; t < rec->nr_threads; t++)
2420 waking += thread_data[t].waking;
2421
2422 return waking;
2423 }
2424
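/*
 * Main body of 'perf record': set up signals and the session, open and
 * mmap the events, optionally fork the workload, loop reading the ring
 * buffers until done, then flush, synthesize tail events and finish the
 * output file.
 */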
2425 static int __cmd_record(struct record *rec, int argc, const char **argv)
2426 {
2427 int err;
2428 int status = 0;
2429 const bool forks = argc > 0;
2430 struct perf_tool *tool = &rec->tool;
2431 struct record_opts *opts = &rec->opts;
2432 struct perf_data *data = &rec->data;
2433 struct perf_session *session;
2434 bool disabled = false, draining = false;
2435 int fd;
2436 float ratio = 0;
2437 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2438 struct perf_env *env;
2439
2440 atexit(record__sig_exit);
2441 signal(SIGCHLD, sig_handler);
2442 signal(SIGINT, sig_handler);
2443 signal(SIGTERM, sig_handler);
2444 signal(SIGSEGV, sigsegv_handler);
2445
2446 if (rec->opts.record_cgroup) {
2447 #ifndef HAVE_FILE_HANDLE
2448 pr_err("cgroup tracking is not supported\n");
2449 return -1;
2450 #endif
2451 }
2452
2453 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2454 signal(SIGUSR2, snapshot_sig_handler);
2455 if (rec->opts.auxtrace_snapshot_mode)
2456 trigger_on(&auxtrace_snapshot_trigger);
2457 if (rec->switch_output.enabled)
2458 trigger_on(&switch_output_trigger);
2459 } else {
2460 signal(SIGUSR2, SIG_IGN);
2461 }
2462
2463 perf_tool__init(tool, /*ordered_events=*/true);
2464 tool->sample = process_sample_event;
2465 tool->fork = perf_event__process_fork;
2466 tool->exit = perf_event__process_exit;
2467 tool->comm = perf_event__process_comm;
2468 tool->namespaces = perf_event__process_namespaces;
2469 tool->mmap = build_id__process_mmap;
2470 tool->mmap2 = build_id__process_mmap2;
2471 tool->itrace_start = process_timestamp_boundary;
2472 tool->aux = process_timestamp_boundary;
2473 tool->namespace_events = rec->opts.record_namespaces;
2474 tool->cgroup_events = rec->opts.record_cgroup;
2475 session = perf_session__new(data, tool);
2476 if (IS_ERR(session)) {
2477 pr_err("Perf session creation failed.\n");
2478 return PTR_ERR(session);
2479 }
2480 env = perf_session__env(session);
2481 if (record__threads_enabled(rec)) {
2482 if (perf_data__is_pipe(&rec->data)) {
2483 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2484 return -1;
2485 }
2486 if (rec->opts.full_auxtrace) {
2487 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2488 return -1;
2489 }
2490 }
2491
2492 fd = perf_data__fd(data);
2493 rec->session = session;
2494
2495 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2496 pr_err("Compression initialization failed.\n");
2497 return -1;
2498 }
2499 #ifdef HAVE_EVENTFD_SUPPORT
2500 done_fd = eventfd(0, EFD_NONBLOCK);
2501 if (done_fd < 0) {
2502 pr_err("Failed to create wakeup eventfd, error: %m\n");
2503 status = -1;
2504 goto out_delete_session;
2505 }
2506 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2507 if (err < 0) {
2508 pr_err("Failed to add wakeup eventfd to poll list\n");
2509 status = err;
2510 goto out_delete_session;
2511 }
2512 #endif // HAVE_EVENTFD_SUPPORT
2513
2514 env->comp_type = PERF_COMP_ZSTD;
2515 env->comp_level = rec->opts.comp_level;
2516
2517 if (rec->opts.kcore &&
2518 !record__kcore_readable(&session->machines.host)) {
2519 pr_err("ERROR: kcore is not readable.\n");
2520 return -1;
2521 }
2522
2523 if (record__init_clock(rec))
2524 return -1;
2525
2526 record__init_features(rec);
2527
2528 if (forks) {
2529 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2530 workload_exec_failed_signal);
2531 if (err < 0) {
2532 pr_err("Couldn't run the workload!\n");
2533 status = err;
2534 goto out_delete_session;
2535 }
2536 }
2537
2538 /*
2539 * If we have just a single event and are sending data
2540 * through a pipe, we need to force the id allocation,
2541 * because we synthesize the event name through the pipe
2542 * and need the id for that.
2543 */
2544 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2545 rec->opts.sample_id = true;
2546
2547 if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2548 rec->timestamp_filename = false;
2549 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2550 }
2551
2552 /*
2553 * Use the global stat_config, which is zeroed, meaning aggr_mode is AGGR_NONE
2554 * and hybrid_merge is false.
2555 */
2556 evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2557
2558 evlist__config(rec->evlist, opts, &callchain_param);
2559
2560 /* Debug message used by test scripts */
2561 pr_debug3("perf record opening and mmapping events\n");
2562 if (record__open(rec) != 0) {
2563 err = -1;
2564 goto out_free_threads;
2565 }
2566 /* Debug message used by test scripts */
2567 pr_debug3("perf record done opening and mmapping events\n");
2568 env->comp_mmap_len = session->evlist->core.mmap_len;
2569
2570 if (rec->opts.kcore) {
2571 err = record__kcore_copy(&session->machines.host, data);
2572 if (err) {
2573 pr_err("ERROR: Failed to copy kcore\n");
2574 goto out_free_threads;
2575 }
2576 }
2577
2578 /*
2579 * Normally perf_session__new would do this, but it doesn't have the
2580 * evlist.
2581 */
2582 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2583 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2584 rec->tool.ordered_events = false;
2585 }
2586
2587 if (evlist__nr_groups(rec->evlist) == 0)
2588 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2589
2590 if (data->is_pipe) {
2591 err = perf_header__write_pipe(fd);
2592 if (err < 0)
2593 goto out_free_threads;
2594 } else {
2595 err = perf_session__write_header(session, rec->evlist, fd, false);
2596 if (err < 0)
2597 goto out_free_threads;
2598 }
2599
2600 err = -1;
2601 if (!rec->no_buildid
2602 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2603 pr_err("Couldn't generate buildids. "
2604 "Use --no-buildid to profile anyway.\n");
2605 goto out_free_threads;
2606 }
2607
2608 if (!evlist__needs_bpf_sb_event(rec->evlist))
2609 opts->no_bpf_event = true;
2610
2611 err = record__setup_sb_evlist(rec);
2612 if (err)
2613 goto out_free_threads;
2614
2615 err = record__synthesize(rec, false);
2616 if (err < 0)
2617 goto out_free_threads;
2618
2619 if (rec->realtime_prio) {
2620 struct sched_param param;
2621
2622 param.sched_priority = rec->realtime_prio;
2623 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2624 pr_err("Could not set realtime priority.\n");
2625 err = -1;
2626 goto out_free_threads;
2627 }
2628 }
2629
2630 if (record__start_threads(rec))
2631 goto out_free_threads;
2632
2633 /*
2634 * When perf is starting the traced process, all the events
2635 * (apart from group members) have enable_on_exec=1 set,
2636 * so don't spoil it by prematurely enabling them.
2637 */
2638 if (!target__none(&opts->target) && !opts->target.initial_delay)
2639 evlist__enable(rec->evlist);
2640
2641 /*
2642 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2643 * when recording a workload; do it manually.
2644 */
2645 if (rec->off_cpu)
2646 evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2647
2648 /*
2649 * Let the child rip
2650 */
2651 if (forks) {
2652 struct machine *machine = &session->machines.host;
2653 union perf_event *event;
2654 pid_t tgid;
2655
2656 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2657 if (event == NULL) {
2658 err = -ENOMEM;
2659 goto out_child;
2660 }
2661
2662 /*
2663 * Some H/W events are generated before the COMM event,
2664 * which is emitted during exec(), so perf script
2665 * cannot see a correct process name for those events.
2666 * Synthesize a COMM event to prevent that.
2667 */
2668 tgid = perf_event__synthesize_comm(tool, event,
2669 rec->evlist->workload.pid,
2670 process_synthesized_event,
2671 machine);
2672 free(event);
2673
2674 if (tgid == -1)
2675 goto out_child;
2676
2677 event = malloc(sizeof(event->namespaces) +
2678 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2679 machine->id_hdr_size);
2680 if (event == NULL) {
2681 err = -ENOMEM;
2682 goto out_child;
2683 }
2684
2685 /*
2686 * Synthesize NAMESPACES event for the command specified.
2687 */
2688 perf_event__synthesize_namespaces(tool, event,
2689 rec->evlist->workload.pid,
2690 tgid, process_synthesized_event,
2691 machine);
2692 free(event);
2693
2694 evlist__start_workload(rec->evlist);
2695 }
2696
2697 if (opts->target.initial_delay) {
2698 pr_info(EVLIST_DISABLED_MSG);
2699 if (opts->target.initial_delay > 0) {
2700 usleep(opts->target.initial_delay * USEC_PER_MSEC);
2701 evlist__enable(rec->evlist);
2702 pr_info(EVLIST_ENABLED_MSG);
2703 }
2704 }
2705
2706 err = event_enable_timer__start(rec->evlist->eet);
2707 if (err)
2708 goto out_child;
2709
2710 /* Debug message used by test scripts */
2711 pr_debug3("perf record has started\n");
2712 fflush(stderr);
2713
2714 trigger_ready(&auxtrace_snapshot_trigger);
2715 trigger_ready(&switch_output_trigger);
2716 perf_hooks__invoke_record_start();
2717
2718 /*
2719 * Must write FINISHED_INIT so it will be seen after all other
2720 * synthesized user events, but before any regular events.
2721 */
2722 err = write_finished_init(rec, false);
2723 if (err < 0)
2724 goto out_child;
2725
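	/*
	 * Main read loop: drain the mmaps, handle auxtrace snapshot and
	 * switch-output triggers, poll when no new samples arrived, process
	 * control fd commands, and disable the events once 'done' is set for
	 * an attached target.
	 */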
2726 for (;;) {
2727 unsigned long long hits = thread->samples;
2728
2729 /*
2730 * rec->evlist->bkw_mmap_state may be
2731 * BKW_MMAP_EMPTY here: when done == true and
2732 * hits != rec->samples in the previous round.
2733 *
2734 * evlist__toggle_bkw_mmap ensures we never
2735 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2736 */
2737 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2738 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2739
2740 if (record__mmap_read_all(rec, false) < 0) {
2741 trigger_error(&auxtrace_snapshot_trigger);
2742 trigger_error(&switch_output_trigger);
2743 err = -1;
2744 goto out_child;
2745 }
2746
2747 if (auxtrace_record__snapshot_started) {
2748 auxtrace_record__snapshot_started = 0;
2749 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2750 record__read_auxtrace_snapshot(rec, false);
2751 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2752 pr_err("AUX area tracing snapshot failed\n");
2753 err = -1;
2754 goto out_child;
2755 }
2756 }
2757
2758 if (trigger_is_hit(&switch_output_trigger)) {
2759 /*
2760 * If switch_output_trigger is hit, the data in the
2761 * overwritable ring buffer should have been collected,
2762 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2763 *
2764 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2765 * record__mmap_read_all() didn't collect data from the
2766 * overwritable ring buffer. Read again.
2767 */
2768 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2769 continue;
2770 trigger_ready(&switch_output_trigger);
2771
2772 /*
2773 * Reenable events in overwrite ring buffer after
2774 * record__mmap_read_all(): we should have collected
2775 * data from it.
2776 */
2777 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2778
2779 if (!quiet)
2780 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2781 record__waking(rec));
2782 thread->waking = 0;
2783 fd = record__switch_output(rec, false);
2784 if (fd < 0) {
2785 pr_err("Failed to switch to new file\n");
2786 trigger_error(&switch_output_trigger);
2787 err = fd;
2788 goto out_child;
2789 }
2790
2791 /* re-arm the alarm */
2792 if (rec->switch_output.time)
2793 alarm(rec->switch_output.time);
2794 }
2795
2796 if (hits == thread->samples) {
2797 if (done || draining)
2798 break;
2799 err = fdarray__poll(&thread->pollfd, -1);
2800 /*
2801 * Propagate the error only if there is one. Ignore a positive
2802 * number of returned events and interrupted polls (EINTR).
2803 */
2804 if (err > 0 || (err < 0 && errno == EINTR))
2805 err = 0;
2806 thread->waking++;
2807
2808 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2809 record__thread_munmap_filtered, NULL) == 0)
2810 draining = true;
2811
2812 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2813 if (err)
2814 goto out_child;
2815 }
2816
2817 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2818 switch (cmd) {
2819 case EVLIST_CTL_CMD_SNAPSHOT:
2820 hit_auxtrace_snapshot_trigger(rec);
2821 evlist__ctlfd_ack(rec->evlist);
2822 break;
2823 case EVLIST_CTL_CMD_STOP:
2824 done = 1;
2825 break;
2826 case EVLIST_CTL_CMD_ACK:
2827 case EVLIST_CTL_CMD_UNSUPPORTED:
2828 case EVLIST_CTL_CMD_ENABLE:
2829 case EVLIST_CTL_CMD_DISABLE:
2830 case EVLIST_CTL_CMD_EVLIST:
2831 case EVLIST_CTL_CMD_PING:
2832 default:
2833 break;
2834 }
2835 }
2836
2837 err = event_enable_timer__process(rec->evlist->eet);
2838 if (err < 0)
2839 goto out_child;
2840 if (err) {
2841 err = 0;
2842 done = 1;
2843 }
2844
2845 /*
2846 * When perf is starting the traced process, the events die with
2847 * the process at the end and we wait for that, so there is no need
2848 * to disable events in this case.
2849 */
2850 if (done && !disabled && !target__none(&opts->target)) {
2851 trigger_off(&auxtrace_snapshot_trigger);
2852 evlist__disable(rec->evlist);
2853 disabled = true;
2854 }
2855 }
2856
2857 trigger_off(&auxtrace_snapshot_trigger);
2858 trigger_off(&switch_output_trigger);
2859
2860 record__synthesize_final_bpf_metadata(rec);
2861
2862 if (opts->auxtrace_snapshot_on_exit)
2863 record__auxtrace_snapshot_exit(rec);
2864
2865 if (forks && workload_exec_errno) {
2866 char msg[STRERR_BUFSIZE];
2867 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2868 struct strbuf sb = STRBUF_INIT;
2869
2870 evlist__format_evsels(rec->evlist, &sb, 2048);
2871
2872 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2873 sb.buf, argv[0], emsg);
2874 strbuf_release(&sb);
2875 err = -1;
2876 goto out_child;
2877 }
2878
2879 if (!quiet)
2880 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2881 record__waking(rec));
2882
2883 write_finished_init(rec, true);
2884
2885 if (target__none(&rec->opts.target))
2886 record__synthesize_workload(rec, true);
2887
2888 out_child:
2889 record__stop_threads(rec);
2890 record__mmap_read_all(rec, true);
2891 out_free_threads:
2892 record__free_thread_data(rec);
2893 evlist__finalize_ctlfd(rec->evlist);
2894 record__aio_mmap_read_sync(rec);
2895
2896 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2897 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2898 env->comp_ratio = ratio + 0.5;
2899 }
2900
2901 if (forks) {
2902 int exit_status;
2903
2904 if (!child_finished)
2905 kill(rec->evlist->workload.pid, SIGTERM);
2906
2907 wait(&exit_status);
2908
2909 if (err < 0)
2910 status = err;
2911 else if (WIFEXITED(exit_status))
2912 status = WEXITSTATUS(exit_status);
2913 else if (WIFSIGNALED(exit_status))
2914 signr = WTERMSIG(exit_status);
2915 } else
2916 status = err;
2917
2918 if (rec->off_cpu)
2919 rec->bytes_written += off_cpu_write(rec->session);
2920
2921 record__read_lost_samples(rec);
2922 /* this will be recalculated during process_buildids() */
2923 rec->samples = 0;
2924
2925 if (!err) {
2926 record__synthesize(rec, true);
2927 if (!rec->timestamp_filename) {
2928 record__finish_output(rec);
2929 } else {
2930 fd = record__switch_output(rec, true);
2931 if (fd < 0) {
2932 status = fd;
2933 goto out_delete_session;
2934 }
2935 }
2936 }
2937
2938 perf_hooks__invoke_record_end();
2939
2940 if (!err && !quiet) {
2941 char samples[128];
2942 const char *postfix = rec->timestamp_filename ?
2943 ".<timestamp>" : "";
2944
2945 if (rec->samples && !rec->opts.full_auxtrace)
2946 scnprintf(samples, sizeof(samples),
2947 " (%" PRIu64 " samples)", rec->samples);
2948 else
2949 samples[0] = '\0';
2950
2951 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2952 perf_data__size(data) / 1024.0 / 1024.0,
2953 data->path, postfix, samples);
2954 if (ratio) {
2955 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2956 rec->session->bytes_transferred / 1024.0 / 1024.0,
2957 ratio);
2958 }
2959 fprintf(stderr, " ]\n");
2960 }
2961
2962 out_delete_session:
2963 #ifdef HAVE_EVENTFD_SUPPORT
2964 if (done_fd >= 0) {
2965 fd = done_fd;
2966 done_fd = -1;
2967
2968 close(fd);
2969 }
2970 #endif
2971 zstd_fini(&session->zstd_data);
2972 if (!opts->no_bpf_event)
2973 evlist__stop_sb_thread(rec->sb_evlist);
2974
2975 perf_session__delete(session);
2976 return status;
2977 }
2978
2979 static void callchain_debug(struct callchain_param *callchain)
2980 {
2981 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2982
2983 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2984
2985 if (callchain->record_mode == CALLCHAIN_DWARF)
2986 pr_debug("callchain: stack dump size %d\n",
2987 callchain->dump_size);
2988 }
2989
2990 int record_opts__parse_callchain(struct record_opts *record,
2991 struct callchain_param *callchain,
2992 const char *arg, bool unset)
2993 {
2994 int ret;
2995 callchain->enabled = !unset;
2996
2997 /* --no-call-graph */
2998 if (unset) {
2999 callchain->record_mode = CALLCHAIN_NONE;
3000 pr_debug("callchain: disabled\n");
3001 return 0;
3002 }
3003
3004 ret = parse_callchain_record_opt(arg, callchain);
3005 if (!ret) {
3006 /* Enable data address sampling for DWARF unwind. */
3007 if (callchain->record_mode == CALLCHAIN_DWARF &&
3008 !record->record_data_mmap_set)
3009 record->record_data_mmap = true;
3010 callchain_debug(callchain);
3011 }
3012
3013 return ret;
3014 }
3015
3016 int record_parse_callchain_opt(const struct option *opt,
3017 const char *arg,
3018 int unset)
3019 {
3020 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
3021 }
3022
3023 int record_callchain_opt(const struct option *opt,
3024 const char *arg __maybe_unused,
3025 int unset __maybe_unused)
3026 {
3027 struct callchain_param *callchain = opt->value;
3028
3029 callchain->enabled = true;
3030
3031 if (callchain->record_mode == CALLCHAIN_NONE)
3032 callchain->record_mode = CALLCHAIN_FP;
3033
3034 callchain_debug(callchain);
3035 return 0;
3036 }
3037
3038 static int perf_record_config(const char *var, const char *value, void *cb)
3039 {
3040 struct record *rec = cb;
3041
3042 if (!strcmp(var, "record.build-id")) {
3043 if (!strcmp(value, "cache"))
3044 rec->no_buildid_cache = false;
3045 else if (!strcmp(value, "no-cache"))
3046 rec->no_buildid_cache = true;
3047 else if (!strcmp(value, "skip"))
3048 rec->no_buildid = rec->no_buildid_cache = true;
3049 else if (!strcmp(value, "mmap"))
3050 rec->buildid_mmap = true;
3051 else if (!strcmp(value, "no-mmap"))
3052 rec->buildid_mmap = false;
3053 else
3054 return -1;
3055 return 0;
3056 }
3057 if (!strcmp(var, "record.call-graph")) {
3058 var = "call-graph.record-mode";
3059 return perf_default_config(var, value, cb);
3060 }
3061 #ifdef HAVE_AIO_SUPPORT
3062 if (!strcmp(var, "record.aio")) {
3063 rec->opts.nr_cblocks = strtol(value, NULL, 0);
3064 if (!rec->opts.nr_cblocks)
3065 rec->opts.nr_cblocks = nr_cblocks_default;
3066 }
3067 #endif
3068 if (!strcmp(var, "record.debuginfod")) {
3069 rec->debuginfod.urls = strdup(value);
3070 if (!rec->debuginfod.urls)
3071 return -ENOMEM;
3072 rec->debuginfod.set = true;
3073 }
3074
3075 return 0;
3076 }
3077
3078 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3079 {
3080 struct record *rec = (struct record *)opt->value;
3081
3082 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3083 }
3084
3085 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3086 {
3087 struct record_opts *opts = (struct record_opts *)opt->value;
3088
3089 if (unset || !str)
3090 return 0;
3091
3092 if (!strcasecmp(str, "node"))
3093 opts->affinity = PERF_AFFINITY_NODE;
3094 else if (!strcasecmp(str, "cpu"))
3095 opts->affinity = PERF_AFFINITY_CPU;
3096
3097 return 0;
3098 }
3099
3100 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3101 {
3102 mask->nbits = nr_bits;
3103 mask->bits = bitmap_zalloc(mask->nbits);
3104 if (!mask->bits)
3105 return -ENOMEM;
3106
3107 return 0;
3108 }
3109
3110 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3111 {
3112 bitmap_free(mask->bits);
3113 mask->nbits = 0;
3114 }
3115
3116 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3117 {
3118 int ret;
3119
3120 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3121 if (ret) {
3122 mask->affinity.bits = NULL;
3123 return ret;
3124 }
3125
3126 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3127 if (ret) {
3128 record__mmap_cpu_mask_free(&mask->maps);
3129 mask->maps.bits = NULL;
3130 }
3131
3132 return ret;
3133 }
3134
3135 static void record__thread_mask_free(struct thread_mask *mask)
3136 {
3137 record__mmap_cpu_mask_free(&mask->maps);
3138 record__mmap_cpu_mask_free(&mask->affinity);
3139 }
3140
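/*
 * Parse the --threads argument: with no value the 'cpu' spec is used;
 * otherwise match one of the predefined specs (cpu/core/package/numa) by
 * prefix, or fall back to treating the string as a user-defined spec.
 */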
3141 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3142 {
3143 int s;
3144 struct record_opts *opts = opt->value;
3145
3146 if (unset || !str || !strlen(str)) {
3147 opts->threads_spec = THREAD_SPEC__CPU;
3148 } else {
3149 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3150 if (s == THREAD_SPEC__USER) {
3151 opts->threads_user_spec = strdup(str);
3152 if (!opts->threads_user_spec)
3153 return -ENOMEM;
3154 opts->threads_spec = THREAD_SPEC__USER;
3155 break;
3156 }
3157 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3158 opts->threads_spec = s;
3159 break;
3160 }
3161 }
3162 }
3163
3164 if (opts->threads_spec == THREAD_SPEC__USER)
3165 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3166 else
3167 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3168
3169 return 0;
3170 }
3171
3172 static int parse_output_max_size(const struct option *opt,
3173 const char *str, int unset)
3174 {
3175 unsigned long *s = (unsigned long *)opt->value;
3176 static struct parse_tag tags_size[] = {
3177 { .tag = 'B', .mult = 1 },
3178 { .tag = 'K', .mult = 1 << 10 },
3179 { .tag = 'M', .mult = 1 << 20 },
3180 { .tag = 'G', .mult = 1 << 30 },
3181 { .tag = 0 },
3182 };
3183 unsigned long val;
3184
3185 if (unset) {
3186 *s = 0;
3187 return 0;
3188 }
3189
3190 val = parse_tag_value(str, tags_size);
3191 if (val != (unsigned long) -1) {
3192 *s = val;
3193 return 0;
3194 }
3195
3196 return -1;
3197 }
3198
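/*
 * Parse -m/--mmap-pages: the value before an optional comma sets the data
 * mmap size, the value after it sets the AUX area tracing mmap size.
 */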
3199 static int record__parse_mmap_pages(const struct option *opt,
3200 const char *str,
3201 int unset __maybe_unused)
3202 {
3203 struct record_opts *opts = opt->value;
3204 char *s, *p;
3205 unsigned int mmap_pages;
3206 int ret;
3207
3208 if (!str)
3209 return -EINVAL;
3210
3211 s = strdup(str);
3212 if (!s)
3213 return -ENOMEM;
3214
3215 p = strchr(s, ',');
3216 if (p)
3217 *p = '\0';
3218
3219 if (*s) {
3220 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3221 if (ret)
3222 goto out_free;
3223 opts->mmap_pages = mmap_pages;
3224 }
3225
3226 if (!p) {
3227 ret = 0;
3228 goto out_free;
3229 }
3230
3231 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3232 if (ret)
3233 goto out_free;
3234
3235 opts->auxtrace_mmap_pages = mmap_pages;
3236
3237 out_free:
3238 free(s);
3239 return ret;
3240 }
3241
3242 static int record__parse_off_cpu_thresh(const struct option *opt,
3243 const char *str,
3244 int unset __maybe_unused)
3245 {
3246 struct record_opts *opts = opt->value;
3247 char *endptr;
3248 u64 off_cpu_thresh_ms;
3249
3250 if (!str)
3251 return -EINVAL;
3252
3253 off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3254
3255 /* strtoull() returned 0 but the string isn't "0": parsing failed */
3256 if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3257 return -EINVAL;
3258 else
3259 opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3260
3261 return 0;
3262 }
3263
3264 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3265 {
3266 }
3267
3268 static int parse_control_option(const struct option *opt,
3269 const char *str,
3270 int unset __maybe_unused)
3271 {
3272 struct record_opts *opts = opt->value;
3273
3274 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3275 }
3276
3277 static void switch_output_size_warn(struct record *rec)
3278 {
3279 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3280 struct switch_output *s = &rec->switch_output;
3281
3282 wakeup_size /= 2;
3283
3284 if (s->size < wakeup_size) {
3285 char buf[100];
3286
3287 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3288 pr_warning("WARNING: switch-output data size lower than "
3289 "wakeup kernel buffer size (%s) "
3290 "expect bigger perf.data sizes\n", buf);
3291 }
3292 }
3293
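/*
 * Parse the --switch-output argument: "signal" (also implied by
 * --switch-output-event), a size with a B/K/M/G suffix, or a time with an
 * s/m/h/d suffix. Any of these enables timestamped output file names.
 */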
3294 static int switch_output_setup(struct record *rec)
3295 {
3296 struct switch_output *s = &rec->switch_output;
3297 static struct parse_tag tags_size[] = {
3298 { .tag = 'B', .mult = 1 },
3299 { .tag = 'K', .mult = 1 << 10 },
3300 { .tag = 'M', .mult = 1 << 20 },
3301 { .tag = 'G', .mult = 1 << 30 },
3302 { .tag = 0 },
3303 };
3304 static struct parse_tag tags_time[] = {
3305 { .tag = 's', .mult = 1 },
3306 { .tag = 'm', .mult = 60 },
3307 { .tag = 'h', .mult = 60*60 },
3308 { .tag = 'd', .mult = 60*60*24 },
3309 { .tag = 0 },
3310 };
3311 unsigned long val;
3312
3313 /*
3314 * If we're using --switch-output-event, then we imply
3315 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3316 * thread to its parent.
3317 */
3318 if (rec->switch_output_event_set) {
3319 if (record__threads_enabled(rec)) {
3320 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3321 return 0;
3322 }
3323 goto do_signal;
3324 }
3325
3326 if (!s->set)
3327 return 0;
3328
3329 if (record__threads_enabled(rec)) {
3330 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3331 return 0;
3332 }
3333
3334 if (!strcmp(s->str, "signal")) {
3335 do_signal:
3336 s->signal = true;
3337 pr_debug("switch-output with SIGUSR2 signal\n");
3338 goto enabled;
3339 }
3340
3341 val = parse_tag_value(s->str, tags_size);
3342 if (val != (unsigned long) -1) {
3343 s->size = val;
3344 pr_debug("switch-output with %s size threshold\n", s->str);
3345 goto enabled;
3346 }
3347
3348 val = parse_tag_value(s->str, tags_time);
3349 if (val != (unsigned long) -1) {
3350 s->time = val;
3351 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3352 s->str, s->time);
3353 goto enabled;
3354 }
3355
3356 return -1;
3357
3358 enabled:
3359 rec->timestamp_filename = true;
3360 s->enabled = true;
3361
3362 if (s->size && !rec->opts.no_buffering)
3363 switch_output_size_warn(rec);
3364
3365 return 0;
3366 }
3367
3368 static const char * const __record_usage[] = {
3369 "perf record [<options>] [<command>]",
3370 "perf record [<options>] -- <command> [<options>]",
3371 NULL
3372 };
3373 const char * const *record_usage = __record_usage;
3374
3375 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3376 struct perf_sample *sample, struct machine *machine)
3377 {
3378 /*
3379 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3380 * so there is no need to add them twice.
3381 */
3382 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3383 return 0;
3384 return perf_event__process_mmap(tool, event, sample, machine);
3385 }
3386
3387 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3388 struct perf_sample *sample, struct machine *machine)
3389 {
3390 /*
3391 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3392 * so there is no need to add them twice.
3393 */
3394 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3395 return 0;
3396
3397 return perf_event__process_mmap2(tool, event, sample, machine);
3398 }
3399
3400 static int process_timestamp_boundary(const struct perf_tool *tool,
3401 union perf_event *event __maybe_unused,
3402 struct perf_sample *sample,
3403 struct machine *machine __maybe_unused)
3404 {
3405 struct record *rec = container_of(tool, struct record, tool);
3406
3407 set_timestamp_boundary(rec, sample->time);
3408 return 0;
3409 }
3410
3411 static int parse_record_synth_option(const struct option *opt,
3412 const char *str,
3413 int unset __maybe_unused)
3414 {
3415 struct record_opts *opts = opt->value;
3416 char *p = strdup(str);
3417
3418 if (p == NULL)
3419 return -1;
3420
3421 opts->synth = parse_synth_opt(p);
3422 free(p);
3423
3424 if (opts->synth < 0) {
3425 pr_err("Invalid synth option: %s\n", str);
3426 return -1;
3427 }
3428 return 0;
3429 }
3430
3431 /*
3432 * XXX Ideally this would be local to cmd_record() and passed to a record__new
3433 * because we need to have access to it in record__exit, which is called
3434 * after cmd_record() exits, but since record_options needs to be accessible to
3435 * builtin-script, leave it here.
3436 *
3437 * At least we don't touch it in all the other functions here directly.
3438 *
3439 * Just say no to tons of global variables, sigh.
3440 */
3441 static struct record record = {
3442 .opts = {
3443 .sample_time = true,
3444 .mmap_pages = UINT_MAX,
3445 .user_freq = UINT_MAX,
3446 .user_interval = ULLONG_MAX,
3447 .freq = 4000,
3448 .target = {
3449 .uses_mmap = true,
3450 .default_per_cpu = true,
3451 },
3452 .mmap_flush = MMAP_FLUSH_DEFAULT,
3453 .nr_threads_synthesize = 1,
3454 .ctl_fd = -1,
3455 .ctl_fd_ack = -1,
3456 .synth = PERF_SYNTH_ALL,
3457 .off_cpu_thresh_ns = OFFCPU_THRESH,
3458 },
3459 .buildid_mmap = true,
3460 };
3461
3462 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3463 "\n\t\t\t\tDefault: fp";
3464
3465 static bool dry_run;
3466
3467 static struct parse_events_option_args parse_events_option_args = {
3468 .evlistp = &record.evlist,
3469 };
3470
3471 static struct parse_events_option_args switch_output_parse_events_option_args = {
3472 .evlistp = &record.sb_evlist,
3473 };
3474
3475 /*
3476 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3477 * with it and switch to using the library functions in perf_evlist that came
3478 * from builtin-record.c, i.e. use record_opts,
3479 * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3480 * using pipes, etc.
3481 */
3482 static struct option __record_options[] = {
3483 OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3484 "event selector. use 'perf list' to list available events",
3485 parse_events_option),
3486 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3487 "event filter", parse_filter),
3488 OPT_BOOLEAN(0, "latency", &record.latency,
3489 "Enable data collection for latency profiling.\n"
3490 "\t\t\t Use perf report --latency for latency-centric profile."),
3491 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3492 NULL, "don't record events from perf itself",
3493 exclude_perf),
3494 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3495 "record events on existing process id"),
3496 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3497 "record events on existing thread id"),
3498 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3499 "collect data with this RT SCHED_FIFO priority"),
3500 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3501 "collect data without buffering"),
3502 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3503 "collect raw sample records from all opened counters"),
3504 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3505 "system-wide collection from all CPUs"),
3506 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3507 "list of cpus to monitor"),
3508 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3509 OPT_STRING('o', "output", &record.data.path, "file",
3510 "output file name"),
3511 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3512 &record.opts.no_inherit_set,
3513 "child tasks do not inherit counters"),
3514 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3515 "synthesize non-sample events at the end of output"),
3516 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3517 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3518 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3519 "Fail if the specified frequency can't be used"),
3520 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3521 "profile at this frequency",
3522 record__parse_freq),
3523 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3524 "number of mmap data pages and AUX area tracing mmap pages",
3525 record__parse_mmap_pages),
3526 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3527 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3528 record__mmap_flush_parse),
3529 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3530 NULL, "enables call-graph recording" ,
3531 &record_callchain_opt),
3532 OPT_CALLBACK(0, "call-graph", &record.opts,
3533 "record_mode[,record_size]", record_callchain_help,
3534 &record_parse_callchain_opt),
3535 OPT_INCR('v', "verbose", &verbose,
3536 "be more verbose (show counter open errors, etc)"),
3537 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3538 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3539 "per thread counts"),
3540 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3541 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3542 "Record the sample physical addresses"),
3543 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3544 "Record the sampled data address data page size"),
3545 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3546 "Record the sampled code address (ip) page size"),
3547 OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3548 "Record the data source for memory operations"),
3549 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3550 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3551 "Record the sample identifier"),
3552 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3553 &record.opts.sample_time_set,
3554 "Record the sample timestamps"),
3555 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3556 "Record the sample period"),
3557 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3558 "don't sample"),
3559 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3560 &record.no_buildid_cache_set,
3561 "do not update the buildid cache"),
3562 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3563 &record.no_buildid_set,
3564 "do not collect buildids in perf.data"),
3565 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3566 "monitor event in cgroup name only",
3567 parse_cgroups),
3568 OPT_CALLBACK('D', "delay", &record, "ms",
3569 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3570 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3571 record__parse_event_enable_time),
3572 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3573 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3574
3575 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3576 "branch any", "sample any taken branches",
3577 parse_branch_stack),
3578
3579 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3580 "branch filter mask", "branch stack filter modes",
3581 parse_branch_stack),
3582 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3583 "sample by weight (on special events only)"),
3584 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3585 "sample transaction flags (special events only)"),
3586 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3587 "use per-thread mmaps"),
3588 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3589 "sample selected machine registers on interrupt,"
3590 " use '-I?' to list register names", parse_intr_regs),
3591 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3592 "sample selected machine registers in user space,"
3593 " use '--user-regs=?' to list register names", parse_user_regs),
3594 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3595 "Record running/enabled time of read (:S) events"),
3596 OPT_CALLBACK('k', "clockid", &record.opts,
3597 "clockid", "clockid to use for events, see clock_gettime()",
3598 parse_clockid),
3599 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3600 "opts", "AUX area tracing Snapshot Mode", ""),
3601 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3602 "opts", "sample AUX area", ""),
3603 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3604 "per thread proc mmap processing timeout in ms"),
3605 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3606 "Record namespaces events"),
3607 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3608 "Record cgroup events"),
3609 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3610 &record.opts.record_switch_events_set,
3611 "Record context switch events"),
3612 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3613 "Configure all used events to run in kernel space.",
3614 PARSE_OPT_EXCLUSIVE),
3615 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3616 "Configure all used events to run in user space.",
3617 PARSE_OPT_EXCLUSIVE),
3618 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3619 "collect kernel callchains"),
3620 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3621 "collect user callchains"),
3622 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3623 "file", "vmlinux pathname"),
3624 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3625 "Record build-id of all DSOs regardless of hits"),
3626 OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
3627 "Record build-id in mmap events and skip build-id processing."),
3628 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3629 "append timestamp to output filename"),
3630 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3631 "Record timestamp boundary (time of first/last samples)"),
3632 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3633 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3634 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3635 "signal"),
3636 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3637 &record.switch_output_event_set, "switch output event",
3638 "switch output event selector. use 'perf list' to list available events",
3639 parse_events_option_new_evlist),
3640 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3641 "Limit number of switch output generated files"),
3642 OPT_BOOLEAN(0, "dry-run", &dry_run,
3643 "Parse options then exit"),
3644 #ifdef HAVE_AIO_SUPPORT
3645 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3646 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3647 record__aio_parse),
3648 #endif
3649 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3650 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3651 record__parse_affinity),
3652 #ifdef HAVE_ZSTD_SUPPORT
3653 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3654 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3655 record__parse_comp_level),
3656 #endif
3657 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3658 "size", "Limit the maximum size of the output file", parse_output_max_size),
3659 OPT_UINTEGER(0, "num-thread-synthesize",
3660 &record.opts.nr_threads_synthesize,
3661 "number of threads to run for event synthesis"),
3662 #ifdef HAVE_LIBPFM
3663 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3664 "libpfm4 event selector. use 'perf list' to list available events",
3665 parse_libpfm_events_option),
3666 #endif
3667 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3668 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3669 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3670 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3671 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3672 parse_control_option),
3673 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3674 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3675 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3676 &record.debuginfod.set, "debuginfod urls",
3677 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3678 "system"),
3679 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3680 "write collected trace data into several data files using parallel threads",
3681 record__parse_threads),
3682 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3683 OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3684 "BPF filter action"),
3685 OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3686 "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3687 record__parse_off_cpu_thresh),
3688 OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap,
3689 &record.opts.record_data_mmap_set,
3690 "Record mmap events for non-executable mappings"),
3691 OPT_END()
3692 };
3693
3694 struct option *record_options = __record_options;
3695
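/*
 * Set a bit in @mask for every CPU present in @cpus, skipping the "any CPU"
 * placeholder. A dummy CPU map leaves the mask untouched; a CPU index beyond
 * the bitmap size is reported as -ENODEV.
 */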
3696 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3697 {
3698 struct perf_cpu cpu;
3699 int idx;
3700
3701 if (cpu_map__is_dummy(cpus))
3702 return 0;
3703
3704 perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3705 /* Return ENODEV if the input cpu is greater than max cpu */
3706 if ((unsigned long)cpu.cpu > mask->nbits)
3707 return -ENODEV;
3708 __set_bit(cpu.cpu, mask->bits);
3709 }
3710
3711 return 0;
3712 }
3713
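/*
 * Parse a CPU list specification into @mask via a temporary perf_cpu_map.
 * An input such as "0-3,7" is an illustrative example; the accepted syntax
 * is whatever perf_cpu_map__new() understands.
 */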
3714 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3715 {
3716 struct perf_cpu_map *cpus;
3717
3718 cpus = perf_cpu_map__new(mask_spec);
3719 if (!cpus)
3720 return -ENOMEM;
3721
3722 bitmap_zero(mask->bits, mask->nbits);
3723 if (record__mmap_cpu_mask_init(mask, cpus)) {
/* also drop the cpu map reference on the error path */
perf_cpu_map__put(cpus);
3724 return -ENODEV;
}
3725
3726 perf_cpu_map__put(cpus);
3727
3728 return 0;
3729 }
3730
3731 static void record__free_thread_masks(struct record *rec, int nr_threads)
3732 {
3733 int t;
3734
3735 if (rec->thread_masks)
3736 for (t = 0; t < nr_threads; t++)
3737 record__thread_mask_free(&rec->thread_masks[t]);
3738
3739 zfree(&rec->thread_masks);
3740 }
3741
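/*
 * Allocate @nr_threads thread_mask entries, each with maps and affinity
 * bitmaps of @nr_bits bits. On failure everything allocated so far is freed.
 */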
3742 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3743 {
3744 int t, ret;
3745
3746 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3747 if (!rec->thread_masks) {
3748 pr_err("Failed to allocate thread masks\n");
3749 return -ENOMEM;
3750 }
3751
3752 for (t = 0; t < nr_threads; t++) {
3753 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3754 if (ret) {
3755 pr_err("Failed to allocate thread masks[%d]\n", t);
3756 goto out_free;
3757 }
3758 }
3759
3760 return 0;
3761
3762 out_free:
3763 record__free_thread_masks(rec, nr_threads);
3764
3765 return ret;
3766 }
3767
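/*
 * THREAD_SPEC__CPU: one writer thread per CPU in the evlist CPU map, with
 * both the maps and the affinity bitmaps containing just that CPU.
 */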
3768 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3769 {
3770 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3771
3772 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3773 if (ret)
3774 return ret;
3775
3776 rec->nr_threads = nr_cpus;
3777 pr_debug("nr_threads: %d\n", rec->nr_threads);
3778
3779 for (t = 0; t < rec->nr_threads; t++) {
3780 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3781 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3782 if (verbose > 0) {
3783 pr_debug("thread_masks[%d]: ", t);
3784 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3785 pr_debug("thread_masks[%d]: ", t);
3786 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3787 }
3788 }
3789
3790 return 0;
3791 }
3792
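/*
 * Build per-thread maps/affinity masks from parallel arrays of CPU list
 * strings. Each spec is clipped to the CPUs actually present in @cpus, must
 * not end up empty, and must not overlap any previously accepted spec
 * (tracked in full_mask). Accepted masks are appended to rec->thread_masks.
 */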
3793 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3794 const char **maps_spec, const char **affinity_spec,
3795 u32 nr_spec)
3796 {
3797 u32 s;
3798 int ret = 0, t = 0;
3799 struct mmap_cpu_mask cpus_mask;
3800 struct thread_mask thread_mask, full_mask, *thread_masks;
3801
3802 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3803 if (ret) {
3804 pr_err("Failed to allocate CPUs mask\n");
3805 return ret;
3806 }
3807
3808 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3809 if (ret) {
3810 pr_err("Failed to init cpu mask\n");
3811 goto out_free_cpu_mask;
3812 }
3813
3814 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3815 if (ret) {
3816 pr_err("Failed to allocate full mask\n");
3817 goto out_free_cpu_mask;
3818 }
3819
3820 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3821 if (ret) {
3822 pr_err("Failed to allocate thread mask\n");
3823 goto out_free_full_and_cpu_masks;
3824 }
3825
3826 for (s = 0; s < nr_spec; s++) {
3827 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3828 if (ret) {
3829 pr_err("Failed to initialize maps thread mask\n");
3830 goto out_free;
3831 }
3832 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3833 if (ret) {
3834 pr_err("Failed to initialize affinity thread mask\n");
3835 goto out_free;
3836 }
3837
3838 /* ignore invalid CPUs but do not allow empty masks */
3839 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3840 cpus_mask.bits, thread_mask.maps.nbits)) {
3841 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3842 ret = -EINVAL;
3843 goto out_free;
3844 }
3845 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3846 cpus_mask.bits, thread_mask.affinity.nbits)) {
3847 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3848 ret = -EINVAL;
3849 goto out_free;
3850 }
3851
3852 /* do not allow intersection with other masks (full_mask) */
3853 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3854 thread_mask.maps.nbits)) {
3855 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3856 ret = -EINVAL;
3857 goto out_free;
3858 }
3859 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3860 thread_mask.affinity.nbits)) {
3861 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3862 ret = -EINVAL;
3863 goto out_free;
3864 }
3865
3866 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3867 thread_mask.maps.bits, full_mask.maps.nbits);
3868 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3869 thread_mask.affinity.bits, full_mask.affinity.nbits);
3870
3871 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3872 if (!thread_masks) {
3873 pr_err("Failed to reallocate thread masks\n");
3874 ret = -ENOMEM;
3875 goto out_free;
3876 }
3877 rec->thread_masks = thread_masks;
3878 rec->thread_masks[t] = thread_mask;
3879 if (verbose > 0) {
3880 pr_debug("thread_masks[%d]: ", t);
3881 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3882 pr_debug("thread_masks[%d]: ", t);
3883 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3884 }
3885 t++;
3886 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3887 if (ret) {
3888 pr_err("Failed to allocate thread mask\n");
3889 goto out_free_full_and_cpu_masks;
3890 }
3891 }
3892 rec->nr_threads = t;
3893 pr_debug("nr_threads: %d\n", rec->nr_threads);
3894 if (!rec->nr_threads)
3895 ret = -EINVAL;
3896
3897 out_free:
3898 record__thread_mask_free(&thread_mask);
3899 out_free_full_and_cpu_masks:
3900 record__thread_mask_free(&full_mask);
3901 out_free_cpu_mask:
3902 record__mmap_cpu_mask_free(&cpus_mask);
3903
3904 return ret;
3905 }
3906
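/* THREAD_SPEC__CORE: one spec per core, taken from the CPU topology. */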
3907 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3908 {
3909 int ret;
3910 struct cpu_topology *topo;
3911
3912 topo = cpu_topology__new();
3913 if (!topo) {
3914 pr_err("Failed to allocate CPU topology\n");
3915 return -ENOMEM;
3916 }
3917
3918 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3919 topo->core_cpus_list, topo->core_cpus_lists);
3920 cpu_topology__delete(topo);
3921
3922 return ret;
3923 }
3924
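/* THREAD_SPEC__PACKAGE: one spec per package, taken from the CPU topology. */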
3925 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3926 {
3927 int ret;
3928 struct cpu_topology *topo;
3929
3930 topo = cpu_topology__new();
3931 if (!topo) {
3932 pr_err("Failed to allocate CPU topology\n");
3933 return -ENOMEM;
3934 }
3935
3936 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3937 topo->package_cpus_list, topo->package_cpus_lists);
3938 cpu_topology__delete(topo);
3939
3940 return ret;
3941 }
3942
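/* THREAD_SPEC__NUMA: one spec per NUMA node, using each node's CPU list. */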
3943 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3944 {
3945 u32 s;
3946 int ret;
3947 const char **spec;
3948 struct numa_topology *topo;
3949
3950 topo = numa_topology__new();
3951 if (!topo) {
3952 pr_err("Failed to allocate NUMA topology\n");
3953 return -ENOMEM;
3954 }
3955
3956 spec = zalloc(topo->nr * sizeof(char *));
3957 if (!spec) {
3958 pr_err("Failed to allocate NUMA spec\n");
3959 ret = -ENOMEM;
3960 goto out_delete_topo;
3961 }
3962 for (s = 0; s < topo->nr; s++)
3963 spec[s] = topo->nodes[s].cpus;
3964
3965 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3966
3967 zfree(&spec);
3968
3969 out_delete_topo:
3970 numa_topology__delete(topo);
3971
3972 return ret;
3973 }
3974
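/*
 * THREAD_SPEC__USER: parse a user-provided list of "<maps>/<affinity>"
 * pairs separated by ':', e.g. "0-3/0-3:4-7/4-7" (example only, derived
 * from the strtok_r() separators used below).
 */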
3975 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3976 {
3977 int t, ret;
3978 u32 s, nr_spec = 0;
3979 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3980 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3981
3982 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3983 spec = strtok_r(user_spec, ":", &spec_ptr);
3984 if (spec == NULL)
3985 break;
3986 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3987 mask = strtok_r(spec, "/", &mask_ptr);
3988 if (mask == NULL)
3989 break;
3990 pr_debug2(" maps mask: %s\n", mask);
3991 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3992 if (!tmp_spec) {
3993 pr_err("Failed to reallocate maps spec\n");
3994 ret = -ENOMEM;
3995 goto out_free;
3996 }
3997 maps_spec = tmp_spec;
3998 maps_spec[nr_spec] = dup_mask = strdup(mask);
3999 if (!maps_spec[nr_spec]) {
4000 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
4001 ret = -ENOMEM;
4002 goto out_free;
4003 }
4004 mask = strtok_r(NULL, "/", &mask_ptr);
4005 if (mask == NULL) {
4006 pr_err("Invalid thread maps or affinity specs\n");
4007 ret = -EINVAL;
4008 goto out_free;
4009 }
4010 pr_debug2(" affinity mask: %s\n", mask);
4011 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
4012 if (!tmp_spec) {
4013 pr_err("Failed to reallocate affinity spec\n");
4014 ret = -ENOMEM;
4015 goto out_free;
4016 }
4017 affinity_spec = tmp_spec;
4018 affinity_spec[nr_spec] = strdup(mask);
4019 if (!affinity_spec[nr_spec]) {
4020 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
4021 ret = -ENOMEM;
4022 goto out_free;
4023 }
4024 dup_mask = NULL;
4025 nr_spec++;
4026 }
4027
4028 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
4029 (const char **)affinity_spec, nr_spec);
4030
4031 out_free:
4032 free(dup_mask);
4033 for (s = 0; s < nr_spec; s++) {
4034 if (maps_spec)
4035 free(maps_spec[s]);
4036 if (affinity_spec)
4037 free(affinity_spec[s]);
4038 }
4039 free(affinity_spec);
4040 free(maps_spec);
4041
4042 return ret;
4043 }
4044
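/*
 * Default (non-parallel) mode: a single writer thread whose maps mask covers
 * every CPU in @cpus; the affinity mask is left empty.
 */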
4045 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4046 {
4047 int ret;
4048
4049 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4050 if (ret)
4051 return ret;
4052
4053 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4054 return -ENODEV;
4055
4056 rec->nr_threads = 1;
4057
4058 return 0;
4059 }
4060
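/*
 * Pick the thread mask layout according to rec->opts.threads_spec. Parallel
 * streaming is incompatible with --per-thread mmaps.
 */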
4061 static int record__init_thread_masks(struct record *rec)
4062 {
4063 int ret = 0;
4064 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4065
4066 if (!record__threads_enabled(rec))
4067 return record__init_thread_default_masks(rec, cpus);
4068
4069 if (evlist__per_thread(rec->evlist)) {
4070 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4071 return -EINVAL;
4072 }
4073
4074 switch (rec->opts.threads_spec) {
4075 case THREAD_SPEC__CPU:
4076 ret = record__init_thread_cpu_masks(rec, cpus);
4077 break;
4078 case THREAD_SPEC__CORE:
4079 ret = record__init_thread_core_masks(rec, cpus);
4080 break;
4081 case THREAD_SPEC__PACKAGE:
4082 ret = record__init_thread_package_masks(rec, cpus);
4083 break;
4084 case THREAD_SPEC__NUMA:
4085 ret = record__init_thread_numa_masks(rec, cpus);
4086 break;
4087 case THREAD_SPEC__USER:
4088 ret = record__init_thread_user_masks(rec, cpus);
4089 break;
4090 default:
4091 break;
4092 }
4093
4094 return ret;
4095 }
4096
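/*
 * Entry point for 'perf record': parse options, validate the target and
 * option combinations, set up build-id handling, auxtrace, thread masks and
 * the evlist, then hand over to __cmd_record() for the actual session.
 */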
4097 int cmd_record(int argc, const char **argv)
4098 {
4099 int err;
4100 struct record *rec = &record;
4101 char errbuf[BUFSIZ];
4102
4103 setlocale(LC_ALL, "");
4104
4105 #ifndef HAVE_BPF_SKEL
4106 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4107 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4108 # undef set_nobuild
4109 #endif
4110
4111 /* Disable eager loading of kernel symbols, which adds overhead to perf record. */
4112 symbol_conf.lazy_load_kernel_maps = true;
4113 rec->opts.affinity = PERF_AFFINITY_SYS;
4114
4115 rec->evlist = evlist__new();
4116 if (rec->evlist == NULL)
4117 return -ENOMEM;
4118
4119 err = perf_config(perf_record_config, rec);
4120 if (err)
4121 return err;
4122
4123 argc = parse_options(argc, argv, record_options, record_usage,
4124 PARSE_OPT_STOP_AT_NON_OPTION);
4125 if (quiet)
4126 perf_quiet_option();
4127
4128 err = symbol__validate_sym_arguments();
4129 if (err)
4130 return err;
4131
4132 perf_debuginfod_setup(&record.debuginfod);
4133
4134 /* Make system wide (-a) the default target. */
4135 if (!argc && target__none(&rec->opts.target))
4136 rec->opts.target.system_wide = true;
4137
4138 if (nr_cgroups && !rec->opts.target.system_wide) {
4139 usage_with_options_msg(record_usage, record_options,
4140 "cgroup monitoring only available in system-wide mode");
4141
4142 }
4143
4144 if (record.latency) {
4145 /*
4146 * There is no fundamental reason why latency profiling
4147 * can't work for system-wide mode, but exact semantics
4148 * and details are to be defined.
4149 * See the following thread for details:
4150 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4151 */
4152 if (record.opts.target.system_wide) {
4153 pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4154 err = -EINVAL;
4155 goto out_opts;
4156 }
4157 record.opts.record_switch_events = true;
4158 }
4159
4160 if (rec->buildid_mmap && !perf_can_record_build_id()) {
4161 pr_warning("Missing support for build id in kernel mmap events.\n"
4162 "Disable this warning with --no-buildid-mmap\n");
4163 rec->buildid_mmap = false;
4164 }
4165
4166 if (rec->buildid_mmap) {
4167 /* Enable perf_event_attr::build_id bit. */
4168 rec->opts.build_id = true;
4169 /* Disable build-ID table in the header. */
4170 rec->no_buildid = true;
4171 } else {
4172 pr_debug("Disabling build id in synthesized mmap2 events.\n");
4173 symbol_conf.no_buildid_mmap2 = true;
4174 }
4175
4176 if (rec->no_buildid_set && rec->no_buildid) {
4177 /* -B implies -N for historical reasons. */
4178 rec->no_buildid_cache = true;
4179 }
4180
4181 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4182 pr_err("Kernel has no cgroup sampling support.\n");
4183 err = -EINVAL;
4184 goto out_opts;
4185 }
4186
4187 if (rec->opts.kcore)
4188 rec->opts.text_poke = true;
4189
4190 if (rec->opts.kcore || record__threads_enabled(rec))
4191 rec->data.is_dir = true;
4192
4193 if (record__threads_enabled(rec)) {
4194 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4195 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
err = -EINVAL;
4196 goto out_opts;
4197 }
4198 if (record__aio_enabled(rec)) {
4199 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
err = -EINVAL;
4200 goto out_opts;
4201 }
4202 }
4203
4204 if (rec->opts.comp_level != 0) {
4205 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4206 rec->no_buildid = true;
4207 }
4208
4209 if (rec->opts.record_switch_events &&
4210 !perf_can_record_switch_events()) {
4211 ui__error("kernel does not support recording context switch events\n");
4212 parse_options_usage(record_usage, record_options, "switch-events", 0);
4213 err = -EINVAL;
4214 goto out_opts;
4215 }
4216
4217 if (switch_output_setup(rec)) {
4218 parse_options_usage(record_usage, record_options, "switch-output", 0);
4219 err = -EINVAL;
4220 goto out_opts;
4221 }
4222
4223 if (rec->switch_output.time) {
4224 signal(SIGALRM, alarm_sig_handler);
4225 alarm(rec->switch_output.time);
4226 }
4227
4228 if (rec->switch_output.num_files) {
4229 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4230 sizeof(char *));
4231 if (!rec->switch_output.filenames) {
4232 err = -EINVAL;
4233 goto out_opts;
4234 }
4235 }
4236
4237 if (rec->timestamp_filename && record__threads_enabled(rec)) {
4238 rec->timestamp_filename = false;
4239 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4240 }
4241
4242 if (rec->filter_action) {
4243 if (!strcmp(rec->filter_action, "pin"))
4244 err = perf_bpf_filter__pin();
4245 else if (!strcmp(rec->filter_action, "unpin"))
4246 err = perf_bpf_filter__unpin();
4247 else {
4248 pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4249 err = -EINVAL;
4250 }
4251 goto out_opts;
4252 }
4253
4254 /* For backward compatibility, -d implies --sample-mem-info and --data-mmap */
4255 if (rec->opts.sample_address) {
4256 rec->opts.sample_data_src = true;
4257 if (!rec->opts.record_data_mmap_set)
4258 rec->opts.record_data_mmap = true;
4259 }
4260
4261 /*
4262 * Allow aliases to facilitate the lookup of symbols for address
4263 * filters. Refer to auxtrace_parse_filters().
4264 */
4265 symbol_conf.allow_aliases = true;
4266
4267 symbol__init(NULL);
4268
4269 err = record__auxtrace_init(rec);
4270 if (err)
4271 goto out;
4272
4273 if (dry_run)
4274 goto out;
4275
4276 err = -ENOMEM;
4277
4278 if (rec->no_buildid_cache) {
4279 disable_buildid_cache();
4280 } else if (rec->switch_output.enabled) {
4281 /*
4282 * In 'perf record --switch-output', disable buildid
4283 * generation by default to reduce data file switching
4284 * overhead. Still generate buildids if they are required
4285 * explicitly using
4286 *
4287 * perf record --switch-output --no-no-buildid \
4288 * --no-no-buildid-cache
4289 *
4290 * The following code is equivalent to:
4291 *
4292 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4293 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4294 * disable_buildid_cache();
4295 */
4296 bool disable = true;
4297
4298 if (rec->no_buildid_set && !rec->no_buildid)
4299 disable = false;
4300 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4301 disable = false;
4302 if (disable) {
4303 rec->no_buildid = true;
4304 rec->no_buildid_cache = true;
4305 disable_buildid_cache();
4306 }
4307 }
4308
4309 if (record.opts.overwrite)
4310 record.opts.tail_synthesize = true;
4311
4312 if (rec->evlist->core.nr_entries == 0) {
4313 struct evlist *def_evlist = evlist__new_default();
4314
4315 if (!def_evlist)
4316 goto out;
4317
4318 evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries);
4319 evlist__delete(def_evlist);
4320 }
4321
4322 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4323 rec->opts.no_inherit = true;
4324
4325 err = target__validate(&rec->opts.target);
4326 if (err) {
4327 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4328 ui__warning("%s\n", errbuf);
4329 }
4330
4331 if (rec->uid_str) {
4332 uid_t uid = parse_uid(rec->uid_str);
4333
4334 if (uid == UINT_MAX) {
4335 ui__error("Invalid User: %s", rec->uid_str);
4336 err = -EINVAL;
4337 goto out;
4338 }
4339 err = parse_uid_filter(rec->evlist, uid);
4340 if (err)
4341 goto out;
4342
4343 /* User ID filtering implies system wide. */
4344 rec->opts.target.system_wide = true;
4345 }
4346
4347 /* Enable ignoring missing threads when -p option is defined. */
4348 rec->opts.ignore_missing_thread = rec->opts.target.pid;
4349
4350 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4351
4352 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4353 arch__add_leaf_frame_record_opts(&rec->opts);
4354
4355 err = -ENOMEM;
4356 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4357 if (rec->opts.target.pid != NULL) {
4358 pr_err("Couldn't create thread/CPU maps: %s\n",
4359 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4360 goto out;
4361 }
4362 else
4363 usage_with_options(record_usage, record_options);
4364 }
4365
4366 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4367 if (err)
4368 goto out;
4369
4370 /*
4371 * Take all buildids when the file contains AUX area
4372 * tracing data, because we do not decode the trace;
4373 * decoding would take too long.
4374 */
4375 if (rec->opts.full_auxtrace)
4376 rec->buildid_all = true;
4377
4378 if (rec->opts.text_poke) {
4379 err = record__config_text_poke(rec->evlist);
4380 if (err) {
4381 pr_err("record__config_text_poke failed, error %d\n", err);
4382 goto out;
4383 }
4384 }
4385
4386 if (rec->off_cpu) {
4387 err = record__config_off_cpu(rec);
4388 if (err) {
4389 pr_err("record__config_off_cpu failed, error %d\n", err);
4390 goto out;
4391 }
4392 }
4393
4394 if (record_opts__config(&rec->opts)) {
4395 err = -EINVAL;
4396 goto out;
4397 }
4398
4399 err = record__config_tracking_events(rec);
4400 if (err) {
4401 pr_err("record__config_tracking_events failed, error %d\n", err);
4402 goto out;
4403 }
4404
4405 err = record__init_thread_masks(rec);
4406 if (err) {
4407 pr_err("Failed to initialize parallel data streaming masks\n");
4408 goto out;
4409 }
4410
4411 if (rec->opts.nr_cblocks > nr_cblocks_max)
4412 rec->opts.nr_cblocks = nr_cblocks_max;
4413 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4414
4415 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4416 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4417
4418 if (rec->opts.comp_level > comp_level_max)
4419 rec->opts.comp_level = comp_level_max;
4420 pr_debug("comp level: %d\n", rec->opts.comp_level);
4421
4422 err = __cmd_record(&record, argc, argv);
4423 out:
4424 record__free_thread_masks(rec, rec->nr_threads);
4425 rec->nr_threads = 0;
4426 symbol__exit();
4427 auxtrace_record__free(rec->itr);
4428 out_opts:
4429 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4430 evlist__delete(rec->evlist);
4431 return err;
4432 }
4433
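/*
 * Signal handler (SIGUSR2, per the --switch-output help above): trigger an
 * AUX area snapshot and, if configured, an output file switch.
 */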
4434 static void snapshot_sig_handler(int sig __maybe_unused)
4435 {
4436 struct record *rec = &record;
4437
4438 hit_auxtrace_snapshot_trigger(rec);
4439
4440 if (switch_output_signal(rec))
4441 trigger_hit(&switch_output_trigger);
4442 }
4443
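/* SIGALRM from the --switch-output time threshold: request an output switch. */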
4444 static void alarm_sig_handler(int sig __maybe_unused)
4445 {
4446 struct record *rec = &record;
4447
4448 if (switch_output_time(rec))
4449 trigger_hit(&switch_output_trigger);
4450 }
4451