1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85
86 struct switch_output {
87 bool enabled;
88 bool signal;
89 unsigned long size;
90 unsigned long time;
91 const char *str;
92 bool set;
93 char **filenames;
94 int num_files;
95 int cur_file;
96 };
97
98 struct thread_mask {
99 struct mmap_cpu_mask maps;
100 struct mmap_cpu_mask affinity;
101 };
102
103 struct record_thread {
104 pid_t tid;
105 struct thread_mask *mask;
106 struct {
107 int msg[2];
108 int ack[2];
109 } pipes;
110 struct fdarray pollfd;
111 int ctlfd_pos;
112 int nr_mmaps;
113 struct mmap **maps;
114 struct mmap **overwrite_maps;
115 struct record *rec;
116 unsigned long long samples;
117 unsigned long waking;
118 u64 bytes_written;
119 u64 bytes_transferred;
120 u64 bytes_compressed;
121 };
122
123 static __thread struct record_thread *thread;
124
125 enum thread_msg {
126 THREAD_MSG__UNDEFINED = 0,
127 THREAD_MSG__READY,
128 THREAD_MSG__MAX,
129 };
130
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 "UNDEFINED", "READY"
133 };
134
135 enum thread_spec {
136 THREAD_SPEC__UNDEFINED = 0,
137 THREAD_SPEC__CPU,
138 THREAD_SPEC__CORE,
139 THREAD_SPEC__PACKAGE,
140 THREAD_SPEC__NUMA,
141 THREAD_SPEC__USER,
142 THREAD_SPEC__MAX,
143 };
144
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 "undefined", "cpu", "core", "package", "numa", "user"
147 };
148
149 struct pollfd_index_map {
150 int evlist_pollfd_index;
151 int thread_pollfd_index;
152 };
153
154 struct record {
155 struct perf_tool tool;
156 struct record_opts opts;
157 u64 bytes_written;
158 u64 thread_bytes_written;
159 struct perf_data data;
160 struct auxtrace_record *itr;
161 struct evlist *evlist;
162 struct perf_session *session;
163 struct evlist *sb_evlist;
164 pthread_t thread_id;
165 int realtime_prio;
166 bool latency;
167 bool switch_output_event_set;
168 bool no_buildid;
169 bool no_buildid_set;
170 bool no_buildid_cache;
171 bool no_buildid_cache_set;
172 bool buildid_all;
173 bool buildid_mmap;
174 bool buildid_mmap_set;
175 bool timestamp_filename;
176 bool timestamp_boundary;
177 bool off_cpu;
178 const char *filter_action;
179 const char *uid_str;
180 struct switch_output switch_output;
181 unsigned long long samples;
182 unsigned long output_max_size; /* = 0: unlimited */
183 struct perf_debuginfod debuginfod;
184 int nr_threads;
185 struct thread_mask *thread_masks;
186 struct record_thread *thread_data;
187 struct pollfd_index_map *index_map;
188 size_t index_map_sz;
189 size_t index_map_cnt;
190 };
191
192 static volatile int done;
193
194 static volatile int auxtrace_record__snapshot_started;
195 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
196 static DEFINE_TRIGGER(switch_output_trigger);
197
198 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
199 "SYS", "NODE", "CPU"
200 };
201
202 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
203 struct perf_sample *sample, struct machine *machine);
204 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
205 struct perf_sample *sample, struct machine *machine);
206 static int process_timestamp_boundary(const struct perf_tool *tool,
207 union perf_event *event,
208 struct perf_sample *sample,
209 struct machine *machine);
210
211 #ifndef HAVE_GETTID
gettid(void)212 static inline pid_t gettid(void)
213 {
214 return (pid_t)syscall(__NR_gettid);
215 }
216 #endif
217
record__threads_enabled(struct record * rec)218 static int record__threads_enabled(struct record *rec)
219 {
220 return rec->opts.threads_spec;
221 }
222
switch_output_signal(struct record * rec)223 static bool switch_output_signal(struct record *rec)
224 {
225 return rec->switch_output.signal &&
226 trigger_is_ready(&switch_output_trigger);
227 }
228
switch_output_size(struct record * rec)229 static bool switch_output_size(struct record *rec)
230 {
231 return rec->switch_output.size &&
232 trigger_is_ready(&switch_output_trigger) &&
233 (rec->bytes_written >= rec->switch_output.size);
234 }
235
switch_output_time(struct record * rec)236 static bool switch_output_time(struct record *rec)
237 {
238 return rec->switch_output.time &&
239 trigger_is_ready(&switch_output_trigger);
240 }
241
record__bytes_written(struct record * rec)242 static u64 record__bytes_written(struct record *rec)
243 {
244 return rec->bytes_written + rec->thread_bytes_written;
245 }
246
record__output_max_size_exceeded(struct record * rec)247 static bool record__output_max_size_exceeded(struct record *rec)
248 {
249 return rec->output_max_size &&
250 (record__bytes_written(rec) >= rec->output_max_size);
251 }
252
record__write(struct record * rec,struct mmap * map __maybe_unused,void * bf,size_t size)253 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
254 void *bf, size_t size)
255 {
256 struct perf_data_file *file = &rec->session->data->file;
257
258 if (map && map->file)
259 file = map->file;
260
261 if (perf_data_file__write(file, bf, size) < 0) {
262 pr_err("failed to write perf data, error: %m\n");
263 return -1;
264 }
265
266 if (map && map->file) {
267 thread->bytes_written += size;
268 rec->thread_bytes_written += size;
269 } else {
270 rec->bytes_written += size;
271 }
272
273 if (record__output_max_size_exceeded(rec) && !done) {
274 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
275 " stopping session ]\n",
276 record__bytes_written(rec) >> 10);
277 done = 1;
278 }
279
280 if (switch_output_size(rec))
281 trigger_hit(&switch_output_trigger);
282
283 return 0;
284 }
285
286 static int record__aio_enabled(struct record *rec);
287 static int record__comp_enabled(struct record *rec);
288 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
289 void *dst, size_t dst_size, void *src, size_t src_size);
290
291 #ifdef HAVE_AIO_SUPPORT
record__aio_write(struct aiocb * cblock,int trace_fd,void * buf,size_t size,off_t off)292 static int record__aio_write(struct aiocb *cblock, int trace_fd,
293 void *buf, size_t size, off_t off)
294 {
295 int rc;
296
297 cblock->aio_fildes = trace_fd;
298 cblock->aio_buf = buf;
299 cblock->aio_nbytes = size;
300 cblock->aio_offset = off;
301 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
302
303 do {
304 rc = aio_write(cblock);
305 if (rc == 0) {
306 break;
307 } else if (errno != EAGAIN) {
308 cblock->aio_fildes = -1;
309 pr_err("failed to queue perf data, error: %m\n");
310 break;
311 }
312 } while (1);
313
314 return rc;
315 }
316
record__aio_complete(struct mmap * md,struct aiocb * cblock)317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
318 {
319 void *rem_buf;
320 off_t rem_off;
321 size_t rem_size;
322 int rc, aio_errno;
323 ssize_t aio_ret, written;
324
325 aio_errno = aio_error(cblock);
326 if (aio_errno == EINPROGRESS)
327 return 0;
328
329 written = aio_ret = aio_return(cblock);
330 if (aio_ret < 0) {
331 if (aio_errno != EINTR)
332 pr_err("failed to write perf data, error: %m\n");
333 written = 0;
334 }
335
336 rem_size = cblock->aio_nbytes - written;
337
338 if (rem_size == 0) {
339 cblock->aio_fildes = -1;
340 /*
341 * md->refcount is incremented in record__aio_pushfn() for
342 * every aio write request started in record__aio_push() so
343 * decrement it because the request is now complete.
344 */
345 perf_mmap__put(&md->core);
346 rc = 1;
347 } else {
348 /*
349 * aio write request may require restart with the
350 * remainder if the kernel didn't write whole
351 * chunk at once.
352 */
353 rem_off = cblock->aio_offset + written;
354 rem_buf = (void *)(cblock->aio_buf + written);
355 record__aio_write(cblock, cblock->aio_fildes,
356 rem_buf, rem_size, rem_off);
357 rc = 0;
358 }
359
360 return rc;
361 }
362
record__aio_sync(struct mmap * md,bool sync_all)363 static int record__aio_sync(struct mmap *md, bool sync_all)
364 {
365 struct aiocb **aiocb = md->aio.aiocb;
366 struct aiocb *cblocks = md->aio.cblocks;
367 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
368 int i, do_suspend;
369
370 do {
371 do_suspend = 0;
372 for (i = 0; i < md->aio.nr_cblocks; ++i) {
373 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
374 if (sync_all)
375 aiocb[i] = NULL;
376 else
377 return i;
378 } else {
379 /*
380 * Started aio write is not complete yet
381 * so it has to be waited before the
382 * next allocation.
383 */
384 aiocb[i] = &cblocks[i];
385 do_suspend = 1;
386 }
387 }
388 if (!do_suspend)
389 return -1;
390
391 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
392 if (!(errno == EAGAIN || errno == EINTR))
393 pr_err("failed to sync perf data, error: %m\n");
394 }
395 } while (1);
396 }
397
398 struct record_aio {
399 struct record *rec;
400 void *data;
401 size_t size;
402 };
403
record__aio_pushfn(struct mmap * map,void * to,void * buf,size_t size)404 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
405 {
406 struct record_aio *aio = to;
407
408 /*
409 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
410 * to release space in the kernel buffer as fast as possible, calling
411 * perf_mmap__consume() from perf_mmap__push() function.
412 *
413 * That lets the kernel to proceed with storing more profiling data into
414 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
415 *
416 * Coping can be done in two steps in case the chunk of profiling data
417 * crosses the upper bound of the kernel buffer. In this case we first move
418 * part of data from map->start till the upper bound and then the remainder
419 * from the beginning of the kernel buffer till the end of the data chunk.
420 */
421
422 if (record__comp_enabled(aio->rec)) {
423 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
424 mmap__mmap_len(map) - aio->size,
425 buf, size);
426 if (compressed < 0)
427 return (int)compressed;
428
429 size = compressed;
430 } else {
431 memcpy(aio->data + aio->size, buf, size);
432 }
433
434 if (!aio->size) {
435 /*
436 * Increment map->refcount to guard map->aio.data[] buffer
437 * from premature deallocation because map object can be
438 * released earlier than aio write request started on
439 * map->aio.data[] buffer is complete.
440 *
441 * perf_mmap__put() is done at record__aio_complete()
442 * after started aio request completion or at record__aio_push()
443 * if the request failed to start.
444 */
445 perf_mmap__get(&map->core);
446 }
447
448 aio->size += size;
449
450 return size;
451 }
452
record__aio_push(struct record * rec,struct mmap * map,off_t * off)453 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
454 {
455 int ret, idx;
456 int trace_fd = rec->session->data->file.fd;
457 struct record_aio aio = { .rec = rec, .size = 0 };
458
459 /*
460 * Call record__aio_sync() to wait till map->aio.data[] buffer
461 * becomes available after previous aio write operation.
462 */
463
464 idx = record__aio_sync(map, false);
465 aio.data = map->aio.data[idx];
466 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
467 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
468 return ret;
469
470 rec->samples++;
471 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
472 if (!ret) {
473 *off += aio.size;
474 rec->bytes_written += aio.size;
475 if (switch_output_size(rec))
476 trigger_hit(&switch_output_trigger);
477 } else {
478 /*
479 * Decrement map->refcount incremented in record__aio_pushfn()
480 * back if record__aio_write() operation failed to start, otherwise
481 * map->refcount is decremented in record__aio_complete() after
482 * aio write operation finishes successfully.
483 */
484 perf_mmap__put(&map->core);
485 }
486
487 return ret;
488 }
489
record__aio_get_pos(int trace_fd)490 static off_t record__aio_get_pos(int trace_fd)
491 {
492 return lseek(trace_fd, 0, SEEK_CUR);
493 }
494
record__aio_set_pos(int trace_fd,off_t pos)495 static void record__aio_set_pos(int trace_fd, off_t pos)
496 {
497 lseek(trace_fd, pos, SEEK_SET);
498 }
499
record__aio_mmap_read_sync(struct record * rec)500 static void record__aio_mmap_read_sync(struct record *rec)
501 {
502 int i;
503 struct evlist *evlist = rec->evlist;
504 struct mmap *maps = evlist->mmap;
505
506 if (!record__aio_enabled(rec))
507 return;
508
509 for (i = 0; i < evlist->core.nr_mmaps; i++) {
510 struct mmap *map = &maps[i];
511
512 if (map->core.base)
513 record__aio_sync(map, true);
514 }
515 }
516
517 static int nr_cblocks_default = 1;
518 static int nr_cblocks_max = 4;
519
record__aio_parse(const struct option * opt,const char * str,int unset)520 static int record__aio_parse(const struct option *opt,
521 const char *str,
522 int unset)
523 {
524 struct record_opts *opts = (struct record_opts *)opt->value;
525
526 if (unset) {
527 opts->nr_cblocks = 0;
528 } else {
529 if (str)
530 opts->nr_cblocks = strtol(str, NULL, 0);
531 if (!opts->nr_cblocks)
532 opts->nr_cblocks = nr_cblocks_default;
533 }
534
535 return 0;
536 }
537 #else /* HAVE_AIO_SUPPORT */
538 static int nr_cblocks_max = 0;
539
record__aio_push(struct record * rec __maybe_unused,struct mmap * map __maybe_unused,off_t * off __maybe_unused)540 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
541 off_t *off __maybe_unused)
542 {
543 return -1;
544 }
545
record__aio_get_pos(int trace_fd __maybe_unused)546 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
547 {
548 return -1;
549 }
550
record__aio_set_pos(int trace_fd __maybe_unused,off_t pos __maybe_unused)551 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
552 {
553 }
554
record__aio_mmap_read_sync(struct record * rec __maybe_unused)555 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
556 {
557 }
558 #endif
559
record__aio_enabled(struct record * rec)560 static int record__aio_enabled(struct record *rec)
561 {
562 return rec->opts.nr_cblocks > 0;
563 }
564
565 #define MMAP_FLUSH_DEFAULT 1
record__mmap_flush_parse(const struct option * opt,const char * str,int unset)566 static int record__mmap_flush_parse(const struct option *opt,
567 const char *str,
568 int unset)
569 {
570 int flush_max;
571 struct record_opts *opts = (struct record_opts *)opt->value;
572 static struct parse_tag tags[] = {
573 { .tag = 'B', .mult = 1 },
574 { .tag = 'K', .mult = 1 << 10 },
575 { .tag = 'M', .mult = 1 << 20 },
576 { .tag = 'G', .mult = 1 << 30 },
577 { .tag = 0 },
578 };
579
580 if (unset)
581 return 0;
582
583 if (str) {
584 opts->mmap_flush = parse_tag_value(str, tags);
585 if (opts->mmap_flush == (int)-1)
586 opts->mmap_flush = strtol(str, NULL, 0);
587 }
588
589 if (!opts->mmap_flush)
590 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
591
592 flush_max = evlist__mmap_size(opts->mmap_pages);
593 flush_max /= 4;
594 if (opts->mmap_flush > flush_max)
595 opts->mmap_flush = flush_max;
596
597 return 0;
598 }
599
600 #ifdef HAVE_ZSTD_SUPPORT
601 static unsigned int comp_level_default = 1;
602
record__parse_comp_level(const struct option * opt,const char * str,int unset)603 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
604 {
605 struct record_opts *opts = opt->value;
606
607 if (unset) {
608 opts->comp_level = 0;
609 } else {
610 if (str)
611 opts->comp_level = strtol(str, NULL, 0);
612 if (!opts->comp_level)
613 opts->comp_level = comp_level_default;
614 }
615
616 return 0;
617 }
618 #endif
619 static unsigned int comp_level_max = 22;
620
record__comp_enabled(struct record * rec)621 static int record__comp_enabled(struct record *rec)
622 {
623 return rec->opts.comp_level > 0;
624 }
625
process_synthesized_event(const struct perf_tool * tool,union perf_event * event,struct perf_sample * sample __maybe_unused,struct machine * machine __maybe_unused)626 static int process_synthesized_event(const struct perf_tool *tool,
627 union perf_event *event,
628 struct perf_sample *sample __maybe_unused,
629 struct machine *machine __maybe_unused)
630 {
631 struct record *rec = container_of(tool, struct record, tool);
632 return record__write(rec, NULL, event, event->header.size);
633 }
634
635 static struct mutex synth_lock;
636
process_locked_synthesized_event(const struct perf_tool * tool,union perf_event * event,struct perf_sample * sample __maybe_unused,struct machine * machine __maybe_unused)637 static int process_locked_synthesized_event(const struct perf_tool *tool,
638 union perf_event *event,
639 struct perf_sample *sample __maybe_unused,
640 struct machine *machine __maybe_unused)
641 {
642 int ret;
643
644 mutex_lock(&synth_lock);
645 ret = process_synthesized_event(tool, event, sample, machine);
646 mutex_unlock(&synth_lock);
647 return ret;
648 }
649
record__pushfn(struct mmap * map,void * to,void * bf,size_t size)650 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
651 {
652 struct record *rec = to;
653
654 if (record__comp_enabled(rec)) {
655 struct perf_record_compressed2 *event = map->data;
656 size_t padding = 0;
657 u8 pad[8] = {0};
658 ssize_t compressed = zstd_compress(rec->session, map, map->data,
659 mmap__mmap_len(map), bf, size);
660
661 if (compressed < 0)
662 return (int)compressed;
663
664 bf = event;
665 thread->samples++;
666
667 /*
668 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan
669 * error. We make it aligned here.
670 */
671 event->data_size = compressed - sizeof(struct perf_record_compressed2);
672 event->header.size = PERF_ALIGN(compressed, sizeof(u64));
673 padding = event->header.size - compressed;
674 return record__write(rec, map, bf, compressed) ||
675 record__write(rec, map, &pad, padding);
676 }
677
678 thread->samples++;
679 return record__write(rec, map, bf, size);
680 }
681
682 static volatile sig_atomic_t signr = -1;
683 static volatile sig_atomic_t child_finished;
684 #ifdef HAVE_EVENTFD_SUPPORT
685 static volatile sig_atomic_t done_fd = -1;
686 #endif
687
sig_handler(int sig)688 static void sig_handler(int sig)
689 {
690 if (sig == SIGCHLD)
691 child_finished = 1;
692 else
693 signr = sig;
694
695 done = 1;
696 #ifdef HAVE_EVENTFD_SUPPORT
697 if (done_fd >= 0) {
698 u64 tmp = 1;
699 int orig_errno = errno;
700
701 /*
702 * It is possible for this signal handler to run after done is
703 * checked in the main loop, but before the perf counter fds are
704 * polled. If this happens, the poll() will continue to wait
705 * even though done is set, and will only break out if either
706 * another signal is received, or the counters are ready for
707 * read. To ensure the poll() doesn't sleep when done is set,
708 * use an eventfd (done_fd) to wake up the poll().
709 */
710 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
711 pr_err("failed to signal wakeup fd, error: %m\n");
712
713 errno = orig_errno;
714 }
715 #endif // HAVE_EVENTFD_SUPPORT
716 }
717
sigsegv_handler(int sig)718 static void sigsegv_handler(int sig)
719 {
720 perf_hooks__recover();
721 sighandler_dump_stack(sig);
722 }
723
record__sig_exit(void)724 static void record__sig_exit(void)
725 {
726 if (signr == -1)
727 return;
728
729 signal(signr, SIG_DFL);
730 raise(signr);
731 }
732
733 #ifdef HAVE_AUXTRACE_SUPPORT
734
record__process_auxtrace(const struct perf_tool * tool,struct mmap * map,union perf_event * event,void * data1,size_t len1,void * data2,size_t len2)735 static int record__process_auxtrace(const struct perf_tool *tool,
736 struct mmap *map,
737 union perf_event *event, void *data1,
738 size_t len1, void *data2, size_t len2)
739 {
740 struct record *rec = container_of(tool, struct record, tool);
741 struct perf_data *data = &rec->data;
742 size_t padding;
743 u8 pad[8] = {0};
744
745 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
746 off_t file_offset;
747 int fd = perf_data__fd(data);
748 int err;
749
750 file_offset = lseek(fd, 0, SEEK_CUR);
751 if (file_offset == -1)
752 return -1;
753 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
754 event, file_offset);
755 if (err)
756 return err;
757 }
758
759 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
760 padding = (len1 + len2) & 7;
761 if (padding)
762 padding = 8 - padding;
763
764 record__write(rec, map, event, event->header.size);
765 record__write(rec, map, data1, len1);
766 if (len2)
767 record__write(rec, map, data2, len2);
768 record__write(rec, map, &pad, padding);
769
770 return 0;
771 }
772
record__auxtrace_mmap_read(struct record * rec,struct mmap * map)773 static int record__auxtrace_mmap_read(struct record *rec,
774 struct mmap *map)
775 {
776 int ret;
777
778 ret = auxtrace_mmap__read(map, rec->itr,
779 perf_session__env(rec->session),
780 &rec->tool,
781 record__process_auxtrace);
782 if (ret < 0)
783 return ret;
784
785 if (ret)
786 rec->samples++;
787
788 return 0;
789 }
790
record__auxtrace_mmap_read_snapshot(struct record * rec,struct mmap * map)791 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
792 struct mmap *map)
793 {
794 int ret;
795
796 ret = auxtrace_mmap__read_snapshot(map, rec->itr,
797 perf_session__env(rec->session),
798 &rec->tool,
799 record__process_auxtrace,
800 rec->opts.auxtrace_snapshot_size);
801 if (ret < 0)
802 return ret;
803
804 if (ret)
805 rec->samples++;
806
807 return 0;
808 }
809
record__auxtrace_read_snapshot_all(struct record * rec)810 static int record__auxtrace_read_snapshot_all(struct record *rec)
811 {
812 int i;
813 int rc = 0;
814
815 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
816 struct mmap *map = &rec->evlist->mmap[i];
817
818 if (!map->auxtrace_mmap.base)
819 continue;
820
821 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
822 rc = -1;
823 goto out;
824 }
825 }
826 out:
827 return rc;
828 }
829
record__read_auxtrace_snapshot(struct record * rec,bool on_exit)830 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
831 {
832 pr_debug("Recording AUX area tracing snapshot\n");
833 if (record__auxtrace_read_snapshot_all(rec) < 0) {
834 trigger_error(&auxtrace_snapshot_trigger);
835 } else {
836 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
837 trigger_error(&auxtrace_snapshot_trigger);
838 else
839 trigger_ready(&auxtrace_snapshot_trigger);
840 }
841 }
842
record__auxtrace_snapshot_exit(struct record * rec)843 static int record__auxtrace_snapshot_exit(struct record *rec)
844 {
845 if (trigger_is_error(&auxtrace_snapshot_trigger))
846 return 0;
847
848 if (!auxtrace_record__snapshot_started &&
849 auxtrace_record__snapshot_start(rec->itr))
850 return -1;
851
852 record__read_auxtrace_snapshot(rec, true);
853 if (trigger_is_error(&auxtrace_snapshot_trigger))
854 return -1;
855
856 return 0;
857 }
858
record__auxtrace_init(struct record * rec)859 static int record__auxtrace_init(struct record *rec)
860 {
861 int err;
862
863 if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
864 && record__threads_enabled(rec)) {
865 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
866 return -EINVAL;
867 }
868
869 if (!rec->itr) {
870 rec->itr = auxtrace_record__init(rec->evlist, &err);
871 if (err)
872 return err;
873 }
874
875 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
876 rec->opts.auxtrace_snapshot_opts);
877 if (err)
878 return err;
879
880 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
881 rec->opts.auxtrace_sample_opts);
882 if (err)
883 return err;
884
885 err = auxtrace_parse_aux_action(rec->evlist);
886 if (err)
887 return err;
888
889 return auxtrace_parse_filters(rec->evlist);
890 }
891
892 #else
893
894 static inline
record__auxtrace_mmap_read(struct record * rec __maybe_unused,struct mmap * map __maybe_unused)895 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
896 struct mmap *map __maybe_unused)
897 {
898 return 0;
899 }
900
901 static inline
record__read_auxtrace_snapshot(struct record * rec __maybe_unused,bool on_exit __maybe_unused)902 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
903 bool on_exit __maybe_unused)
904 {
905 }
906
907 static inline
auxtrace_record__snapshot_start(struct auxtrace_record * itr __maybe_unused)908 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
909 {
910 return 0;
911 }
912
913 static inline
record__auxtrace_snapshot_exit(struct record * rec __maybe_unused)914 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
915 {
916 return 0;
917 }
918
record__auxtrace_init(struct record * rec __maybe_unused)919 static int record__auxtrace_init(struct record *rec __maybe_unused)
920 {
921 return 0;
922 }
923
924 #endif
925
record__config_text_poke(struct evlist * evlist)926 static int record__config_text_poke(struct evlist *evlist)
927 {
928 struct evsel *evsel;
929
930 /* Nothing to do if text poke is already configured */
931 evlist__for_each_entry(evlist, evsel) {
932 if (evsel->core.attr.text_poke)
933 return 0;
934 }
935
936 evsel = evlist__add_dummy_on_all_cpus(evlist);
937 if (!evsel)
938 return -ENOMEM;
939
940 evsel->core.attr.text_poke = 1;
941 evsel->core.attr.ksymbol = 1;
942 evsel->immediate = true;
943 evsel__set_sample_bit(evsel, TIME);
944
945 return 0;
946 }
947
record__config_off_cpu(struct record * rec)948 static int record__config_off_cpu(struct record *rec)
949 {
950 return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
951 }
952
record__tracking_system_wide(struct record * rec)953 static bool record__tracking_system_wide(struct record *rec)
954 {
955 struct evlist *evlist = rec->evlist;
956 struct evsel *evsel;
957
958 /*
959 * If non-dummy evsel exists, system_wide sideband is need to
960 * help parse sample information.
961 * For example, PERF_EVENT_MMAP event to help parse symbol,
962 * and PERF_EVENT_COMM event to help parse task executable name.
963 */
964 evlist__for_each_entry(evlist, evsel) {
965 if (!evsel__is_dummy_event(evsel))
966 return true;
967 }
968
969 return false;
970 }
971
record__config_tracking_events(struct record * rec)972 static int record__config_tracking_events(struct record *rec)
973 {
974 struct record_opts *opts = &rec->opts;
975 struct evlist *evlist = rec->evlist;
976 bool system_wide = false;
977 struct evsel *evsel;
978
979 /*
980 * For initial_delay, system wide or a hybrid system, we need to add
981 * tracking event so that we can track PERF_RECORD_MMAP to cover the
982 * delay of waiting or event synthesis.
983 */
984 if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
985 perf_pmus__num_core_pmus() > 1) {
986
987 /*
988 * User space tasks can migrate between CPUs, so when tracing
989 * selected CPUs, sideband for all CPUs is still needed.
990 */
991 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
992 system_wide = true;
993
994 evsel = evlist__findnew_tracking_event(evlist, system_wide);
995 if (!evsel)
996 return -ENOMEM;
997
998 /*
999 * Enable the tracking event when the process is forked for
1000 * initial_delay, immediately for system wide.
1001 */
1002 if (opts->target.initial_delay && !evsel->immediate &&
1003 !target__has_cpu(&opts->target))
1004 evsel->core.attr.enable_on_exec = 1;
1005 else
1006 evsel->immediate = 1;
1007 }
1008
1009 return 0;
1010 }
1011
record__kcore_readable(struct machine * machine)1012 static bool record__kcore_readable(struct machine *machine)
1013 {
1014 char kcore[PATH_MAX];
1015 int fd;
1016
1017 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
1018
1019 fd = open(kcore, O_RDONLY);
1020 if (fd < 0)
1021 return false;
1022
1023 close(fd);
1024
1025 return true;
1026 }
1027
record__kcore_copy(struct machine * machine,struct perf_data * data)1028 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1029 {
1030 char from_dir[PATH_MAX];
1031 char kcore_dir[PATH_MAX];
1032 int ret;
1033
1034 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1035
1036 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1037 if (ret)
1038 return ret;
1039
1040 return kcore_copy(from_dir, kcore_dir);
1041 }
1042
record__thread_data_init_pipes(struct record_thread * thread_data)1043 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1044 {
1045 thread_data->pipes.msg[0] = -1;
1046 thread_data->pipes.msg[1] = -1;
1047 thread_data->pipes.ack[0] = -1;
1048 thread_data->pipes.ack[1] = -1;
1049 }
1050
record__thread_data_open_pipes(struct record_thread * thread_data)1051 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1052 {
1053 if (pipe(thread_data->pipes.msg))
1054 return -EINVAL;
1055
1056 if (pipe(thread_data->pipes.ack)) {
1057 close(thread_data->pipes.msg[0]);
1058 thread_data->pipes.msg[0] = -1;
1059 close(thread_data->pipes.msg[1]);
1060 thread_data->pipes.msg[1] = -1;
1061 return -EINVAL;
1062 }
1063
1064 pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1065 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1066 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1067
1068 return 0;
1069 }
1070
record__thread_data_close_pipes(struct record_thread * thread_data)1071 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1072 {
1073 if (thread_data->pipes.msg[0] != -1) {
1074 close(thread_data->pipes.msg[0]);
1075 thread_data->pipes.msg[0] = -1;
1076 }
1077 if (thread_data->pipes.msg[1] != -1) {
1078 close(thread_data->pipes.msg[1]);
1079 thread_data->pipes.msg[1] = -1;
1080 }
1081 if (thread_data->pipes.ack[0] != -1) {
1082 close(thread_data->pipes.ack[0]);
1083 thread_data->pipes.ack[0] = -1;
1084 }
1085 if (thread_data->pipes.ack[1] != -1) {
1086 close(thread_data->pipes.ack[1]);
1087 thread_data->pipes.ack[1] = -1;
1088 }
1089 }
1090
evlist__per_thread(struct evlist * evlist)1091 static bool evlist__per_thread(struct evlist *evlist)
1092 {
1093 return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1094 }
1095
record__thread_data_init_maps(struct record_thread * thread_data,struct evlist * evlist)1096 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1097 {
1098 int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1099 struct mmap *mmap = evlist->mmap;
1100 struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1101 struct perf_cpu_map *cpus = evlist->core.all_cpus;
1102 bool per_thread = evlist__per_thread(evlist);
1103
1104 if (per_thread)
1105 thread_data->nr_mmaps = nr_mmaps;
1106 else
1107 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1108 thread_data->mask->maps.nbits);
1109 if (mmap) {
1110 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1111 if (!thread_data->maps)
1112 return -ENOMEM;
1113 }
1114 if (overwrite_mmap) {
1115 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1116 if (!thread_data->overwrite_maps) {
1117 zfree(&thread_data->maps);
1118 return -ENOMEM;
1119 }
1120 }
1121 pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1122 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1123
1124 for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1125 if (per_thread ||
1126 test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1127 if (thread_data->maps) {
1128 thread_data->maps[tm] = &mmap[m];
1129 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1130 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1131 }
1132 if (thread_data->overwrite_maps) {
1133 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1134 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1135 thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1136 }
1137 tm++;
1138 }
1139 }
1140
1141 return 0;
1142 }
1143
record__thread_data_init_pollfd(struct record_thread * thread_data,struct evlist * evlist)1144 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1145 {
1146 int f, tm, pos;
1147 struct mmap *map, *overwrite_map;
1148
1149 fdarray__init(&thread_data->pollfd, 64);
1150
1151 for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1152 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1153 overwrite_map = thread_data->overwrite_maps ?
1154 thread_data->overwrite_maps[tm] : NULL;
1155
1156 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1157 void *ptr = evlist->core.pollfd.priv[f].ptr;
1158
1159 if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1160 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1161 &evlist->core.pollfd);
1162 if (pos < 0)
1163 return pos;
1164 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1165 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1166 }
1167 }
1168 }
1169
1170 return 0;
1171 }
1172
record__free_thread_data(struct record * rec)1173 static void record__free_thread_data(struct record *rec)
1174 {
1175 int t;
1176 struct record_thread *thread_data = rec->thread_data;
1177
1178 if (thread_data == NULL)
1179 return;
1180
1181 for (t = 0; t < rec->nr_threads; t++) {
1182 record__thread_data_close_pipes(&thread_data[t]);
1183 zfree(&thread_data[t].maps);
1184 zfree(&thread_data[t].overwrite_maps);
1185 fdarray__exit(&thread_data[t].pollfd);
1186 }
1187
1188 zfree(&rec->thread_data);
1189 }
1190
record__map_thread_evlist_pollfd_indexes(struct record * rec,int evlist_pollfd_index,int thread_pollfd_index)1191 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1192 int evlist_pollfd_index,
1193 int thread_pollfd_index)
1194 {
1195 size_t x = rec->index_map_cnt;
1196
1197 if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1198 return -ENOMEM;
1199 rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1200 rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1201 rec->index_map_cnt += 1;
1202 return 0;
1203 }
1204
record__update_evlist_pollfd_from_thread(struct record * rec,struct evlist * evlist,struct record_thread * thread_data)1205 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1206 struct evlist *evlist,
1207 struct record_thread *thread_data)
1208 {
1209 struct pollfd *e_entries = evlist->core.pollfd.entries;
1210 struct pollfd *t_entries = thread_data->pollfd.entries;
1211 int err = 0;
1212 size_t i;
1213
1214 for (i = 0; i < rec->index_map_cnt; i++) {
1215 int e_pos = rec->index_map[i].evlist_pollfd_index;
1216 int t_pos = rec->index_map[i].thread_pollfd_index;
1217
1218 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1219 e_entries[e_pos].events != t_entries[t_pos].events) {
1220 pr_err("Thread and evlist pollfd index mismatch\n");
1221 err = -EINVAL;
1222 continue;
1223 }
1224 e_entries[e_pos].revents = t_entries[t_pos].revents;
1225 }
1226 return err;
1227 }
1228
record__dup_non_perf_events(struct record * rec,struct evlist * evlist,struct record_thread * thread_data)1229 static int record__dup_non_perf_events(struct record *rec,
1230 struct evlist *evlist,
1231 struct record_thread *thread_data)
1232 {
1233 struct fdarray *fda = &evlist->core.pollfd;
1234 int i, ret;
1235
1236 for (i = 0; i < fda->nr; i++) {
1237 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1238 continue;
1239 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1240 if (ret < 0) {
1241 pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1242 return ret;
1243 }
1244 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1245 thread_data, ret, fda->entries[i].fd);
1246 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1247 if (ret < 0) {
1248 pr_err("Failed to map thread and evlist pollfd indexes\n");
1249 return ret;
1250 }
1251 }
1252 return 0;
1253 }
1254
record__alloc_thread_data(struct record * rec,struct evlist * evlist)1255 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1256 {
1257 int t, ret;
1258 struct record_thread *thread_data;
1259
1260 rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1261 if (!rec->thread_data) {
1262 pr_err("Failed to allocate thread data\n");
1263 return -ENOMEM;
1264 }
1265 thread_data = rec->thread_data;
1266
1267 for (t = 0; t < rec->nr_threads; t++)
1268 record__thread_data_init_pipes(&thread_data[t]);
1269
1270 for (t = 0; t < rec->nr_threads; t++) {
1271 thread_data[t].rec = rec;
1272 thread_data[t].mask = &rec->thread_masks[t];
1273 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1274 if (ret) {
1275 pr_err("Failed to initialize thread[%d] maps\n", t);
1276 goto out_free;
1277 }
1278 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1279 if (ret) {
1280 pr_err("Failed to initialize thread[%d] pollfd\n", t);
1281 goto out_free;
1282 }
1283 if (t) {
1284 thread_data[t].tid = -1;
1285 ret = record__thread_data_open_pipes(&thread_data[t]);
1286 if (ret) {
1287 pr_err("Failed to open thread[%d] communication pipes\n", t);
1288 goto out_free;
1289 }
1290 ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1291 POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1292 if (ret < 0) {
1293 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1294 goto out_free;
1295 }
1296 thread_data[t].ctlfd_pos = ret;
1297 pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1298 thread_data, thread_data[t].ctlfd_pos,
1299 thread_data[t].pipes.msg[0]);
1300 } else {
1301 thread_data[t].tid = gettid();
1302
1303 ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1304 if (ret < 0)
1305 goto out_free;
1306
1307 thread_data[t].ctlfd_pos = -1; /* Not used */
1308 }
1309 }
1310
1311 return 0;
1312
1313 out_free:
1314 record__free_thread_data(rec);
1315
1316 return ret;
1317 }
1318
record__mmap_evlist(struct record * rec,struct evlist * evlist)1319 static int record__mmap_evlist(struct record *rec,
1320 struct evlist *evlist)
1321 {
1322 int i, ret;
1323 struct record_opts *opts = &rec->opts;
1324 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1325 opts->auxtrace_sample_mode;
1326 char msg[512];
1327
1328 if (opts->affinity != PERF_AFFINITY_SYS)
1329 cpu__setup_cpunode_map();
1330
1331 if (evlist__mmap_ex(evlist, opts->mmap_pages,
1332 opts->auxtrace_mmap_pages,
1333 auxtrace_overwrite,
1334 opts->nr_cblocks, opts->affinity,
1335 opts->mmap_flush, opts->comp_level) < 0) {
1336 if (errno == EPERM) {
1337 pr_err("Permission error mapping pages.\n"
1338 "Consider increasing "
1339 "/proc/sys/kernel/perf_event_mlock_kb,\n"
1340 "or try again with a smaller value of -m/--mmap_pages.\n"
1341 "(current value: %u,%u)\n",
1342 opts->mmap_pages, opts->auxtrace_mmap_pages);
1343 return -errno;
1344 } else {
1345 pr_err("failed to mmap with %d (%s)\n", errno,
1346 str_error_r(errno, msg, sizeof(msg)));
1347 if (errno)
1348 return -errno;
1349 else
1350 return -EINVAL;
1351 }
1352 }
1353
1354 if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1355 return -1;
1356
1357 ret = record__alloc_thread_data(rec, evlist);
1358 if (ret)
1359 return ret;
1360
1361 if (record__threads_enabled(rec)) {
1362 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1363 if (ret) {
1364 pr_err("Failed to create data directory: %s\n", strerror(-ret));
1365 return ret;
1366 }
1367 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1368 if (evlist->mmap)
1369 evlist->mmap[i].file = &rec->data.dir.files[i];
1370 if (evlist->overwrite_mmap)
1371 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1372 }
1373 }
1374
1375 return 0;
1376 }
1377
record__mmap(struct record * rec)1378 static int record__mmap(struct record *rec)
1379 {
1380 return record__mmap_evlist(rec, rec->evlist);
1381 }
1382
record__open(struct record * rec)1383 static int record__open(struct record *rec)
1384 {
1385 char msg[BUFSIZ];
1386 struct evsel *pos;
1387 struct evlist *evlist = rec->evlist;
1388 struct perf_session *session = rec->session;
1389 struct record_opts *opts = &rec->opts;
1390 int rc = 0;
1391
1392 evlist__for_each_entry(evlist, pos) {
1393 try_again:
1394 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1395 if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1396 if (verbose > 0)
1397 ui__warning("%s\n", msg);
1398 goto try_again;
1399 }
1400 if ((errno == EINVAL || errno == EBADF) &&
1401 pos->core.leader != &pos->core &&
1402 pos->weak_group) {
1403 pos = evlist__reset_weak_group(evlist, pos, true);
1404 goto try_again;
1405 }
1406 rc = -errno;
1407 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1408 ui__error("%s\n", msg);
1409 goto out;
1410 }
1411
1412 pos->supported = true;
1413 }
1414
1415 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1416 pr_warning(
1417 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1418 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1419 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1420 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1421 "Samples in kernel modules won't be resolved at all.\n\n"
1422 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1423 "even with a suitable vmlinux or kallsyms file.\n\n");
1424 }
1425
1426 if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1427 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1428 pos->filter ?: "BPF", evsel__name(pos), errno,
1429 str_error_r(errno, msg, sizeof(msg)));
1430 rc = -1;
1431 goto out;
1432 }
1433
1434 rc = record__mmap(rec);
1435 if (rc)
1436 goto out;
1437
1438 session->evlist = evlist;
1439 perf_session__set_id_hdr_size(session);
1440 out:
1441 return rc;
1442 }
1443
set_timestamp_boundary(struct record * rec,u64 sample_time)1444 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1445 {
1446 if (rec->evlist->first_sample_time == 0)
1447 rec->evlist->first_sample_time = sample_time;
1448
1449 if (sample_time)
1450 rec->evlist->last_sample_time = sample_time;
1451 }
1452
process_sample_event(const struct perf_tool * tool,union perf_event * event,struct perf_sample * sample,struct evsel * evsel,struct machine * machine)1453 static int process_sample_event(const struct perf_tool *tool,
1454 union perf_event *event,
1455 struct perf_sample *sample,
1456 struct evsel *evsel,
1457 struct machine *machine)
1458 {
1459 struct record *rec = container_of(tool, struct record, tool);
1460
1461 set_timestamp_boundary(rec, sample->time);
1462
1463 if (rec->buildid_all)
1464 return 0;
1465
1466 rec->samples++;
1467 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1468 }
1469
process_buildids(struct record * rec)1470 static int process_buildids(struct record *rec)
1471 {
1472 struct perf_session *session = rec->session;
1473
1474 if (perf_data__size(&rec->data) == 0)
1475 return 0;
1476
1477 /*
1478 * During this process, it'll load kernel map and replace the
1479 * dso->long_name to a real pathname it found. In this case
1480 * we prefer the vmlinux path like
1481 * /lib/modules/3.16.4/build/vmlinux
1482 *
1483 * rather than build-id path (in debug directory).
1484 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1485 */
1486 symbol_conf.ignore_vmlinux_buildid = true;
1487
1488 /*
1489 * If --buildid-all is given, it marks all DSO regardless of hits,
1490 * so no need to process samples. But if timestamp_boundary is enabled,
1491 * it still needs to walk on all samples to get the timestamps of
1492 * first/last samples.
1493 */
1494 if (rec->buildid_all && !rec->timestamp_boundary)
1495 rec->tool.sample = process_event_sample_stub;
1496
1497 return perf_session__process_events(session);
1498 }
1499
perf_event__synthesize_guest_os(struct machine * machine,void * data)1500 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1501 {
1502 int err;
1503 struct perf_tool *tool = data;
1504 /*
1505 *As for guest kernel when processing subcommand record&report,
1506 *we arrange module mmap prior to guest kernel mmap and trigger
1507 *a preload dso because default guest module symbols are loaded
1508 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1509 *method is used to avoid symbol missing when the first addr is
1510 *in module instead of in guest kernel.
1511 */
1512 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1513 machine);
1514 if (err < 0)
1515 pr_err("Couldn't record guest kernel [%d]'s reference"
1516 " relocation symbol.\n", machine->pid);
1517
1518 /*
1519 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1520 * have no _text sometimes.
1521 */
1522 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1523 machine);
1524 if (err < 0)
1525 pr_err("Couldn't record guest kernel [%d]'s reference"
1526 " relocation symbol.\n", machine->pid);
1527 }
1528
1529 static struct perf_event_header finished_round_event = {
1530 .size = sizeof(struct perf_event_header),
1531 .type = PERF_RECORD_FINISHED_ROUND,
1532 };
1533
1534 static struct perf_event_header finished_init_event = {
1535 .size = sizeof(struct perf_event_header),
1536 .type = PERF_RECORD_FINISHED_INIT,
1537 };
1538
record__adjust_affinity(struct record * rec,struct mmap * map)1539 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1540 {
1541 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1542 !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1543 thread->mask->affinity.nbits)) {
1544 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1545 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1546 map->affinity_mask.bits, thread->mask->affinity.nbits);
1547 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1548 (cpu_set_t *)thread->mask->affinity.bits);
1549 if (verbose == 2) {
1550 pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1551 mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1552 }
1553 }
1554 }
1555
process_comp_header(void * record,size_t increment)1556 static size_t process_comp_header(void *record, size_t increment)
1557 {
1558 struct perf_record_compressed2 *event = record;
1559 size_t size = sizeof(*event);
1560
1561 if (increment) {
1562 event->header.size += increment;
1563 return increment;
1564 }
1565
1566 event->header.type = PERF_RECORD_COMPRESSED2;
1567 event->header.size = size;
1568
1569 return size;
1570 }
1571
zstd_compress(struct perf_session * session,struct mmap * map,void * dst,size_t dst_size,void * src,size_t src_size)1572 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1573 void *dst, size_t dst_size, void *src, size_t src_size)
1574 {
1575 ssize_t compressed;
1576 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1577 struct zstd_data *zstd_data = &session->zstd_data;
1578
1579 if (map && map->file)
1580 zstd_data = &map->zstd_data;
1581
1582 compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1583 max_record_size, process_comp_header);
1584 if (compressed < 0)
1585 return compressed;
1586
1587 if (map && map->file) {
1588 thread->bytes_transferred += src_size;
1589 thread->bytes_compressed += compressed;
1590 } else {
1591 session->bytes_transferred += src_size;
1592 session->bytes_compressed += compressed;
1593 }
1594
1595 return compressed;
1596 }
1597
record__mmap_read_evlist(struct record * rec,struct evlist * evlist,bool overwrite,bool synch)1598 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1599 bool overwrite, bool synch)
1600 {
1601 u64 bytes_written = rec->bytes_written;
1602 int i;
1603 int rc = 0;
1604 int nr_mmaps;
1605 struct mmap **maps;
1606 int trace_fd = rec->data.file.fd;
1607 off_t off = 0;
1608
1609 if (!evlist)
1610 return 0;
1611
1612 nr_mmaps = thread->nr_mmaps;
1613 maps = overwrite ? thread->overwrite_maps : thread->maps;
1614
1615 if (!maps)
1616 return 0;
1617
1618 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1619 return 0;
1620
1621 if (record__aio_enabled(rec))
1622 off = record__aio_get_pos(trace_fd);
1623
1624 for (i = 0; i < nr_mmaps; i++) {
1625 u64 flush = 0;
1626 struct mmap *map = maps[i];
1627
1628 if (map->core.base) {
1629 record__adjust_affinity(rec, map);
1630 if (synch) {
1631 flush = map->core.flush;
1632 map->core.flush = 1;
1633 }
1634 if (!record__aio_enabled(rec)) {
1635 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1636 if (synch)
1637 map->core.flush = flush;
1638 rc = -1;
1639 goto out;
1640 }
1641 } else {
1642 if (record__aio_push(rec, map, &off) < 0) {
1643 record__aio_set_pos(trace_fd, off);
1644 if (synch)
1645 map->core.flush = flush;
1646 rc = -1;
1647 goto out;
1648 }
1649 }
1650 if (synch)
1651 map->core.flush = flush;
1652 }
1653
1654 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1655 !rec->opts.auxtrace_sample_mode &&
1656 record__auxtrace_mmap_read(rec, map) != 0) {
1657 rc = -1;
1658 goto out;
1659 }
1660 }
1661
1662 if (record__aio_enabled(rec))
1663 record__aio_set_pos(trace_fd, off);
1664
1665 /*
1666 * Mark the round finished in case we wrote
1667 * at least one event.
1668 *
1669 * No need for round events in directory mode,
1670 * because per-cpu maps and files have data
1671 * sorted by kernel.
1672 */
1673 if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1674 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1675
1676 if (overwrite)
1677 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1678 out:
1679 return rc;
1680 }
1681
record__mmap_read_all(struct record * rec,bool synch)1682 static int record__mmap_read_all(struct record *rec, bool synch)
1683 {
1684 int err;
1685
1686 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1687 if (err)
1688 return err;
1689
1690 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1691 }
1692
record__thread_munmap_filtered(struct fdarray * fda,int fd,void * arg __maybe_unused)1693 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1694 void *arg __maybe_unused)
1695 {
1696 struct perf_mmap *map = fda->priv[fd].ptr;
1697
1698 if (map)
1699 perf_mmap__put(map);
1700 }
1701
record__thread(void * arg)1702 static void *record__thread(void *arg)
1703 {
1704 enum thread_msg msg = THREAD_MSG__READY;
1705 bool terminate = false;
1706 struct fdarray *pollfd;
1707 int err, ctlfd_pos;
1708
1709 thread = arg;
1710 thread->tid = gettid();
1711
1712 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1713 if (err == -1)
1714 pr_warning("threads[%d]: failed to notify on start: %s\n",
1715 thread->tid, strerror(errno));
1716
1717 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1718
1719 pollfd = &thread->pollfd;
1720 ctlfd_pos = thread->ctlfd_pos;
1721
1722 for (;;) {
1723 unsigned long long hits = thread->samples;
1724
1725 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1726 break;
1727
1728 if (hits == thread->samples) {
1729
1730 err = fdarray__poll(pollfd, -1);
1731 /*
1732 * Propagate error, only if there's any. Ignore positive
1733 * number of returned events and interrupt error.
1734 */
1735 if (err > 0 || (err < 0 && errno == EINTR))
1736 err = 0;
1737 thread->waking++;
1738
1739 if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1740 record__thread_munmap_filtered, NULL) == 0)
1741 break;
1742 }
1743
1744 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1745 terminate = true;
1746 close(thread->pipes.msg[0]);
1747 thread->pipes.msg[0] = -1;
1748 pollfd->entries[ctlfd_pos].fd = -1;
1749 pollfd->entries[ctlfd_pos].events = 0;
1750 }
1751
1752 pollfd->entries[ctlfd_pos].revents = 0;
1753 }
1754 record__mmap_read_all(thread->rec, true);
1755
1756 err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1757 if (err == -1)
1758 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1759 thread->tid, strerror(errno));
1760
1761 return NULL;
1762 }
1763
record__init_features(struct record * rec)1764 static void record__init_features(struct record *rec)
1765 {
1766 struct perf_session *session = rec->session;
1767 int feat;
1768
1769 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1770 perf_header__set_feat(&session->header, feat);
1771
1772 if (rec->no_buildid)
1773 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1774
1775 if (!have_tracepoints(&rec->evlist->core.entries))
1776 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1777
1778 if (!rec->opts.branch_stack)
1779 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1780
1781 if (!rec->opts.full_auxtrace)
1782 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1783
1784 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1785 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1786
1787 if (!rec->opts.use_clockid)
1788 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1789
1790 if (!record__threads_enabled(rec))
1791 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1792
1793 if (!record__comp_enabled(rec))
1794 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1795
1796 perf_header__clear_feat(&session->header, HEADER_STAT);
1797 }
1798
1799 static void
record__finish_output(struct record * rec)1800 record__finish_output(struct record *rec)
1801 {
1802 int i;
1803 struct perf_data *data = &rec->data;
1804 int fd = perf_data__fd(data);
1805
1806 if (data->is_pipe) {
1807 /* Just to display approx. size */
1808 data->file.size = rec->bytes_written;
1809 return;
1810 }
1811
1812 rec->session->header.data_size += rec->bytes_written;
1813 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1814 if (record__threads_enabled(rec)) {
1815 for (i = 0; i < data->dir.nr; i++)
1816 data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1817 }
1818
1819 /* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1820 if (!rec->no_buildid) {
1821 process_buildids(rec);
1822
1823 if (rec->buildid_all)
1824 perf_session__dsos_hit_all(rec->session);
1825 }
1826 perf_session__write_header(rec->session, rec->evlist, fd, true);
1827
1828 return;
1829 }
1830
record__synthesize_workload(struct record * rec,bool tail)1831 static int record__synthesize_workload(struct record *rec, bool tail)
1832 {
1833 int err;
1834 struct perf_thread_map *thread_map;
1835 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1836
1837 if (rec->opts.tail_synthesize != tail)
1838 return 0;
1839
1840 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1841 if (thread_map == NULL)
1842 return -1;
1843
1844 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1845 process_synthesized_event,
1846 &rec->session->machines.host,
1847 needs_mmap,
1848 rec->opts.sample_address);
1849 perf_thread_map__put(thread_map);
1850 return err;
1851 }
1852
1853 static int write_finished_init(struct record *rec, bool tail)
1854 {
1855 if (rec->opts.tail_synthesize != tail)
1856 return 0;
1857
1858 return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1859 }
1860
1861 static int record__synthesize(struct record *rec, bool tail);
1862
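/*
 * Finalize the current output file and switch to a new, timestamp-suffixed
 * one; used by --switch-output and for the final --timestamp-filename dump.
 */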
1863 static int
1864 record__switch_output(struct record *rec, bool at_exit)
1865 {
1866 struct perf_data *data = &rec->data;
1867 char *new_filename = NULL;
1868 int fd, err;
1869
1870 /* Same Size: "2015122520103046" */
1871 char timestamp[] = "InvalidTimestamp";
1872
1873 record__aio_mmap_read_sync(rec);
1874
1875 write_finished_init(rec, true);
1876
1877 record__synthesize(rec, true);
1878 if (target__none(&rec->opts.target))
1879 record__synthesize_workload(rec, true);
1880
1881 rec->samples = 0;
1882 record__finish_output(rec);
1883 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1884 if (err) {
1885 pr_err("Failed to get current timestamp\n");
1886 return -EINVAL;
1887 }
1888
1889 fd = perf_data__switch(data, timestamp,
1890 rec->session->header.data_offset,
1891 at_exit, &new_filename);
1892 if (fd >= 0 && !at_exit) {
1893 rec->bytes_written = 0;
1894 rec->session->header.data_size = 0;
1895 }
1896
1897 if (!quiet) {
1898 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1899 data->path, timestamp);
1900 }
1901
1902 if (rec->switch_output.num_files) {
1903 int n = rec->switch_output.cur_file + 1;
1904
1905 if (n >= rec->switch_output.num_files)
1906 n = 0;
1907 rec->switch_output.cur_file = n;
1908 if (rec->switch_output.filenames[n]) {
1909 remove(rec->switch_output.filenames[n]);
1910 zfree(&rec->switch_output.filenames[n]);
1911 }
1912 rec->switch_output.filenames[n] = new_filename;
1913 } else {
1914 free(new_filename);
1915 }
1916
1917 /* Output tracking events */
1918 if (!at_exit) {
1919 record__synthesize(rec, false);
1920
1921 /*
1922 * In 'perf record --switch-output' without -a,
1923 * record__synthesize() in record__switch_output() won't
1924 * generate tracking events because there's no thread_map
1925 * in the evlist, which means the newly created perf.data would not
1926 * contain map and comm information.
1927 * Create a fake thread_map and directly call
1928 * perf_event__synthesize_thread_map() for those events.
1929 */
1930 if (target__none(&rec->opts.target))
1931 record__synthesize_workload(rec, false);
1932 write_finished_init(rec, false);
1933 }
1934 return fd;
1935 }
1936
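/*
 * Write a PERF_RECORD_LOST_SAMPLES event for one evsel/cpu/thread, appending
 * a synthesized id sample so the record can be attributed on report.
 */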
1937 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1938 struct perf_record_lost_samples *lost,
1939 int cpu_idx, int thread_idx, u64 lost_count,
1940 u16 misc_flag)
1941 {
1942 struct perf_sample_id *sid;
1943 struct perf_sample sample;
1944 int id_hdr_size;
1945
1946 perf_sample__init(&sample, /*all=*/true);
1947 lost->lost = lost_count;
1948 if (evsel->core.ids) {
1949 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1950 sample.id = sid->id;
1951 }
1952
1953 id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1954 evsel->core.attr.sample_type, &sample);
1955 lost->header.size = sizeof(*lost) + id_hdr_size;
1956 lost->header.misc = misc_flag;
1957 record__write(rec, NULL, lost, lost->header.size);
1958 perf_sample__exit(&sample);
1959 }
1960
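/*
 * Read the lost-sample counts from the kernel counters (and from BPF
 * filters) and emit PERF_RECORD_LOST_SAMPLES records for non-zero counts.
 */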
1961 static void record__read_lost_samples(struct record *rec)
1962 {
1963 struct perf_session *session = rec->session;
1964 struct perf_record_lost_samples_and_ids lost;
1965 struct evsel *evsel;
1966
1967 /* there was an error during record__open */
1968 if (session->evlist == NULL)
1969 return;
1970
1971 evlist__for_each_entry(session->evlist, evsel) {
1972 struct xyarray *xy = evsel->core.sample_id;
1973 u64 lost_count;
1974
1975 if (xy == NULL || evsel->core.fd == NULL)
1976 continue;
1977 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1978 xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1979 pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1980 continue;
1981 }
1982
1983 for (int x = 0; x < xyarray__max_x(xy); x++) {
1984 for (int y = 0; y < xyarray__max_y(xy); y++) {
1985 struct perf_counts_values count;
1986
1987 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1988 pr_debug("read LOST count failed\n");
1989 return;
1990 }
1991
1992 if (count.lost) {
1993 memset(&lost, 0, sizeof(lost));
1994 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1995 __record__save_lost_samples(rec, evsel, &lost.lost,
1996 x, y, count.lost, 0);
1997 }
1998 }
1999 }
2000
2001 lost_count = perf_bpf_filter__lost_count(evsel);
2002 if (lost_count) {
2003 memset(&lost, 0, sizeof(lost));
2004 lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2005 __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2006 PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2007 }
2008 }
2009 }
2010
2011 static volatile sig_atomic_t workload_exec_errno;
2012
2013 /*
2014 * evlist__prepare_workload will send a SIGUSR1
2015 * if the fork fails, since we asked by setting its
2016 * want_signal to true.
2017 */
2018 static void workload_exec_failed_signal(int signo __maybe_unused,
2019 siginfo_t *info,
2020 void *ucontext __maybe_unused)
2021 {
2022 workload_exec_errno = info->si_value.sival_int;
2023 done = 1;
2024 child_finished = 1;
2025 }
2026
2027 static void snapshot_sig_handler(int sig);
2028 static void alarm_sig_handler(int sig);
2029
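/* Pick any mmapped event's control page to read time conversion parameters from. */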
2030 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2031 {
2032 if (evlist) {
2033 if (evlist->mmap && evlist->mmap[0].core.base)
2034 return evlist->mmap[0].core.base;
2035 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2036 return evlist->overwrite_mmap[0].core.base;
2037 }
2038 return NULL;
2039 }
2040
2041 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2042 {
2043 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2044 if (pc)
2045 return pc;
2046 return NULL;
2047 }
2048
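/*
 * Synthesize the non-sample events describing the current system state:
 * time conversion, id index, auxtrace info, kernel and module mmaps, extra
 * attrs, thread/cpu maps, BPF and cgroup events, and the existing threads.
 */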
2049 static int record__synthesize(struct record *rec, bool tail)
2050 {
2051 struct perf_session *session = rec->session;
2052 struct machine *machine = &session->machines.host;
2053 struct perf_data *data = &rec->data;
2054 struct record_opts *opts = &rec->opts;
2055 struct perf_tool *tool = &rec->tool;
2056 int err = 0;
2057 event_op f = process_synthesized_event;
2058
2059 if (rec->opts.tail_synthesize != tail)
2060 return 0;
2061
2062 if (data->is_pipe) {
2063 err = perf_event__synthesize_for_pipe(tool, session, data,
2064 process_synthesized_event);
2065 if (err < 0)
2066 goto out;
2067
2068 rec->bytes_written += err;
2069 }
2070
2071 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2072 process_synthesized_event, machine);
2073 if (err)
2074 goto out;
2075
2076 /* Synthesize id_index before auxtrace_info */
2077 err = perf_event__synthesize_id_index(tool,
2078 process_synthesized_event,
2079 session->evlist, machine);
2080 if (err)
2081 goto out;
2082
2083 if (rec->opts.full_auxtrace) {
2084 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2085 session, process_synthesized_event);
2086 if (err)
2087 goto out;
2088 }
2089
2090 if (!evlist__exclude_kernel(rec->evlist)) {
2091 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2092 machine);
2093 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2094 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2095 "Check /proc/kallsyms permission or run as root.\n");
2096
2097 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2098 machine);
2099 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2100 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2101 "Check /proc/modules permission or run as root.\n");
2102 }
2103
2104 if (perf_guest) {
2105 machines__process_guests(&session->machines,
2106 perf_event__synthesize_guest_os, tool);
2107 }
2108
2109 err = perf_event__synthesize_extra_attr(&rec->tool,
2110 rec->evlist,
2111 process_synthesized_event,
2112 data->is_pipe);
2113 if (err)
2114 goto out;
2115
2116 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2117 process_synthesized_event,
2118 NULL);
2119 if (err < 0) {
2120 pr_err("Couldn't synthesize thread map.\n");
2121 return err;
2122 }
2123
2124 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2125 process_synthesized_event, NULL);
2126 if (err < 0) {
2127 pr_err("Couldn't synthesize cpu map.\n");
2128 return err;
2129 }
2130
2131 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2132 machine, opts);
2133 if (err < 0) {
2134 pr_warning("Couldn't synthesize bpf events.\n");
2135 err = 0;
2136 }
2137
2138 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2139 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2140 machine);
2141 if (err < 0) {
2142 pr_warning("Couldn't synthesize cgroup events.\n");
2143 err = 0;
2144 }
2145 }
2146
2147 if (rec->opts.nr_threads_synthesize > 1) {
2148 mutex_init(&synth_lock);
2149 perf_set_multithreaded();
2150 f = process_locked_synthesized_event;
2151 }
2152
2153 if (rec->opts.synth & PERF_SYNTH_TASK) {
2154 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2155
2156 err = __machine__synthesize_threads(machine, tool, &opts->target,
2157 rec->evlist->core.threads,
2158 f, needs_mmap, opts->sample_address,
2159 rec->opts.nr_threads_synthesize);
2160 }
2161
2162 if (rec->opts.nr_threads_synthesize > 1) {
2163 perf_set_singlethreaded();
2164 mutex_destroy(&synth_lock);
2165 }
2166
2167 out:
2168 return err;
2169 }
2170
2171 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2172 {
2173 #ifdef HAVE_LIBBPF_SUPPORT
2174 perf_event__synthesize_final_bpf_metadata(rec->session,
2175 process_synthesized_event);
2176 #endif
2177 }
2178
2179 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2180 {
2181 struct record *rec = data;
2182 pthread_kill(rec->thread_id, SIGUSR2);
2183 return 0;
2184 }
2185
2186 static int record__setup_sb_evlist(struct record *rec)
2187 {
2188 struct record_opts *opts = &rec->opts;
2189
2190 if (rec->sb_evlist != NULL) {
2191 /*
2192 * We get here if --switch-output-event populated the
2193 * sb_evlist, so associate a callback that will send a SIGUSR2
2194 * to the main thread.
2195 */
2196 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2197 rec->thread_id = pthread_self();
2198 }
2199 #ifdef HAVE_LIBBPF_SUPPORT
2200 if (!opts->no_bpf_event) {
2201 if (rec->sb_evlist == NULL) {
2202 rec->sb_evlist = evlist__new();
2203
2204 if (rec->sb_evlist == NULL) {
2205 pr_err("Couldn't create side band evlist.\n");
2206 return -1;
2207 }
2208 }
2209
2210 if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2211 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2212 return -1;
2213 }
2214 }
2215 #endif
2216 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2217 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2218 opts->no_bpf_event = true;
2219 }
2220
2221 return 0;
2222 }
2223
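/*
 * For -k/--clockid, record the clockid plus a pair of reference timestamps
 * (gettimeofday and clock_gettime) in the session env so sample times can
 * later be related to wall-clock time.
 */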
2224 static int record__init_clock(struct record *rec)
2225 {
2226 struct perf_session *session = rec->session;
2227 struct timespec ref_clockid;
2228 struct timeval ref_tod;
2229 struct perf_env *env = perf_session__env(session);
2230 u64 ref;
2231
2232 if (!rec->opts.use_clockid)
2233 return 0;
2234
2235 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2236 env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2237
2238 env->clock.clockid = rec->opts.clockid;
2239
2240 if (gettimeofday(&ref_tod, NULL) != 0) {
2241 pr_err("gettimeofday failed, cannot set reference time.\n");
2242 return -1;
2243 }
2244
2245 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2246 pr_err("clock_gettime failed, cannot set reference time.\n");
2247 return -1;
2248 }
2249
2250 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2251 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2252
2253 env->clock.tod_ns = ref;
2254
2255 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2256 (u64) ref_clockid.tv_nsec;
2257
2258 env->clock.clockid_ns = ref;
2259 return 0;
2260 }
2261
2262 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2263 {
2264 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2265 trigger_hit(&auxtrace_snapshot_trigger);
2266 auxtrace_record__snapshot_started = 1;
2267 if (auxtrace_record__snapshot_start(rec->itr))
2268 trigger_error(&auxtrace_snapshot_trigger);
2269 }
2270 }
2271
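/*
 * Ask a streaming thread to terminate by closing the write end of its
 * message pipe, then wait for the acknowledgement on the ack pipe.
 */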
2272 static int record__terminate_thread(struct record_thread *thread_data)
2273 {
2274 int err;
2275 enum thread_msg ack = THREAD_MSG__UNDEFINED;
2276 pid_t tid = thread_data->tid;
2277
2278 close(thread_data->pipes.msg[1]);
2279 thread_data->pipes.msg[1] = -1;
2280 err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2281 if (err > 0)
2282 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2283 else
2284 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2285 thread->tid, tid);
2286
2287 return 0;
2288 }
2289
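/*
 * Start the extra streaming threads for --threads mode: block signals,
 * create each thread pinned to its affinity mask and wait for its READY
 * message, then set the main thread's own affinity.
 */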
2290 static int record__start_threads(struct record *rec)
2291 {
2292 int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2293 struct record_thread *thread_data = rec->thread_data;
2294 sigset_t full, mask;
2295 pthread_t handle;
2296 pthread_attr_t attrs;
2297
2298 thread = &thread_data[0];
2299
2300 if (!record__threads_enabled(rec))
2301 return 0;
2302
2303 sigfillset(&full);
2304 if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2305 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2306 return -1;
2307 }
2308
2309 pthread_attr_init(&attrs);
2310 pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2311
2312 for (t = 1; t < nr_threads; t++) {
2313 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2314
2315 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2316 pthread_attr_setaffinity_np(&attrs,
2317 MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2318 (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2319 #endif
2320 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2321 for (tt = 1; tt < t; tt++)
2322 record__terminate_thread(&thread_data[tt]);
2323 pr_err("Failed to start threads: %s\n", strerror(errno));
2324 ret = -1;
2325 goto out_err;
2326 }
2327
2328 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2329 if (err > 0)
2330 pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2331 thread_msg_tags[msg]);
2332 else
2333 pr_warning("threads[%d]: failed to receive start notification from %d\n",
2334 thread->tid, rec->thread_data[t].tid);
2335 }
2336
2337 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2338 (cpu_set_t *)thread->mask->affinity.bits);
2339
2340 pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2341
2342 out_err:
2343 pthread_attr_destroy(&attrs);
2344
2345 if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2346 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2347 ret = -1;
2348 }
2349
2350 return ret;
2351 }
2352
2353 static int record__stop_threads(struct record *rec)
2354 {
2355 int t;
2356 struct record_thread *thread_data = rec->thread_data;
2357
2358 for (t = 1; t < rec->nr_threads; t++)
2359 record__terminate_thread(&thread_data[t]);
2360
2361 for (t = 0; t < rec->nr_threads; t++) {
2362 rec->samples += thread_data[t].samples;
2363 if (!record__threads_enabled(rec))
2364 continue;
2365 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2366 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2367 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2368 thread_data[t].samples, thread_data[t].waking);
2369 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2370 pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2371 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2372 else
2373 pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2374 }
2375
2376 return 0;
2377 }
2378
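/* Sum the poll wakeup counts across all streaming threads. */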
2379 static unsigned long record__waking(struct record *rec)
2380 {
2381 int t;
2382 unsigned long waking = 0;
2383 struct record_thread *thread_data = rec->thread_data;
2384
2385 for (t = 0; t < rec->nr_threads; t++)
2386 waking += thread_data[t].waking;
2387
2388 return waking;
2389 }
2390
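/*
 * The body of 'perf record': set up the session, open and mmap the events,
 * synthesize the initial metadata, loop reading the ring buffers until the
 * workload exits or recording is stopped, then write the header and any
 * tail-synthesized events.
 */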
2391 static int __cmd_record(struct record *rec, int argc, const char **argv)
2392 {
2393 int err;
2394 int status = 0;
2395 const bool forks = argc > 0;
2396 struct perf_tool *tool = &rec->tool;
2397 struct record_opts *opts = &rec->opts;
2398 struct perf_data *data = &rec->data;
2399 struct perf_session *session;
2400 bool disabled = false, draining = false;
2401 int fd;
2402 float ratio = 0;
2403 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2404 struct perf_env *env;
2405
2406 atexit(record__sig_exit);
2407 signal(SIGCHLD, sig_handler);
2408 signal(SIGINT, sig_handler);
2409 signal(SIGTERM, sig_handler);
2410 signal(SIGSEGV, sigsegv_handler);
2411
2412 if (rec->opts.record_cgroup) {
2413 #ifndef HAVE_FILE_HANDLE
2414 pr_err("cgroup tracking is not supported\n");
2415 return -1;
2416 #endif
2417 }
2418
2419 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2420 signal(SIGUSR2, snapshot_sig_handler);
2421 if (rec->opts.auxtrace_snapshot_mode)
2422 trigger_on(&auxtrace_snapshot_trigger);
2423 if (rec->switch_output.enabled)
2424 trigger_on(&switch_output_trigger);
2425 } else {
2426 signal(SIGUSR2, SIG_IGN);
2427 }
2428
2429 perf_tool__init(tool, /*ordered_events=*/true);
2430 tool->sample = process_sample_event;
2431 tool->fork = perf_event__process_fork;
2432 tool->exit = perf_event__process_exit;
2433 tool->comm = perf_event__process_comm;
2434 tool->namespaces = perf_event__process_namespaces;
2435 tool->mmap = build_id__process_mmap;
2436 tool->mmap2 = build_id__process_mmap2;
2437 tool->itrace_start = process_timestamp_boundary;
2438 tool->aux = process_timestamp_boundary;
2439 tool->namespace_events = rec->opts.record_namespaces;
2440 tool->cgroup_events = rec->opts.record_cgroup;
2441 session = perf_session__new(data, tool);
2442 if (IS_ERR(session)) {
2443 pr_err("Perf session creation failed.\n");
2444 return PTR_ERR(session);
2445 }
2446 env = perf_session__env(session);
2447 if (record__threads_enabled(rec)) {
2448 if (perf_data__is_pipe(&rec->data)) {
2449 pr_err("Parallel trace streaming is not available in pipe mode.\n");
2450 return -1;
2451 }
2452 if (rec->opts.full_auxtrace) {
2453 pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2454 return -1;
2455 }
2456 }
2457
2458 fd = perf_data__fd(data);
2459 rec->session = session;
2460
2461 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2462 pr_err("Compression initialization failed.\n");
2463 return -1;
2464 }
2465 #ifdef HAVE_EVENTFD_SUPPORT
2466 done_fd = eventfd(0, EFD_NONBLOCK);
2467 if (done_fd < 0) {
2468 pr_err("Failed to create wakeup eventfd, error: %m\n");
2469 status = -1;
2470 goto out_delete_session;
2471 }
2472 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2473 if (err < 0) {
2474 pr_err("Failed to add wakeup eventfd to poll list\n");
2475 status = err;
2476 goto out_delete_session;
2477 }
2478 #endif // HAVE_EVENTFD_SUPPORT
2479
2480 env->comp_type = PERF_COMP_ZSTD;
2481 env->comp_level = rec->opts.comp_level;
2482
2483 if (rec->opts.kcore &&
2484 !record__kcore_readable(&session->machines.host)) {
2485 pr_err("ERROR: kcore is not readable.\n");
2486 return -1;
2487 }
2488
2489 if (record__init_clock(rec))
2490 return -1;
2491
2492 record__init_features(rec);
2493
2494 if (forks) {
2495 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2496 workload_exec_failed_signal);
2497 if (err < 0) {
2498 pr_err("Couldn't run the workload!\n");
2499 status = err;
2500 goto out_delete_session;
2501 }
2502 }
2503
2504 /*
2505 * If we have just a single event and are sending data
2506 * through a pipe, we need to force the id allocation,
2507 * because we synthesize the event name through the pipe
2508 * and need the id for that.
2509 */
2510 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2511 rec->opts.sample_id = true;
2512
2513 if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2514 rec->timestamp_filename = false;
2515 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2516 }
2517
2518 /*
2519 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2520 * and hybrid_merge is false.
2521 */
2522 evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2523
2524 evlist__config(rec->evlist, opts, &callchain_param);
2525
2526 /* Debug message used by test scripts */
2527 pr_debug3("perf record opening and mmapping events\n");
2528 if (record__open(rec) != 0) {
2529 err = -1;
2530 goto out_free_threads;
2531 }
2532 /* Debug message used by test scripts */
2533 pr_debug3("perf record done opening and mmapping events\n");
2534 env->comp_mmap_len = session->evlist->core.mmap_len;
2535
2536 if (rec->opts.kcore) {
2537 err = record__kcore_copy(&session->machines.host, data);
2538 if (err) {
2539 pr_err("ERROR: Failed to copy kcore\n");
2540 goto out_free_threads;
2541 }
2542 }
2543
2544 /*
2545 * Normally perf_session__new would do this, but it doesn't have the
2546 * evlist.
2547 */
2548 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2549 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2550 rec->tool.ordered_events = false;
2551 }
2552
2553 if (evlist__nr_groups(rec->evlist) == 0)
2554 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2555
2556 if (data->is_pipe) {
2557 err = perf_header__write_pipe(fd);
2558 if (err < 0)
2559 goto out_free_threads;
2560 } else {
2561 err = perf_session__write_header(session, rec->evlist, fd, false);
2562 if (err < 0)
2563 goto out_free_threads;
2564 }
2565
2566 err = -1;
2567 if (!rec->no_buildid
2568 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2569 pr_err("Couldn't generate buildids. "
2570 "Use --no-buildid to profile anyway.\n");
2571 goto out_free_threads;
2572 }
2573
2574 if (!evlist__needs_bpf_sb_event(rec->evlist))
2575 opts->no_bpf_event = true;
2576
2577 err = record__setup_sb_evlist(rec);
2578 if (err)
2579 goto out_free_threads;
2580
2581 err = record__synthesize(rec, false);
2582 if (err < 0)
2583 goto out_free_threads;
2584
2585 if (rec->realtime_prio) {
2586 struct sched_param param;
2587
2588 param.sched_priority = rec->realtime_prio;
2589 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2590 pr_err("Could not set realtime priority.\n");
2591 err = -1;
2592 goto out_free_threads;
2593 }
2594 }
2595
2596 if (record__start_threads(rec))
2597 goto out_free_threads;
2598
2599 /*
2600 * When perf is starting the traced process, all the events
2601 * (apart from group members) have enable_on_exec=1 set,
2602 * so don't spoil it by prematurely enabling them.
2603 */
2604 if (!target__none(&opts->target) && !opts->target.initial_delay)
2605 evlist__enable(rec->evlist);
2606
2607 /*
2608 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2609 * when recording a workload; do it manually.
2610 */
2611 if (rec->off_cpu)
2612 evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2613
2614 /*
2615 * Let the child rip
2616 */
2617 if (forks) {
2618 struct machine *machine = &session->machines.host;
2619 union perf_event *event;
2620 pid_t tgid;
2621
2622 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2623 if (event == NULL) {
2624 err = -ENOMEM;
2625 goto out_child;
2626 }
2627
2628 /*
2629 * Some H/W events are generated before COMM event
2630 * which is emitted during exec(), so perf script
2631 * cannot see a correct process name for those events.
2632 * Synthesize COMM event to prevent it.
2633 */
2634 tgid = perf_event__synthesize_comm(tool, event,
2635 rec->evlist->workload.pid,
2636 process_synthesized_event,
2637 machine);
2638 free(event);
2639
2640 if (tgid == -1)
2641 goto out_child;
2642
2643 event = malloc(sizeof(event->namespaces) +
2644 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2645 machine->id_hdr_size);
2646 if (event == NULL) {
2647 err = -ENOMEM;
2648 goto out_child;
2649 }
2650
2651 /*
2652 * Synthesize NAMESPACES event for the command specified.
2653 */
2654 perf_event__synthesize_namespaces(tool, event,
2655 rec->evlist->workload.pid,
2656 tgid, process_synthesized_event,
2657 machine);
2658 free(event);
2659
2660 evlist__start_workload(rec->evlist);
2661 }
2662
2663 if (opts->target.initial_delay) {
2664 pr_info(EVLIST_DISABLED_MSG);
2665 if (opts->target.initial_delay > 0) {
2666 usleep(opts->target.initial_delay * USEC_PER_MSEC);
2667 evlist__enable(rec->evlist);
2668 pr_info(EVLIST_ENABLED_MSG);
2669 }
2670 }
2671
2672 err = event_enable_timer__start(rec->evlist->eet);
2673 if (err)
2674 goto out_child;
2675
2676 /* Debug message used by test scripts */
2677 pr_debug3("perf record has started\n");
2678 fflush(stderr);
2679
2680 trigger_ready(&auxtrace_snapshot_trigger);
2681 trigger_ready(&switch_output_trigger);
2682 perf_hooks__invoke_record_start();
2683
2684 /*
2685 * Must write FINISHED_INIT so it will be seen after all other
2686 * synthesized user events, but before any regular events.
2687 */
2688 err = write_finished_init(rec, false);
2689 if (err < 0)
2690 goto out_child;
2691
2692 for (;;) {
2693 unsigned long long hits = thread->samples;
2694
2695 /*
2696 * rec->evlist->bkw_mmap_state may be
2697 * BKW_MMAP_EMPTY here: when done == true and
2698 * hits != rec->samples in previous round.
2699 *
2700 * evlist__toggle_bkw_mmap ensures we never
2701 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2702 */
2703 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2704 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2705
2706 if (record__mmap_read_all(rec, false) < 0) {
2707 trigger_error(&auxtrace_snapshot_trigger);
2708 trigger_error(&switch_output_trigger);
2709 err = -1;
2710 goto out_child;
2711 }
2712
2713 if (auxtrace_record__snapshot_started) {
2714 auxtrace_record__snapshot_started = 0;
2715 if (!trigger_is_error(&auxtrace_snapshot_trigger))
2716 record__read_auxtrace_snapshot(rec, false);
2717 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2718 pr_err("AUX area tracing snapshot failed\n");
2719 err = -1;
2720 goto out_child;
2721 }
2722 }
2723
2724 if (trigger_is_hit(&switch_output_trigger)) {
2725 /*
2726 * If switch_output_trigger is hit, the data in
2727 * overwritable ring buffer should have been collected,
2728 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2729 *
2730 * If SIGUSR2 is raised after or during record__mmap_read_all(),
2731 * record__mmap_read_all() didn't collect data from
2732 * overwritable ring buffer. Read again.
2733 */
2734 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2735 continue;
2736 trigger_ready(&switch_output_trigger);
2737
2738 /*
2739 * Reenable events in overwrite ring buffer after
2740 * record__mmap_read_all(): we should have collected
2741 * data from it.
2742 */
2743 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2744
2745 if (!quiet)
2746 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2747 record__waking(rec));
2748 thread->waking = 0;
2749 fd = record__switch_output(rec, false);
2750 if (fd < 0) {
2751 pr_err("Failed to switch to new file\n");
2752 trigger_error(&switch_output_trigger);
2753 err = fd;
2754 goto out_child;
2755 }
2756
2757 /* re-arm the alarm */
2758 if (rec->switch_output.time)
2759 alarm(rec->switch_output.time);
2760 }
2761
2762 if (hits == thread->samples) {
2763 if (done || draining)
2764 break;
2765 err = fdarray__poll(&thread->pollfd, -1);
2766 /*
2767 * Propagate the error only if there is one. Ignore a positive
2768 * number of returned events and interrupted polls (EINTR).
2769 */
2770 if (err > 0 || (err < 0 && errno == EINTR))
2771 err = 0;
2772 thread->waking++;
2773
2774 if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2775 record__thread_munmap_filtered, NULL) == 0)
2776 draining = true;
2777
2778 err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2779 if (err)
2780 goto out_child;
2781 }
2782
2783 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2784 switch (cmd) {
2785 case EVLIST_CTL_CMD_SNAPSHOT:
2786 hit_auxtrace_snapshot_trigger(rec);
2787 evlist__ctlfd_ack(rec->evlist);
2788 break;
2789 case EVLIST_CTL_CMD_STOP:
2790 done = 1;
2791 break;
2792 case EVLIST_CTL_CMD_ACK:
2793 case EVLIST_CTL_CMD_UNSUPPORTED:
2794 case EVLIST_CTL_CMD_ENABLE:
2795 case EVLIST_CTL_CMD_DISABLE:
2796 case EVLIST_CTL_CMD_EVLIST:
2797 case EVLIST_CTL_CMD_PING:
2798 default:
2799 break;
2800 }
2801 }
2802
2803 err = event_enable_timer__process(rec->evlist->eet);
2804 if (err < 0)
2805 goto out_child;
2806 if (err) {
2807 err = 0;
2808 done = 1;
2809 }
2810
2811 /*
2812 * When perf is starting the traced process, at the end events
2813 * die with the process and we wait for that. Thus no need to
2814 * disable events in this case.
2815 */
2816 if (done && !disabled && !target__none(&opts->target)) {
2817 trigger_off(&auxtrace_snapshot_trigger);
2818 evlist__disable(rec->evlist);
2819 disabled = true;
2820 }
2821 }
2822
2823 trigger_off(&auxtrace_snapshot_trigger);
2824 trigger_off(&switch_output_trigger);
2825
2826 record__synthesize_final_bpf_metadata(rec);
2827
2828 if (opts->auxtrace_snapshot_on_exit)
2829 record__auxtrace_snapshot_exit(rec);
2830
2831 if (forks && workload_exec_errno) {
2832 char msg[STRERR_BUFSIZE];
2833 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2834 struct strbuf sb = STRBUF_INIT;
2835
2836 evlist__format_evsels(rec->evlist, &sb, 2048);
2837
2838 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2839 sb.buf, argv[0], emsg);
2840 strbuf_release(&sb);
2841 err = -1;
2842 goto out_child;
2843 }
2844
2845 if (!quiet)
2846 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2847 record__waking(rec));
2848
2849 write_finished_init(rec, true);
2850
2851 if (target__none(&rec->opts.target))
2852 record__synthesize_workload(rec, true);
2853
2854 out_child:
2855 record__stop_threads(rec);
2856 record__mmap_read_all(rec, true);
2857 out_free_threads:
2858 record__free_thread_data(rec);
2859 evlist__finalize_ctlfd(rec->evlist);
2860 record__aio_mmap_read_sync(rec);
2861
2862 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2863 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2864 env->comp_ratio = ratio + 0.5;
2865 }
2866
2867 if (forks) {
2868 int exit_status;
2869
2870 if (!child_finished)
2871 kill(rec->evlist->workload.pid, SIGTERM);
2872
2873 wait(&exit_status);
2874
2875 if (err < 0)
2876 status = err;
2877 else if (WIFEXITED(exit_status))
2878 status = WEXITSTATUS(exit_status);
2879 else if (WIFSIGNALED(exit_status))
2880 signr = WTERMSIG(exit_status);
2881 } else
2882 status = err;
2883
2884 if (rec->off_cpu)
2885 rec->bytes_written += off_cpu_write(rec->session);
2886
2887 record__read_lost_samples(rec);
2888 record__synthesize(rec, true);
2889 /* this will be recalculated during process_buildids() */
2890 rec->samples = 0;
2891
2892 if (!err) {
2893 if (!rec->timestamp_filename) {
2894 record__finish_output(rec);
2895 } else {
2896 fd = record__switch_output(rec, true);
2897 if (fd < 0) {
2898 status = fd;
2899 goto out_delete_session;
2900 }
2901 }
2902 }
2903
2904 perf_hooks__invoke_record_end();
2905
2906 if (!err && !quiet) {
2907 char samples[128];
2908 const char *postfix = rec->timestamp_filename ?
2909 ".<timestamp>" : "";
2910
2911 if (rec->samples && !rec->opts.full_auxtrace)
2912 scnprintf(samples, sizeof(samples),
2913 " (%" PRIu64 " samples)", rec->samples);
2914 else
2915 samples[0] = '\0';
2916
2917 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2918 perf_data__size(data) / 1024.0 / 1024.0,
2919 data->path, postfix, samples);
2920 if (ratio) {
2921 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2922 rec->session->bytes_transferred / 1024.0 / 1024.0,
2923 ratio);
2924 }
2925 fprintf(stderr, " ]\n");
2926 }
2927
2928 out_delete_session:
2929 #ifdef HAVE_EVENTFD_SUPPORT
2930 if (done_fd >= 0) {
2931 fd = done_fd;
2932 done_fd = -1;
2933
2934 close(fd);
2935 }
2936 #endif
2937 zstd_fini(&session->zstd_data);
2938 if (!opts->no_bpf_event)
2939 evlist__stop_sb_thread(rec->sb_evlist);
2940
2941 perf_session__delete(session);
2942 return status;
2943 }
2944
2945 static void callchain_debug(struct callchain_param *callchain)
2946 {
2947 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2948
2949 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2950
2951 if (callchain->record_mode == CALLCHAIN_DWARF)
2952 pr_debug("callchain: stack dump size %d\n",
2953 callchain->dump_size);
2954 }
2955
2956 int record_opts__parse_callchain(struct record_opts *record,
2957 struct callchain_param *callchain,
2958 const char *arg, bool unset)
2959 {
2960 int ret;
2961 callchain->enabled = !unset;
2962
2963 /* --no-call-graph */
2964 if (unset) {
2965 callchain->record_mode = CALLCHAIN_NONE;
2966 pr_debug("callchain: disabled\n");
2967 return 0;
2968 }
2969
2970 ret = parse_callchain_record_opt(arg, callchain);
2971 if (!ret) {
2972 /* Enable data address sampling for DWARF unwind. */
2973 if (callchain->record_mode == CALLCHAIN_DWARF)
2974 record->sample_address = true;
2975 callchain_debug(callchain);
2976 }
2977
2978 return ret;
2979 }
2980
2981 int record_parse_callchain_opt(const struct option *opt,
2982 const char *arg,
2983 int unset)
2984 {
2985 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2986 }
2987
2988 int record_callchain_opt(const struct option *opt,
2989 const char *arg __maybe_unused,
2990 int unset __maybe_unused)
2991 {
2992 struct callchain_param *callchain = opt->value;
2993
2994 callchain->enabled = true;
2995
2996 if (callchain->record_mode == CALLCHAIN_NONE)
2997 callchain->record_mode = CALLCHAIN_FP;
2998
2999 callchain_debug(callchain);
3000 return 0;
3001 }
3002
3003 static int perf_record_config(const char *var, const char *value, void *cb)
3004 {
3005 struct record *rec = cb;
3006
3007 if (!strcmp(var, "record.build-id")) {
3008 if (!strcmp(value, "cache"))
3009 rec->no_buildid_cache = false;
3010 else if (!strcmp(value, "no-cache"))
3011 rec->no_buildid_cache = true;
3012 else if (!strcmp(value, "skip"))
3013 rec->no_buildid = true;
3014 else if (!strcmp(value, "mmap"))
3015 rec->buildid_mmap = true;
3016 else if (!strcmp(value, "no-mmap"))
3017 rec->buildid_mmap = false;
3018 else
3019 return -1;
3020 return 0;
3021 }
3022 if (!strcmp(var, "record.call-graph")) {
3023 var = "call-graph.record-mode";
3024 return perf_default_config(var, value, cb);
3025 }
3026 #ifdef HAVE_AIO_SUPPORT
3027 if (!strcmp(var, "record.aio")) {
3028 rec->opts.nr_cblocks = strtol(value, NULL, 0);
3029 if (!rec->opts.nr_cblocks)
3030 rec->opts.nr_cblocks = nr_cblocks_default;
3031 }
3032 #endif
3033 if (!strcmp(var, "record.debuginfod")) {
3034 rec->debuginfod.urls = strdup(value);
3035 if (!rec->debuginfod.urls)
3036 return -ENOMEM;
3037 rec->debuginfod.set = true;
3038 }
3039
3040 return 0;
3041 }
3042
3043 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3044 {
3045 struct record *rec = (struct record *)opt->value;
3046
3047 return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3048 }
3049
3050 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3051 {
3052 struct record_opts *opts = (struct record_opts *)opt->value;
3053
3054 if (unset || !str)
3055 return 0;
3056
3057 if (!strcasecmp(str, "node"))
3058 opts->affinity = PERF_AFFINITY_NODE;
3059 else if (!strcasecmp(str, "cpu"))
3060 opts->affinity = PERF_AFFINITY_CPU;
3061
3062 return 0;
3063 }
3064
3065 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3066 {
3067 mask->nbits = nr_bits;
3068 mask->bits = bitmap_zalloc(mask->nbits);
3069 if (!mask->bits)
3070 return -ENOMEM;
3071
3072 return 0;
3073 }
3074
3075 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3076 {
3077 bitmap_free(mask->bits);
3078 mask->nbits = 0;
3079 }
3080
3081 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3082 {
3083 int ret;
3084
3085 ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3086 if (ret) {
3087 mask->affinity.bits = NULL;
3088 return ret;
3089 }
3090
3091 ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3092 if (ret) {
3093 record__mmap_cpu_mask_free(&mask->maps);
3094 mask->maps.bits = NULL;
3095 }
3096
3097 return ret;
3098 }
3099
3100 static void record__thread_mask_free(struct thread_mask *mask)
3101 {
3102 record__mmap_cpu_mask_free(&mask->maps);
3103 record__mmap_cpu_mask_free(&mask->affinity);
3104 }
3105
3106 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3107 {
3108 int s;
3109 struct record_opts *opts = opt->value;
3110
3111 if (unset || !str || !strlen(str)) {
3112 opts->threads_spec = THREAD_SPEC__CPU;
3113 } else {
3114 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3115 if (s == THREAD_SPEC__USER) {
3116 opts->threads_user_spec = strdup(str);
3117 if (!opts->threads_user_spec)
3118 return -ENOMEM;
3119 opts->threads_spec = THREAD_SPEC__USER;
3120 break;
3121 }
3122 if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3123 opts->threads_spec = s;
3124 break;
3125 }
3126 }
3127 }
3128
3129 if (opts->threads_spec == THREAD_SPEC__USER)
3130 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3131 else
3132 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3133
3134 return 0;
3135 }
3136
3137 static int parse_output_max_size(const struct option *opt,
3138 const char *str, int unset)
3139 {
3140 unsigned long *s = (unsigned long *)opt->value;
3141 static struct parse_tag tags_size[] = {
3142 { .tag = 'B', .mult = 1 },
3143 { .tag = 'K', .mult = 1 << 10 },
3144 { .tag = 'M', .mult = 1 << 20 },
3145 { .tag = 'G', .mult = 1 << 30 },
3146 { .tag = 0 },
3147 };
3148 unsigned long val;
3149
3150 if (unset) {
3151 *s = 0;
3152 return 0;
3153 }
3154
3155 val = parse_tag_value(str, tags_size);
3156 if (val != (unsigned long) -1) {
3157 *s = val;
3158 return 0;
3159 }
3160
3161 return -1;
3162 }
3163
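/*
 * Parse -m/--mmap-pages: an optional second, comma-separated value sets the
 * AUX area tracing mmap size.
 */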
3164 static int record__parse_mmap_pages(const struct option *opt,
3165 const char *str,
3166 int unset __maybe_unused)
3167 {
3168 struct record_opts *opts = opt->value;
3169 char *s, *p;
3170 unsigned int mmap_pages;
3171 int ret;
3172
3173 if (!str)
3174 return -EINVAL;
3175
3176 s = strdup(str);
3177 if (!s)
3178 return -ENOMEM;
3179
3180 p = strchr(s, ',');
3181 if (p)
3182 *p = '\0';
3183
3184 if (*s) {
3185 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3186 if (ret)
3187 goto out_free;
3188 opts->mmap_pages = mmap_pages;
3189 }
3190
3191 if (!p) {
3192 ret = 0;
3193 goto out_free;
3194 }
3195
3196 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3197 if (ret)
3198 goto out_free;
3199
3200 opts->auxtrace_mmap_pages = mmap_pages;
3201
3202 out_free:
3203 free(s);
3204 return ret;
3205 }
3206
3207 static int record__parse_off_cpu_thresh(const struct option *opt,
3208 const char *str,
3209 int unset __maybe_unused)
3210 {
3211 struct record_opts *opts = opt->value;
3212 char *endptr;
3213 u64 off_cpu_thresh_ms;
3214
3215 if (!str)
3216 return -EINVAL;
3217
3218 off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3219
3220 /* if the string isn't "0" yet strtoull() returned 0, parsing failed */
3221 if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3222 return -EINVAL;
3223 else
3224 opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3225
3226 return 0;
3227 }
3228
3229 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3230 {
3231 }
3232
3233 static int parse_control_option(const struct option *opt,
3234 const char *str,
3235 int unset __maybe_unused)
3236 {
3237 struct record_opts *opts = opt->value;
3238
3239 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3240 }
3241
3242 static void switch_output_size_warn(struct record *rec)
3243 {
3244 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3245 struct switch_output *s = &rec->switch_output;
3246
3247 wakeup_size /= 2;
3248
3249 if (s->size < wakeup_size) {
3250 char buf[100];
3251
3252 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3253 pr_warning("WARNING: switch-output data size lower than "
3254 "wakeup kernel buffer size (%s) "
3255 "expect bigger perf.data sizes\n", buf);
3256 }
3257 }
3258
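/*
 * Parse --switch-output: "signal", a size threshold ([BKMG]) or a time
 * threshold ([smhd]). Implies --timestamp-filename; not available in
 * parallel streaming mode.
 */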
3259 static int switch_output_setup(struct record *rec)
3260 {
3261 struct switch_output *s = &rec->switch_output;
3262 static struct parse_tag tags_size[] = {
3263 { .tag = 'B', .mult = 1 },
3264 { .tag = 'K', .mult = 1 << 10 },
3265 { .tag = 'M', .mult = 1 << 20 },
3266 { .tag = 'G', .mult = 1 << 30 },
3267 { .tag = 0 },
3268 };
3269 static struct parse_tag tags_time[] = {
3270 { .tag = 's', .mult = 1 },
3271 { .tag = 'm', .mult = 60 },
3272 { .tag = 'h', .mult = 60*60 },
3273 { .tag = 'd', .mult = 60*60*24 },
3274 { .tag = 0 },
3275 };
3276 unsigned long val;
3277
3278 /*
3279 * If we're using --switch-output-events, then we imply its
3280 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3281 * thread to its parent.
3282 */
3283 if (rec->switch_output_event_set) {
3284 if (record__threads_enabled(rec)) {
3285 pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3286 return 0;
3287 }
3288 goto do_signal;
3289 }
3290
3291 if (!s->set)
3292 return 0;
3293
3294 if (record__threads_enabled(rec)) {
3295 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3296 return 0;
3297 }
3298
3299 if (!strcmp(s->str, "signal")) {
3300 do_signal:
3301 s->signal = true;
3302 pr_debug("switch-output with SIGUSR2 signal\n");
3303 goto enabled;
3304 }
3305
3306 val = parse_tag_value(s->str, tags_size);
3307 if (val != (unsigned long) -1) {
3308 s->size = val;
3309 pr_debug("switch-output with %s size threshold\n", s->str);
3310 goto enabled;
3311 }
3312
3313 val = parse_tag_value(s->str, tags_time);
3314 if (val != (unsigned long) -1) {
3315 s->time = val;
3316 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3317 s->str, s->time);
3318 goto enabled;
3319 }
3320
3321 return -1;
3322
3323 enabled:
3324 rec->timestamp_filename = true;
3325 s->enabled = true;
3326
3327 if (s->size && !rec->opts.no_buffering)
3328 switch_output_size_warn(rec);
3329
3330 return 0;
3331 }
3332
3333 static const char * const __record_usage[] = {
3334 "perf record [<options>] [<command>]",
3335 "perf record [<options>] -- <command> [<options>]",
3336 NULL
3337 };
3338 const char * const *record_usage = __record_usage;
3339
3340 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3341 struct perf_sample *sample, struct machine *machine)
3342 {
3343 /*
3344 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3345 * so there is no need to add them twice.
3346 */
3347 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3348 return 0;
3349 return perf_event__process_mmap(tool, event, sample, machine);
3350 }
3351
3352 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3353 struct perf_sample *sample, struct machine *machine)
3354 {
3355 /*
3356 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3357 * so there is no need to add them twice.
3358 */
3359 if (!(event->header.misc & PERF_RECORD_MISC_USER))
3360 return 0;
3361
3362 return perf_event__process_mmap2(tool, event, sample, machine);
3363 }
3364
3365 static int process_timestamp_boundary(const struct perf_tool *tool,
3366 union perf_event *event __maybe_unused,
3367 struct perf_sample *sample,
3368 struct machine *machine __maybe_unused)
3369 {
3370 struct record *rec = container_of(tool, struct record, tool);
3371
3372 set_timestamp_boundary(rec, sample->time);
3373 return 0;
3374 }
3375
3376 static int parse_record_synth_option(const struct option *opt,
3377 const char *str,
3378 int unset __maybe_unused)
3379 {
3380 struct record_opts *opts = opt->value;
3381 char *p = strdup(str);
3382
3383 if (p == NULL)
3384 return -1;
3385
3386 opts->synth = parse_synth_opt(p);
3387 free(p);
3388
3389 if (opts->synth < 0) {
3390 pr_err("Invalid synth option: %s\n", str);
3391 return -1;
3392 }
3393 return 0;
3394 }
3395
3396 /*
3397 * XXX Ideally would be local to cmd_record() and passed to a record__new
3398 * because we need to have access to it in record__exit, that is called
3399 * after cmd_record() exits, but since record_options need to be accessible to
3400 * builtin-script, leave it here.
3401 *
3402 * At least we don't touch it in all the other functions here directly.
3403 *
3404 * Just say no to tons of global variables, sigh.
3405 */
3406 static struct record record = {
3407 .opts = {
3408 .sample_time = true,
3409 .mmap_pages = UINT_MAX,
3410 .user_freq = UINT_MAX,
3411 .user_interval = ULLONG_MAX,
3412 .freq = 4000,
3413 .target = {
3414 .uses_mmap = true,
3415 .default_per_cpu = true,
3416 },
3417 .mmap_flush = MMAP_FLUSH_DEFAULT,
3418 .nr_threads_synthesize = 1,
3419 .ctl_fd = -1,
3420 .ctl_fd_ack = -1,
3421 .synth = PERF_SYNTH_ALL,
3422 .off_cpu_thresh_ns = OFFCPU_THRESH,
3423 },
3424 .buildid_mmap = true,
3425 };
3426
3427 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3428 "\n\t\t\t\tDefault: fp";
3429
3430 static bool dry_run;
3431
3432 static struct parse_events_option_args parse_events_option_args = {
3433 .evlistp = &record.evlist,
3434 };
3435
3436 static struct parse_events_option_args switch_output_parse_events_option_args = {
3437 .evlistp = &record.sb_evlist,
3438 };
3439
3440 /*
3441 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3442 * with it and switch to use the library functions in perf_evlist that came
3443 * from builtin-record.c, i.e. use record_opts,
3444 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3445 * using pipes, etc.
3446 */
3447 static struct option __record_options[] = {
3448 OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3449 "event selector. use 'perf list' to list available events",
3450 parse_events_option),
3451 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3452 "event filter", parse_filter),
3453 OPT_BOOLEAN(0, "latency", &record.latency,
3454 "Enable data collection for latency profiling.\n"
3455 "\t\t\t Use perf report --latency for latency-centric profile."),
3456 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3457 NULL, "don't record events from perf itself",
3458 exclude_perf),
3459 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3460 "record events on existing process id"),
3461 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3462 "record events on existing thread id"),
3463 OPT_INTEGER('r', "realtime", &record.realtime_prio,
3464 "collect data with this RT SCHED_FIFO priority"),
3465 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3466 "collect data without buffering"),
3467 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3468 "collect raw sample records from all opened counters"),
3469 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3470 "system-wide collection from all CPUs"),
3471 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3472 "list of cpus to monitor"),
3473 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3474 OPT_STRING('o', "output", &record.data.path, "file",
3475 "output file name"),
3476 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3477 &record.opts.no_inherit_set,
3478 "child tasks do not inherit counters"),
3479 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3480 "synthesize non-sample events at the end of output"),
3481 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3482 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3483 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3484 "Fail if the specified frequency can't be used"),
3485 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3486 "profile at this frequency",
3487 record__parse_freq),
3488 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3489 "number of mmap data pages and AUX area tracing mmap pages",
3490 record__parse_mmap_pages),
3491 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3492 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3493 record__mmap_flush_parse),
3494 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3495 NULL, "enables call-graph recording" ,
3496 &record_callchain_opt),
3497 OPT_CALLBACK(0, "call-graph", &record.opts,
3498 "record_mode[,record_size]", record_callchain_help,
3499 &record_parse_callchain_opt),
3500 OPT_INCR('v', "verbose", &verbose,
3501 "be more verbose (show counter open errors, etc)"),
3502 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3503 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3504 "per thread counts"),
3505 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3506 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3507 "Record the sample physical addresses"),
3508 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3509 "Record the sampled data address data page size"),
3510 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3511 "Record the sampled code address (ip) page size"),
3512 OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3513 "Record the data source for memory operations"),
3514 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3515 OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3516 "Record the sample identifier"),
3517 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3518 &record.opts.sample_time_set,
3519 "Record the sample timestamps"),
3520 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3521 "Record the sample period"),
3522 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3523 "don't sample"),
3524 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3525 &record.no_buildid_cache_set,
3526 "do not update the buildid cache"),
3527 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3528 &record.no_buildid_set,
3529 "do not collect buildids in perf.data"),
3530 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3531 "monitor event in cgroup name only",
3532 parse_cgroups),
3533 OPT_CALLBACK('D', "delay", &record, "ms",
3534 "ms to wait before starting measurement after program start (-1: start with events disabled), "
3535 "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3536 record__parse_event_enable_time),
3537 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3538 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3539
3540 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3541 "branch any", "sample any taken branches",
3542 parse_branch_stack),
3543
3544 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3545 "branch filter mask", "branch stack filter modes",
3546 parse_branch_stack),
3547 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3548 "sample by weight (on special events only)"),
3549 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3550 "sample transaction flags (special events only)"),
3551 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3552 "use per-thread mmaps"),
3553 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3554 "sample selected machine registers on interrupt,"
3555 " use '-I?' to list register names", parse_intr_regs),
3556 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3557 "sample selected machine registers in user space,"
3558 " use '--user-regs=?' to list register names", parse_user_regs),
3559 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3560 "Record running/enabled time of read (:S) events"),
3561 OPT_CALLBACK('k', "clockid", &record.opts,
3562 "clockid", "clockid to use for events, see clock_gettime()",
3563 parse_clockid),
3564 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3565 "opts", "AUX area tracing Snapshot Mode", ""),
3566 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3567 "opts", "sample AUX area", ""),
3568 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3569 "per thread proc mmap processing timeout in ms"),
3570 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3571 "Record namespaces events"),
3572 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3573 "Record cgroup events"),
3574 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3575 &record.opts.record_switch_events_set,
3576 "Record context switch events"),
3577 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3578 "Configure all used events to run in kernel space.",
3579 PARSE_OPT_EXCLUSIVE),
3580 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3581 "Configure all used events to run in user space.",
3582 PARSE_OPT_EXCLUSIVE),
3583 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3584 "collect kernel callchains"),
3585 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3586 "collect user callchains"),
3587 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3588 "file", "vmlinux pathname"),
3589 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3590 "Record build-id of all DSOs regardless of hits"),
3591 OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
3592 "Record build-id in mmap events and skip build-id processing."),
3593 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3594 "append timestamp to output filename"),
3595 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3596 "Record timestamp boundary (time of first/last samples)"),
3597 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3598 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3599 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3600 "signal"),
3601 OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3602 &record.switch_output_event_set, "switch output event",
3603 "switch output event selector. use 'perf list' to list available events",
3604 parse_events_option_new_evlist),
3605 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3606 "Limit number of switch output generated files"),
3607 OPT_BOOLEAN(0, "dry-run", &dry_run,
3608 "Parse options then exit"),
3609 #ifdef HAVE_AIO_SUPPORT
3610 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3611 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3612 record__aio_parse),
3613 #endif
3614 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3615 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3616 record__parse_affinity),
3617 #ifdef HAVE_ZSTD_SUPPORT
3618 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3619 "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3620 record__parse_comp_level),
3621 #endif
3622 OPT_CALLBACK(0, "max-size", &record.output_max_size,
3623 "size", "Limit the maximum size of the output file", parse_output_max_size),
3624 OPT_UINTEGER(0, "num-thread-synthesize",
3625 &record.opts.nr_threads_synthesize,
3626 "number of threads to run for event synthesis"),
3627 #ifdef HAVE_LIBPFM
3628 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3629 "libpfm4 event selector. use 'perf list' to list available events",
3630 parse_libpfm_events_option),
3631 #endif
3632 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3633 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3634 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
3635 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3636 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3637 parse_control_option),
3638 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3639 "Fine-tune event synthesis: default=all", parse_record_synth_option),
3640 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3641 &record.debuginfod.set, "debuginfod urls",
3642 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3643 "system"),
3644 OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3645 "write collected trace data into several data files using parallel threads",
3646 record__parse_threads),
3647 OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3648 OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3649 "BPF filter action"),
3650 OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3651 "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3652 record__parse_off_cpu_thresh),
3653 OPT_END()
3654 };
3655
3656 struct option *record_options = __record_options;
3657
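/*
 * Set a bit in *mask for every CPU present in the given CPU map. A dummy
 * CPU map leaves the mask untouched; a CPU index that does not fit in the
 * mask yields -ENODEV.
 */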
3658 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3659 {
3660 struct perf_cpu cpu;
3661 int idx;
3662
3663 if (cpu_map__is_dummy(cpus))
3664 return 0;
3665
3666 perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3667 /* Return -ENODEV if the input cpu is greater than max cpu */
3668 if ((unsigned long)cpu.cpu > mask->nbits)
3669 return -ENODEV;
3670 __set_bit(cpu.cpu, mask->bits);
3671 }
3672
3673 return 0;
3674 }
3675
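/*
 * Parse a CPU list spec string (e.g. "0-3,7") and record the listed CPUs
 * in *mask. The mask is zeroed first, so only the CPUs named in mask_spec
 * end up set.
 */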
3676 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3677 {
3678 struct perf_cpu_map *cpus;
3679
3680 cpus = perf_cpu_map__new(mask_spec);
3681 if (!cpus)
3682 return -ENOMEM;
3683
3684 bitmap_zero(mask->bits, mask->nbits);
3685 if (record__mmap_cpu_mask_init(mask, cpus)) {
3686 perf_cpu_map__put(cpus);
3687 return -ENODEV;
3688 }
3689 perf_cpu_map__put(cpus);
3690 return 0;
3691 }
3692
3693 static void record__free_thread_masks(struct record *rec, int nr_threads)
3694 {
3695 int t;
3696
3697 if (rec->thread_masks)
3698 for (t = 0; t < nr_threads; t++)
3699 record__thread_mask_free(&rec->thread_masks[t]);
3700
3701 zfree(&rec->thread_masks);
3702 }
3703
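/*
 * Allocate nr_threads thread masks, each wide enough to hold nr_bits CPUs,
 * releasing everything already allocated if any allocation fails.
 */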
3704 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3705 {
3706 int t, ret;
3707
3708 rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3709 if (!rec->thread_masks) {
3710 pr_err("Failed to allocate thread masks\n");
3711 return -ENOMEM;
3712 }
3713
3714 for (t = 0; t < nr_threads; t++) {
3715 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3716 if (ret) {
3717 pr_err("Failed to allocate thread masks[%d]\n", t);
3718 goto out_free;
3719 }
3720 }
3721
3722 return 0;
3723
3724 out_free:
3725 record__free_thread_masks(rec, nr_threads);
3726
3727 return ret;
3728 }
3729
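/*
 * --threads=cpu: create one data streaming thread per CPU in the evlist
 * CPU map, with both the maps and affinity masks reduced to that one CPU.
 */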
3730 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3731 {
3732 int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3733
3734 ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3735 if (ret)
3736 return ret;
3737
3738 rec->nr_threads = nr_cpus;
3739 pr_debug("nr_threads: %d\n", rec->nr_threads);
3740
3741 for (t = 0; t < rec->nr_threads; t++) {
3742 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3743 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3744 if (verbose > 0) {
3745 pr_debug("thread_masks[%d]: ", t);
3746 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3747 pr_debug("thread_masks[%d]: ", t);
3748 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3749 }
3750 }
3751
3752 return 0;
3753 }
3754
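/*
 * Build one thread mask per maps/affinity spec pair. Each spec is
 * intersected with the set of CPUs actually being recorded and must not
 * end up empty, nor may it overlap any previously accepted spec.
 */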
3755 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3756 const char **maps_spec, const char **affinity_spec,
3757 u32 nr_spec)
3758 {
3759 u32 s;
3760 int ret = 0, t = 0;
3761 struct mmap_cpu_mask cpus_mask;
3762 struct thread_mask thread_mask, full_mask, *thread_masks;
3763
3764 ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3765 if (ret) {
3766 pr_err("Failed to allocate CPUs mask\n");
3767 return ret;
3768 }
3769
3770 ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3771 if (ret) {
3772 pr_err("Failed to init cpu mask\n");
3773 goto out_free_cpu_mask;
3774 }
3775
3776 ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3777 if (ret) {
3778 pr_err("Failed to allocate full mask\n");
3779 goto out_free_cpu_mask;
3780 }
3781
3782 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3783 if (ret) {
3784 pr_err("Failed to allocate thread mask\n");
3785 goto out_free_full_and_cpu_masks;
3786 }
3787
3788 for (s = 0; s < nr_spec; s++) {
3789 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3790 if (ret) {
3791 pr_err("Failed to initialize maps thread mask\n");
3792 goto out_free;
3793 }
3794 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3795 if (ret) {
3796 pr_err("Failed to initialize affinity thread mask\n");
3797 goto out_free;
3798 }
3799
3800 /* ignore invalid CPUs but do not allow empty masks */
3801 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3802 cpus_mask.bits, thread_mask.maps.nbits)) {
3803 pr_err("Empty maps mask: %s\n", maps_spec[s]);
3804 ret = -EINVAL;
3805 goto out_free;
3806 }
3807 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3808 cpus_mask.bits, thread_mask.affinity.nbits)) {
3809 pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3810 ret = -EINVAL;
3811 goto out_free;
3812 }
3813
3814 /* do not allow intersection with other masks (full_mask) */
3815 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3816 thread_mask.maps.nbits)) {
3817 pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3818 ret = -EINVAL;
3819 goto out_free;
3820 }
3821 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3822 thread_mask.affinity.nbits)) {
3823 pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3824 ret = -EINVAL;
3825 goto out_free;
3826 }
3827
3828 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3829 thread_mask.maps.bits, full_mask.maps.nbits);
3830 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3831 thread_mask.affinity.bits, full_mask.affinity.nbits);
3832
3833 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3834 if (!thread_masks) {
3835 pr_err("Failed to reallocate thread masks\n");
3836 ret = -ENOMEM;
3837 goto out_free;
3838 }
3839 rec->thread_masks = thread_masks;
3840 rec->thread_masks[t] = thread_mask;
3841 if (verbose > 0) {
3842 pr_debug("thread_masks[%d]: ", t);
3843 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3844 pr_debug("thread_masks[%d]: ", t);
3845 mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3846 }
3847 t++;
3848 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3849 if (ret) {
3850 pr_err("Failed to allocate thread mask\n");
3851 goto out_free_full_and_cpu_masks;
3852 }
3853 }
3854 rec->nr_threads = t;
3855 pr_debug("nr_threads: %d\n", rec->nr_threads);
3856 if (!rec->nr_threads)
3857 ret = -EINVAL;
3858
3859 out_free:
3860 record__thread_mask_free(&thread_mask);
3861 out_free_full_and_cpu_masks:
3862 record__thread_mask_free(&full_mask);
3863 out_free_cpu_mask:
3864 record__mmap_cpu_mask_free(&cpus_mask);
3865
3866 return ret;
3867 }
3868
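/*
 * --threads=core: one data streaming thread per core, built from the
 * per-core CPU lists in the CPU topology.
 */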
3869 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3870 {
3871 int ret;
3872 struct cpu_topology *topo;
3873
3874 topo = cpu_topology__new();
3875 if (!topo) {
3876 pr_err("Failed to allocate CPU topology\n");
3877 return -ENOMEM;
3878 }
3879
3880 ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3881 topo->core_cpus_list, topo->core_cpus_lists);
3882 cpu_topology__delete(topo);
3883
3884 return ret;
3885 }
3886
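/*
 * --threads=package: one data streaming thread per processor package
 * (socket), built from the per-package CPU lists in the CPU topology.
 */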
3887 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3888 {
3889 int ret;
3890 struct cpu_topology *topo;
3891
3892 topo = cpu_topology__new();
3893 if (!topo) {
3894 pr_err("Failed to allocate CPU topology\n");
3895 return -ENOMEM;
3896 }
3897
3898 ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3899 topo->package_cpus_list, topo->package_cpus_lists);
3900 cpu_topology__delete(topo);
3901
3902 return ret;
3903 }
3904
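/*
 * --threads=numa: one data streaming thread per NUMA node, using each
 * node's CPU list as both the maps and the affinity spec.
 */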
3905 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3906 {
3907 u32 s;
3908 int ret;
3909 const char **spec;
3910 struct numa_topology *topo;
3911
3912 topo = numa_topology__new();
3913 if (!topo) {
3914 pr_err("Failed to allocate NUMA topology\n");
3915 return -ENOMEM;
3916 }
3917
3918 spec = zalloc(topo->nr * sizeof(char *));
3919 if (!spec) {
3920 pr_err("Failed to allocate NUMA spec\n");
3921 ret = -ENOMEM;
3922 goto out_delete_topo;
3923 }
3924 for (s = 0; s < topo->nr; s++)
3925 spec[s] = topo->nodes[s].cpus;
3926
3927 ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3928
3929 zfree(&spec);
3930
3931 out_delete_topo:
3932 numa_topology__delete(topo);
3933
3934 return ret;
3935 }
3936
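/*
 * --threads=<maps>/<affinity>[:<maps>/<affinity>...]: user defined masks.
 * Each ':' separated entry carries a maps CPU list and an affinity CPU
 * list split by '/', e.g. "0-3/0:4-7/4" (illustrative values only).
 */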
3937 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3938 {
3939 int t, ret;
3940 u32 s, nr_spec = 0;
3941 char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3942 char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3943
3944 for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3945 spec = strtok_r(user_spec, ":", &spec_ptr);
3946 if (spec == NULL)
3947 break;
3948 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3949 mask = strtok_r(spec, "/", &mask_ptr);
3950 if (mask == NULL)
3951 break;
3952 pr_debug2(" maps mask: %s\n", mask);
3953 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3954 if (!tmp_spec) {
3955 pr_err("Failed to reallocate maps spec\n");
3956 ret = -ENOMEM;
3957 goto out_free;
3958 }
3959 maps_spec = tmp_spec;
3960 maps_spec[nr_spec] = dup_mask = strdup(mask);
3961 if (!maps_spec[nr_spec]) {
3962 pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3963 ret = -ENOMEM;
3964 goto out_free;
3965 }
3966 mask = strtok_r(NULL, "/", &mask_ptr);
3967 if (mask == NULL) {
3968 pr_err("Invalid thread maps or affinity specs\n");
3969 ret = -EINVAL;
3970 goto out_free;
3971 }
3972 pr_debug2(" affinity mask: %s\n", mask);
3973 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3974 if (!tmp_spec) {
3975 pr_err("Failed to reallocate affinity spec\n");
3976 ret = -ENOMEM;
3977 goto out_free;
3978 }
3979 affinity_spec = tmp_spec;
3980 affinity_spec[nr_spec] = strdup(mask);
3981 if (!affinity_spec[nr_spec]) {
3982 pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3983 ret = -ENOMEM;
3984 goto out_free;
3985 }
3986 dup_mask = NULL;
3987 nr_spec++;
3988 }
3989
3990 ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3991 (const char **)affinity_spec, nr_spec);
3992
3993 out_free:
3994 free(dup_mask);
3995 for (s = 0; s < nr_spec; s++) {
3996 if (maps_spec)
3997 free(maps_spec[s]);
3998 if (affinity_spec)
3999 free(affinity_spec[s]);
4000 }
4001 free(affinity_spec);
4002 free(maps_spec);
4003
4004 return ret;
4005 }
4006
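/*
 * Default layout (no --threads): a single data streaming thread whose maps
 * mask covers every CPU in the evlist CPU map.
 */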
4007 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4008 {
4009 int ret;
4010
4011 ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4012 if (ret)
4013 return ret;
4014
4015 if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4016 return -ENODEV;
4017
4018 rec->nr_threads = 1;
4019
4020 return 0;
4021 }
4022
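/*
 * Choose the thread mask layout: the default single thread unless parallel
 * streaming was requested, in which case the masks are derived from the
 * selected spec (cpu, core, package, numa or user defined).
 */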
4023 static int record__init_thread_masks(struct record *rec)
4024 {
4025 int ret = 0;
4026 struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4027
4028 if (!record__threads_enabled(rec))
4029 return record__init_thread_default_masks(rec, cpus);
4030
4031 if (evlist__per_thread(rec->evlist)) {
4032 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4033 return -EINVAL;
4034 }
4035
4036 switch (rec->opts.threads_spec) {
4037 case THREAD_SPEC__CPU:
4038 ret = record__init_thread_cpu_masks(rec, cpus);
4039 break;
4040 case THREAD_SPEC__CORE:
4041 ret = record__init_thread_core_masks(rec, cpus);
4042 break;
4043 case THREAD_SPEC__PACKAGE:
4044 ret = record__init_thread_package_masks(rec, cpus);
4045 break;
4046 case THREAD_SPEC__NUMA:
4047 ret = record__init_thread_numa_masks(rec, cpus);
4048 break;
4049 case THREAD_SPEC__USER:
4050 ret = record__init_thread_user_masks(rec, cpus);
4051 break;
4052 default:
4053 break;
4054 }
4055
4056 return ret;
4057 }
4058
4059 int cmd_record(int argc, const char **argv)
4060 {
4061 int err;
4062 struct record *rec = &record;
4063 char errbuf[BUFSIZ];
4064
4065 setlocale(LC_ALL, "");
4066
4067 #ifndef HAVE_BPF_SKEL
4068 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4069 set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4070 # undef set_nobuild
4071 #endif
4072
4073 /* Disable eager loading of kernel symbols that adds overhead to perf record. */
4074 symbol_conf.lazy_load_kernel_maps = true;
4075 rec->opts.affinity = PERF_AFFINITY_SYS;
4076
4077 rec->evlist = evlist__new();
4078 if (rec->evlist == NULL)
4079 return -ENOMEM;
4080
4081 err = perf_config(perf_record_config, rec);
4082 if (err)
4083 return err;
4084
4085 argc = parse_options(argc, argv, record_options, record_usage,
4086 PARSE_OPT_STOP_AT_NON_OPTION);
4087 if (quiet)
4088 perf_quiet_option();
4089
4090 err = symbol__validate_sym_arguments();
4091 if (err)
4092 return err;
4093
4094 perf_debuginfod_setup(&record.debuginfod);
4095
4096 /* Make system wide (-a) the default target. */
4097 if (!argc && target__none(&rec->opts.target))
4098 rec->opts.target.system_wide = true;
4099
4100 if (nr_cgroups && !rec->opts.target.system_wide) {
4101 usage_with_options_msg(record_usage, record_options,
4102 "cgroup monitoring only available in system-wide mode");
4103
4104 }
4105
4106 if (record.latency) {
4107 /*
4108 * There is no fundamental reason why latency profiling
4109 * can't work for system-wide mode, but exact semantics
4110 * and details are to be defined.
4111 * See the following thread for details:
4112 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4113 */
4114 if (record.opts.target.system_wide) {
4115 pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4116 err = -EINVAL;
4117 goto out_opts;
4118 }
4119 record.opts.record_switch_events = true;
4120 }
4121
4122 if (!rec->buildid_mmap) {
4123 pr_debug("Disabling build id in synthesized mmap2 events.\n");
4124 symbol_conf.no_buildid_mmap2 = true;
4125 } else if (rec->buildid_mmap_set) {
4126 /*
4127 * Explicitly passing --buildid-mmap disables buildid processing
4128 * and cache generation.
4129 */
4130 rec->no_buildid = true;
4131 }
4132 if (rec->buildid_mmap && !perf_can_record_build_id()) {
4133 pr_warning("Missing support for build id in kernel mmap events.\n"
4134 "Disable this warning with --no-buildid-mmap\n");
4135 rec->buildid_mmap = false;
4136 }
4137 if (rec->buildid_mmap) {
4138 /* Enable perf_event_attr::build_id bit. */
4139 rec->opts.build_id = true;
4140 }
4141
4142 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4143 pr_err("Kernel has no cgroup sampling support.\n");
4144 err = -EINVAL;
4145 goto out_opts;
4146 }
4147
4148 if (rec->opts.kcore)
4149 rec->opts.text_poke = true;
4150
4151 if (rec->opts.kcore || record__threads_enabled(rec))
4152 rec->data.is_dir = true;
4153
4154 if (record__threads_enabled(rec)) {
4155 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4156 pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4157 goto out_opts;
4158 }
4159 if (record__aio_enabled(rec)) {
4160 pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4161 goto out_opts;
4162 }
4163 }
4164
4165 if (rec->opts.comp_level != 0) {
4166 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4167 rec->no_buildid = true;
4168 }
4169
4170 if (rec->opts.record_switch_events &&
4171 !perf_can_record_switch_events()) {
4172 ui__error("kernel does not support recording context switch events\n");
4173 parse_options_usage(record_usage, record_options, "switch-events", 0);
4174 err = -EINVAL;
4175 goto out_opts;
4176 }
4177
4178 if (switch_output_setup(rec)) {
4179 parse_options_usage(record_usage, record_options, "switch-output", 0);
4180 err = -EINVAL;
4181 goto out_opts;
4182 }
4183
4184 if (rec->switch_output.time) {
4185 signal(SIGALRM, alarm_sig_handler);
4186 alarm(rec->switch_output.time);
4187 }
4188
4189 if (rec->switch_output.num_files) {
4190 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4191 sizeof(char *));
4192 if (!rec->switch_output.filenames) {
4193 err = -EINVAL;
4194 goto out_opts;
4195 }
4196 }
4197
4198 if (rec->timestamp_filename && record__threads_enabled(rec)) {
4199 rec->timestamp_filename = false;
4200 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4201 }
4202
4203 if (rec->filter_action) {
4204 if (!strcmp(rec->filter_action, "pin"))
4205 err = perf_bpf_filter__pin();
4206 else if (!strcmp(rec->filter_action, "unpin"))
4207 err = perf_bpf_filter__unpin();
4208 else {
4209 pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4210 err = -EINVAL;
4211 }
4212 goto out_opts;
4213 }
4214
4215 /* For backward compatibility, -d implies --sample-mem-info */
4216 if (rec->opts.sample_address)
4217 rec->opts.sample_data_src = true;
4218
4219 /*
4220 * Allow aliases to facilitate the lookup of symbols for address
4221 * filters. Refer to auxtrace_parse_filters().
4222 */
4223 symbol_conf.allow_aliases = true;
4224
4225 symbol__init(NULL);
4226
4227 err = record__auxtrace_init(rec);
4228 if (err)
4229 goto out;
4230
4231 if (dry_run)
4232 goto out;
4233
4234 err = -ENOMEM;
4235
4236 if (rec->no_buildid_cache || rec->no_buildid) {
4237 disable_buildid_cache();
4238 } else if (rec->switch_output.enabled) {
4239 /*
4240 * In 'perf record --switch-output', disable buildid
4241 * generation by default to reduce data file switching
4242 * overhead. Still generate buildids if they are explicitly
4243 * required, using:
4244 *
4245 * perf record --switch-output --no-no-buildid \
4246 * --no-no-buildid-cache
4247 *
4248 * The following code is equivalent to:
4249 *
4250 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4251 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4252 * disable_buildid_cache();
4253 */
4254 bool disable = true;
4255
4256 if (rec->no_buildid_set && !rec->no_buildid)
4257 disable = false;
4258 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4259 disable = false;
4260 if (disable) {
4261 rec->no_buildid = true;
4262 rec->no_buildid_cache = true;
4263 disable_buildid_cache();
4264 }
4265 }
4266
4267 if (record.opts.overwrite)
4268 record.opts.tail_synthesize = true;
4269
4270 if (rec->evlist->core.nr_entries == 0) {
4271 err = parse_event(rec->evlist, "cycles:P");
4272 if (err)
4273 goto out;
4274 }
4275
4276 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4277 rec->opts.no_inherit = true;
4278
4279 err = target__validate(&rec->opts.target);
4280 if (err) {
4281 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4282 ui__warning("%s\n", errbuf);
4283 }
4284
4285 if (rec->uid_str) {
4286 uid_t uid = parse_uid(rec->uid_str);
4287
4288 if (uid == UINT_MAX) {
4289 ui__error("Invalid User: %s", rec->uid_str);
4290 err = -EINVAL;
4291 goto out;
4292 }
4293 err = parse_uid_filter(rec->evlist, uid);
4294 if (err)
4295 goto out;
4296
4297 /* User ID filtering implies system wide. */
4298 rec->opts.target.system_wide = true;
4299 }
4300
4301 /* Enable ignoring missing threads when -p option is defined. */
4302 rec->opts.ignore_missing_thread = rec->opts.target.pid;
4303
4304 evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4305
4306 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4307 arch__add_leaf_frame_record_opts(&rec->opts);
4308
4309 err = -ENOMEM;
4310 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4311 if (rec->opts.target.pid != NULL) {
4312 pr_err("Couldn't create thread/CPU maps: %s\n",
4313 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4314 goto out;
4315 }
4316 else
4317 usage_with_options(record_usage, record_options);
4318 }
4319
4320 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4321 if (err)
4322 goto out;
4323
4324 /*
4325 * We take all buildids when the file contains AUX area
4326 * tracing data, because we do not decode the trace
4327 * (decoding it would take too long).
4328 */
4329 if (rec->opts.full_auxtrace)
4330 rec->buildid_all = true;
4331
4332 if (rec->opts.text_poke) {
4333 err = record__config_text_poke(rec->evlist);
4334 if (err) {
4335 pr_err("record__config_text_poke failed, error %d\n", err);
4336 goto out;
4337 }
4338 }
4339
4340 if (rec->off_cpu) {
4341 err = record__config_off_cpu(rec);
4342 if (err) {
4343 pr_err("record__config_off_cpu failed, error %d\n", err);
4344 goto out;
4345 }
4346 }
4347
4348 if (record_opts__config(&rec->opts)) {
4349 err = -EINVAL;
4350 goto out;
4351 }
4352
4353 err = record__config_tracking_events(rec);
4354 if (err) {
4355 pr_err("record__config_tracking_events failed, error %d\n", err);
4356 goto out;
4357 }
4358
4359 err = record__init_thread_masks(rec);
4360 if (err) {
4361 pr_err("Failed to initialize parallel data streaming masks\n");
4362 goto out;
4363 }
4364
4365 if (rec->opts.nr_cblocks > nr_cblocks_max)
4366 rec->opts.nr_cblocks = nr_cblocks_max;
4367 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4368
4369 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4370 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4371
4372 if (rec->opts.comp_level > comp_level_max)
4373 rec->opts.comp_level = comp_level_max;
4374 pr_debug("comp level: %d\n", rec->opts.comp_level);
4375
4376 err = __cmd_record(&record, argc, argv);
4377 out:
4378 record__free_thread_masks(rec, rec->nr_threads);
4379 rec->nr_threads = 0;
4380 symbol__exit();
4381 auxtrace_record__free(rec->itr);
4382 out_opts:
4383 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4384 evlist__delete(rec->evlist);
4385 return err;
4386 }
4387
4388 static void snapshot_sig_handler(int sig __maybe_unused)
4389 {
4390 struct record *rec = &record;
4391
4392 hit_auxtrace_snapshot_trigger(rec);
4393
4394 if (switch_output_signal(rec))
4395 trigger_hit(&switch_output_trigger);
4396 }
4397
4398 static void alarm_sig_handler(int sig __maybe_unused)
4399 {
4400 struct record *rec = &record;
4401
4402 if (switch_output_time(rec))
4403 trigger_hit(&switch_output_trigger);
4404 }
4405