xref: /linux/tools/perf/builtin-record.c (revision c34e9ab9a612ee8b18273398ef75c207b01f516d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
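/*
 * State for the --switch-output option: whether output switching is enabled,
 * whether it is driven by a SIGUSR2 signal, a size threshold or a time
 * period, plus the ring of rotated output filenames (num_files/cur_file).
 */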
84 struct switch_output {
85 	bool		 enabled;
86 	bool		 signal;
87 	unsigned long	 size;
88 	unsigned long	 time;
89 	const char	*str;
90 	bool		 set;
91 	char		 **filenames;
92 	int		 num_files;
93 	int		 cur_file;
94 };
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
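/*
 * Per-thread state for parallel trace streaming (--threads): the mmaps
 * assigned to this thread, its pollfd array, the message/ack pipes used to
 * talk to the main thread, and per-thread sample/byte counters.
 */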
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124 	THREAD_MSG__UNDEFINED = 0,
125 	THREAD_MSG__READY,
126 	THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130 	"UNDEFINED", "READY"
131 };
132 
133 enum thread_spec {
134 	THREAD_SPEC__UNDEFINED = 0,
135 	THREAD_SPEC__CPU,
136 	THREAD_SPEC__CORE,
137 	THREAD_SPEC__PACKAGE,
138 	THREAD_SPEC__NUMA,
139 	THREAD_SPEC__USER,
140 	THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 	"undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
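/*
 * Maps an entry of the evlist pollfd array to the corresponding entry
 * duplicated into the main thread's pollfd array, so that revents can be
 * copied back for non-perf-event descriptors.
 */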
147 struct pollfd_index_map {
148 	int evlist_pollfd_index;
149 	int thread_pollfd_index;
150 };
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	const char		*filter_action;
175 	struct switch_output	switch_output;
176 	unsigned long long	samples;
177 	unsigned long		output_max_size;	/* = 0: unlimited */
178 	struct perf_debuginfod	debuginfod;
179 	int			nr_threads;
180 	struct thread_mask	*thread_masks;
181 	struct record_thread	*thread_data;
182 	struct pollfd_index_map	*index_map;
183 	size_t			index_map_sz;
184 	size_t			index_map_cnt;
185 };
186 
187 static volatile int done;
188 
189 static volatile int auxtrace_record__snapshot_started;
190 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
191 static DEFINE_TRIGGER(switch_output_trigger);
192 
193 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
194 	"SYS", "NODE", "CPU"
195 };
196 
197 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
198 				  struct perf_sample *sample, struct machine *machine);
199 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
200 				   struct perf_sample *sample, struct machine *machine);
201 static int process_timestamp_boundary(const struct perf_tool *tool,
202 				      union perf_event *event,
203 				      struct perf_sample *sample,
204 				      struct machine *machine);
205 
206 #ifndef HAVE_GETTID
207 static inline pid_t gettid(void)
208 {
209 	return (pid_t)syscall(__NR_gettid);
210 }
211 #endif
212 
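/*
 * Parallel trace streaming is considered enabled whenever a --threads
 * specification was parsed, i.e. opts.threads_spec is non-zero.
 */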
213 static int record__threads_enabled(struct record *rec)
214 {
215 	return rec->opts.threads_spec;
216 }
217 
218 static bool switch_output_signal(struct record *rec)
219 {
220 	return rec->switch_output.signal &&
221 	       trigger_is_ready(&switch_output_trigger);
222 }
223 
224 static bool switch_output_size(struct record *rec)
225 {
226 	return rec->switch_output.size &&
227 	       trigger_is_ready(&switch_output_trigger) &&
228 	       (rec->bytes_written >= rec->switch_output.size);
229 }
230 
231 static bool switch_output_time(struct record *rec)
232 {
233 	return rec->switch_output.time &&
234 	       trigger_is_ready(&switch_output_trigger);
235 }
236 
237 static u64 record__bytes_written(struct record *rec)
238 {
239 	return rec->bytes_written + rec->thread_bytes_written;
240 }
241 
242 static bool record__output_max_size_exceeded(struct record *rec)
243 {
244 	return rec->output_max_size &&
245 	       (record__bytes_written(rec) >= rec->output_max_size);
246 }
247 
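/*
 * Write @size bytes at @bf to the output: to the file associated with @map
 * when parallel streaming is used (map->file is set), otherwise to the main
 * perf.data file. Byte counters are updated, and the session is stopped or
 * switched once the configured output limits are reached.
 */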
248 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
249 			 void *bf, size_t size)
250 {
251 	struct perf_data_file *file = &rec->session->data->file;
252 
253 	if (map && map->file)
254 		file = map->file;
255 
256 	if (perf_data_file__write(file, bf, size) < 0) {
257 		pr_err("failed to write perf data, error: %m\n");
258 		return -1;
259 	}
260 
261 	if (map && map->file) {
262 		thread->bytes_written += size;
263 		rec->thread_bytes_written += size;
264 	} else {
265 		rec->bytes_written += size;
266 	}
267 
268 	if (record__output_max_size_exceeded(rec) && !done) {
269 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
270 				" stopping session ]\n",
271 				record__bytes_written(rec) >> 10);
272 		done = 1;
273 	}
274 
275 	if (switch_output_size(rec))
276 		trigger_hit(&switch_output_trigger);
277 
278 	return 0;
279 }
280 
281 static int record__aio_enabled(struct record *rec);
282 static int record__comp_enabled(struct record *rec);
283 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
284 			    void *dst, size_t dst_size, void *src, size_t src_size);
285 
286 #ifdef HAVE_AIO_SUPPORT
287 static int record__aio_write(struct aiocb *cblock, int trace_fd,
288 		void *buf, size_t size, off_t off)
289 {
290 	int rc;
291 
292 	cblock->aio_fildes = trace_fd;
293 	cblock->aio_buf    = buf;
294 	cblock->aio_nbytes = size;
295 	cblock->aio_offset = off;
296 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
297 
298 	do {
299 		rc = aio_write(cblock);
300 		if (rc == 0) {
301 			break;
302 		} else if (errno != EAGAIN) {
303 			cblock->aio_fildes = -1;
304 			pr_err("failed to queue perf data, error: %m\n");
305 			break;
306 		}
307 	} while (1);
308 
309 	return rc;
310 }
311 
312 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
313 {
314 	void *rem_buf;
315 	off_t rem_off;
316 	size_t rem_size;
317 	int rc, aio_errno;
318 	ssize_t aio_ret, written;
319 
320 	aio_errno = aio_error(cblock);
321 	if (aio_errno == EINPROGRESS)
322 		return 0;
323 
324 	written = aio_ret = aio_return(cblock);
325 	if (aio_ret < 0) {
326 		if (aio_errno != EINTR)
327 			pr_err("failed to write perf data, error: %m\n");
328 		written = 0;
329 	}
330 
331 	rem_size = cblock->aio_nbytes - written;
332 
333 	if (rem_size == 0) {
334 		cblock->aio_fildes = -1;
335 		/*
336 		 * md->refcount is incremented in record__aio_pushfn() for
337 		 * every aio write request started in record__aio_push() so
338 		 * decrement it because the request is now complete.
339 		 */
340 		perf_mmap__put(&md->core);
341 		rc = 1;
342 	} else {
343 		/*
344 		 * The aio write request may require a restart with the
345 		 * remainder if the kernel didn't write the whole
346 		 * chunk at once.
347 		 */
348 		rem_off = cblock->aio_offset + written;
349 		rem_buf = (void *)(cblock->aio_buf + written);
350 		record__aio_write(cblock, cblock->aio_fildes,
351 				rem_buf, rem_size, rem_off);
352 		rc = 0;
353 	}
354 
355 	return rc;
356 }
357 
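/*
 * Wait for in-flight aio writes on @md. With @sync_all, block until every
 * control block has completed; otherwise return the index of the first
 * control block that is free for reuse.
 */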
358 static int record__aio_sync(struct mmap *md, bool sync_all)
359 {
360 	struct aiocb **aiocb = md->aio.aiocb;
361 	struct aiocb *cblocks = md->aio.cblocks;
362 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
363 	int i, do_suspend;
364 
365 	do {
366 		do_suspend = 0;
367 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
368 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
369 				if (sync_all)
370 					aiocb[i] = NULL;
371 				else
372 					return i;
373 			} else {
374 				/*
375 				 * The started aio write is not complete yet,
376 				 * so it has to be waited on before the
377 				 * next allocation.
378 				 */
379 				aiocb[i] = &cblocks[i];
380 				do_suspend = 1;
381 			}
382 		}
383 		if (!do_suspend)
384 			return -1;
385 
386 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
387 			if (!(errno == EAGAIN || errno == EINTR))
388 				pr_err("failed to sync perf data, error: %m\n");
389 		}
390 	} while (1);
391 }
392 
393 struct record_aio {
394 	struct record	*rec;
395 	void		*data;
396 	size_t		size;
397 };
398 
399 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
400 {
401 	struct record_aio *aio = to;
402 
403 	/*
404 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
405 	 * buffer to release space in the kernel buffer as fast as possible, by calling
406 	 * perf_mmap__consume() from the perf_mmap__push() function.
407 	 *
408 	 * That lets the kernel proceed with storing more profiling data into the
409 	 * kernel buffer sooner, before the other per-cpu kernel buffers are handled.
410 	 *
411 	 * Copying can be done in two steps in case the chunk of profiling data
412 	 * crosses the upper bound of the kernel buffer. In this case we first move
413 	 * the part of the data from map->start till the upper bound and then the remainder
414 	 * from the beginning of the kernel buffer till the end of the data chunk.
415 	 */
416 
417 	if (record__comp_enabled(aio->rec)) {
418 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
419 						   mmap__mmap_len(map) - aio->size,
420 						   buf, size);
421 		if (compressed < 0)
422 			return (int)compressed;
423 
424 		size = compressed;
425 	} else {
426 		memcpy(aio->data + aio->size, buf, size);
427 	}
428 
429 	if (!aio->size) {
430 		/*
431 		 * Increment map->refcount to guard the map->aio.data[] buffer
432 		 * from premature deallocation, because the map object can be
433 		 * released before the aio write request started on the
434 		 * map->aio.data[] buffer is complete.
435 		 *
436 		 * perf_mmap__put() is done at record__aio_complete()
437 		 * after started aio request completion or at record__aio_push()
438 		 * if the request failed to start.
439 		 */
440 		perf_mmap__get(&map->core);
441 	}
442 
443 	aio->size += size;
444 
445 	return size;
446 }
447 
448 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
449 {
450 	int ret, idx;
451 	int trace_fd = rec->session->data->file.fd;
452 	struct record_aio aio = { .rec = rec, .size = 0 };
453 
454 	/*
455 	 * Call record__aio_sync() to wait till a map->aio.data[] buffer
456 	 * becomes available after the previous aio write operation.
457 	 */
458 
459 	idx = record__aio_sync(map, false);
460 	aio.data = map->aio.data[idx];
461 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
462 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
463 		return ret;
464 
465 	rec->samples++;
466 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
467 	if (!ret) {
468 		*off += aio.size;
469 		rec->bytes_written += aio.size;
470 		if (switch_output_size(rec))
471 			trigger_hit(&switch_output_trigger);
472 	} else {
473 		/*
474 		 * Decrement the map->refcount incremented in record__aio_pushfn()
475 		 * if the record__aio_write() operation failed to start; otherwise
476 		 * map->refcount is decremented in record__aio_complete() after the
477 		 * aio write operation finishes successfully.
478 		 */
479 		perf_mmap__put(&map->core);
480 	}
481 
482 	return ret;
483 }
484 
485 static off_t record__aio_get_pos(int trace_fd)
486 {
487 	return lseek(trace_fd, 0, SEEK_CUR);
488 }
489 
490 static void record__aio_set_pos(int trace_fd, off_t pos)
491 {
492 	lseek(trace_fd, pos, SEEK_SET);
493 }
494 
495 static void record__aio_mmap_read_sync(struct record *rec)
496 {
497 	int i;
498 	struct evlist *evlist = rec->evlist;
499 	struct mmap *maps = evlist->mmap;
500 
501 	if (!record__aio_enabled(rec))
502 		return;
503 
504 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
505 		struct mmap *map = &maps[i];
506 
507 		if (map->core.base)
508 			record__aio_sync(map, true);
509 	}
510 }
511 
512 static int nr_cblocks_default = 1;
513 static int nr_cblocks_max = 4;
514 
515 static int record__aio_parse(const struct option *opt,
516 			     const char *str,
517 			     int unset)
518 {
519 	struct record_opts *opts = (struct record_opts *)opt->value;
520 
521 	if (unset) {
522 		opts->nr_cblocks = 0;
523 	} else {
524 		if (str)
525 			opts->nr_cblocks = strtol(str, NULL, 0);
526 		if (!opts->nr_cblocks)
527 			opts->nr_cblocks = nr_cblocks_default;
528 	}
529 
530 	return 0;
531 }
532 #else /* HAVE_AIO_SUPPORT */
533 static int nr_cblocks_max = 0;
534 
535 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
536 			    off_t *off __maybe_unused)
537 {
538 	return -1;
539 }
540 
541 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
542 {
543 	return -1;
544 }
545 
546 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
547 {
548 }
549 
550 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
551 {
552 }
553 #endif
554 
555 static int record__aio_enabled(struct record *rec)
556 {
557 	return rec->opts.nr_cblocks > 0;
558 }
559 
560 #define MMAP_FLUSH_DEFAULT 1
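/*
 * Parse the --mmap-flush option value: either a plain number of bytes or a
 * value with a B/K/M/G suffix. The result is capped at a quarter of the
 * mmap buffer size.
 */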
561 static int record__mmap_flush_parse(const struct option *opt,
562 				    const char *str,
563 				    int unset)
564 {
565 	int flush_max;
566 	struct record_opts *opts = (struct record_opts *)opt->value;
567 	static struct parse_tag tags[] = {
568 			{ .tag  = 'B', .mult = 1       },
569 			{ .tag  = 'K', .mult = 1 << 10 },
570 			{ .tag  = 'M', .mult = 1 << 20 },
571 			{ .tag  = 'G', .mult = 1 << 30 },
572 			{ .tag  = 0 },
573 	};
574 
575 	if (unset)
576 		return 0;
577 
578 	if (str) {
579 		opts->mmap_flush = parse_tag_value(str, tags);
580 		if (opts->mmap_flush == (int)-1)
581 			opts->mmap_flush = strtol(str, NULL, 0);
582 	}
583 
584 	if (!opts->mmap_flush)
585 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
586 
587 	flush_max = evlist__mmap_size(opts->mmap_pages);
588 	flush_max /= 4;
589 	if (opts->mmap_flush > flush_max)
590 		opts->mmap_flush = flush_max;
591 
592 	return 0;
593 }
594 
595 #ifdef HAVE_ZSTD_SUPPORT
596 static unsigned int comp_level_default = 1;
597 
598 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
599 {
600 	struct record_opts *opts = opt->value;
601 
602 	if (unset) {
603 		opts->comp_level = 0;
604 	} else {
605 		if (str)
606 			opts->comp_level = strtol(str, NULL, 0);
607 		if (!opts->comp_level)
608 			opts->comp_level = comp_level_default;
609 	}
610 
611 	return 0;
612 }
613 #endif
614 static unsigned int comp_level_max = 22;
615 
616 static int record__comp_enabled(struct record *rec)
617 {
618 	return rec->opts.comp_level > 0;
619 }
620 
621 static int process_synthesized_event(const struct perf_tool *tool,
622 				     union perf_event *event,
623 				     struct perf_sample *sample __maybe_unused,
624 				     struct machine *machine __maybe_unused)
625 {
626 	struct record *rec = container_of(tool, struct record, tool);
627 	return record__write(rec, NULL, event, event->header.size);
628 }
629 
630 static struct mutex synth_lock;
631 
632 static int process_locked_synthesized_event(const struct perf_tool *tool,
633 				     union perf_event *event,
634 				     struct perf_sample *sample __maybe_unused,
635 				     struct machine *machine __maybe_unused)
636 {
637 	int ret;
638 
639 	mutex_lock(&synth_lock);
640 	ret = process_synthesized_event(tool, event, sample, machine);
641 	mutex_unlock(&synth_lock);
642 	return ret;
643 }
644 
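/*
 * perf_mmap__push() callback for the regular (non-AIO) path: optionally
 * compress the chunk into map->data and hand it to record__write().
 */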
645 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
646 {
647 	struct record *rec = to;
648 
649 	if (record__comp_enabled(rec)) {
650 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
651 						   mmap__mmap_len(map), bf, size);
652 
653 		if (compressed < 0)
654 			return (int)compressed;
655 
656 		size = compressed;
657 		bf   = map->data;
658 	}
659 
660 	thread->samples++;
661 	return record__write(rec, map, bf, size);
662 }
663 
664 static volatile sig_atomic_t signr = -1;
665 static volatile sig_atomic_t child_finished;
666 #ifdef HAVE_EVENTFD_SUPPORT
667 static volatile sig_atomic_t done_fd = -1;
668 #endif
669 
670 static void sig_handler(int sig)
671 {
672 	if (sig == SIGCHLD)
673 		child_finished = 1;
674 	else
675 		signr = sig;
676 
677 	done = 1;
678 #ifdef HAVE_EVENTFD_SUPPORT
679 	if (done_fd >= 0) {
680 		u64 tmp = 1;
681 		int orig_errno = errno;
682 
683 		/*
684 		 * It is possible for this signal handler to run after done is
685 		 * checked in the main loop, but before the perf counter fds are
686 		 * polled. If this happens, the poll() will continue to wait
687 		 * even though done is set, and will only break out if either
688 		 * another signal is received, or the counters are ready for
689 		 * read. To ensure the poll() doesn't sleep when done is set,
690 		 * use an eventfd (done_fd) to wake up the poll().
691 		 */
692 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
693 			pr_err("failed to signal wakeup fd, error: %m\n");
694 
695 		errno = orig_errno;
696 	}
697 #endif // HAVE_EVENTFD_SUPPORT
698 }
699 
700 static void sigsegv_handler(int sig)
701 {
702 	perf_hooks__recover();
703 	sighandler_dump_stack(sig);
704 }
705 
706 static void record__sig_exit(void)
707 {
708 	if (signr == -1)
709 		return;
710 
711 	signal(signr, SIG_DFL);
712 	raise(signr);
713 }
714 
715 #ifdef HAVE_AUXTRACE_SUPPORT
716 
717 static int record__process_auxtrace(const struct perf_tool *tool,
718 				    struct mmap *map,
719 				    union perf_event *event, void *data1,
720 				    size_t len1, void *data2, size_t len2)
721 {
722 	struct record *rec = container_of(tool, struct record, tool);
723 	struct perf_data *data = &rec->data;
724 	size_t padding;
725 	u8 pad[8] = {0};
726 
727 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
728 		off_t file_offset;
729 		int fd = perf_data__fd(data);
730 		int err;
731 
732 		file_offset = lseek(fd, 0, SEEK_CUR);
733 		if (file_offset == -1)
734 			return -1;
735 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
736 						     event, file_offset);
737 		if (err)
738 			return err;
739 	}
740 
741 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
742 	padding = (len1 + len2) & 7;
743 	if (padding)
744 		padding = 8 - padding;
745 
746 	record__write(rec, map, event, event->header.size);
747 	record__write(rec, map, data1, len1);
748 	if (len2)
749 		record__write(rec, map, data2, len2);
750 	record__write(rec, map, &pad, padding);
751 
752 	return 0;
753 }
754 
755 static int record__auxtrace_mmap_read(struct record *rec,
756 				      struct mmap *map)
757 {
758 	int ret;
759 
760 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
761 				  record__process_auxtrace);
762 	if (ret < 0)
763 		return ret;
764 
765 	if (ret)
766 		rec->samples++;
767 
768 	return 0;
769 }
770 
771 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
772 					       struct mmap *map)
773 {
774 	int ret;
775 
776 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
777 					   record__process_auxtrace,
778 					   rec->opts.auxtrace_snapshot_size);
779 	if (ret < 0)
780 		return ret;
781 
782 	if (ret)
783 		rec->samples++;
784 
785 	return 0;
786 }
787 
788 static int record__auxtrace_read_snapshot_all(struct record *rec)
789 {
790 	int i;
791 	int rc = 0;
792 
793 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
794 		struct mmap *map = &rec->evlist->mmap[i];
795 
796 		if (!map->auxtrace_mmap.base)
797 			continue;
798 
799 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
800 			rc = -1;
801 			goto out;
802 		}
803 	}
804 out:
805 	return rc;
806 }
807 
808 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
809 {
810 	pr_debug("Recording AUX area tracing snapshot\n");
811 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
812 		trigger_error(&auxtrace_snapshot_trigger);
813 	} else {
814 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
815 			trigger_error(&auxtrace_snapshot_trigger);
816 		else
817 			trigger_ready(&auxtrace_snapshot_trigger);
818 	}
819 }
820 
821 static int record__auxtrace_snapshot_exit(struct record *rec)
822 {
823 	if (trigger_is_error(&auxtrace_snapshot_trigger))
824 		return 0;
825 
826 	if (!auxtrace_record__snapshot_started &&
827 	    auxtrace_record__snapshot_start(rec->itr))
828 		return -1;
829 
830 	record__read_auxtrace_snapshot(rec, true);
831 	if (trigger_is_error(&auxtrace_snapshot_trigger))
832 		return -1;
833 
834 	return 0;
835 }
836 
837 static int record__auxtrace_init(struct record *rec)
838 {
839 	int err;
840 
841 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
842 	    && record__threads_enabled(rec)) {
843 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
844 		return -EINVAL;
845 	}
846 
847 	if (!rec->itr) {
848 		rec->itr = auxtrace_record__init(rec->evlist, &err);
849 		if (err)
850 			return err;
851 	}
852 
853 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
854 					      rec->opts.auxtrace_snapshot_opts);
855 	if (err)
856 		return err;
857 
858 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
859 					    rec->opts.auxtrace_sample_opts);
860 	if (err)
861 		return err;
862 
863 	auxtrace_regroup_aux_output(rec->evlist);
864 
865 	return auxtrace_parse_filters(rec->evlist);
866 }
867 
868 #else
869 
870 static inline
871 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
872 			       struct mmap *map __maybe_unused)
873 {
874 	return 0;
875 }
876 
877 static inline
878 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
879 				    bool on_exit __maybe_unused)
880 {
881 }
882 
883 static inline
884 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
885 {
886 	return 0;
887 }
888 
889 static inline
890 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
891 {
892 	return 0;
893 }
894 
895 static int record__auxtrace_init(struct record *rec __maybe_unused)
896 {
897 	return 0;
898 }
899 
900 #endif
901 
902 static int record__config_text_poke(struct evlist *evlist)
903 {
904 	struct evsel *evsel;
905 
906 	/* Nothing to do if text poke is already configured */
907 	evlist__for_each_entry(evlist, evsel) {
908 		if (evsel->core.attr.text_poke)
909 			return 0;
910 	}
911 
912 	evsel = evlist__add_dummy_on_all_cpus(evlist);
913 	if (!evsel)
914 		return -ENOMEM;
915 
916 	evsel->core.attr.text_poke = 1;
917 	evsel->core.attr.ksymbol = 1;
918 	evsel->immediate = true;
919 	evsel__set_sample_bit(evsel, TIME);
920 
921 	return 0;
922 }
923 
924 static int record__config_off_cpu(struct record *rec)
925 {
926 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
927 }
928 
929 static bool record__tracking_system_wide(struct record *rec)
930 {
931 	struct evlist *evlist = rec->evlist;
932 	struct evsel *evsel;
933 
934 	/*
935 	 * If a non-dummy evsel exists, system_wide sideband is needed to
936 	 * help parse sample information.
937 	 * For example, PERF_RECORD_MMAP events help resolve symbols,
938 	 * and PERF_RECORD_COMM events help resolve task executable names.
939 	 */
940 	evlist__for_each_entry(evlist, evsel) {
941 		if (!evsel__is_dummy_event(evsel))
942 			return true;
943 	}
944 
945 	return false;
946 }
947 
948 static int record__config_tracking_events(struct record *rec)
949 {
950 	struct record_opts *opts = &rec->opts;
951 	struct evlist *evlist = rec->evlist;
952 	bool system_wide = false;
953 	struct evsel *evsel;
954 
955 	/*
956 	 * For initial_delay, system wide or a hybrid system, we need to add
957 	 * a tracking event so that we can track PERF_RECORD_MMAP to cover the
958 	 * delay of waiting or event synthesis.
959 	 */
960 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
961 	    perf_pmus__num_core_pmus() > 1) {
962 
963 		/*
964 		 * User space tasks can migrate between CPUs, so when tracing
965 		 * selected CPUs, sideband for all CPUs is still needed.
966 		 */
967 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
968 			system_wide = true;
969 
970 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
971 		if (!evsel)
972 			return -ENOMEM;
973 
974 		/*
975 		 * Enable the tracking event when the process is forked for
976 		 * initial_delay, immediately for system wide.
977 		 */
978 		if (opts->target.initial_delay && !evsel->immediate &&
979 		    !target__has_cpu(&opts->target))
980 			evsel->core.attr.enable_on_exec = 1;
981 		else
982 			evsel->immediate = 1;
983 	}
984 
985 	return 0;
986 }
987 
988 static bool record__kcore_readable(struct machine *machine)
989 {
990 	char kcore[PATH_MAX];
991 	int fd;
992 
993 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
994 
995 	fd = open(kcore, O_RDONLY);
996 	if (fd < 0)
997 		return false;
998 
999 	close(fd);
1000 
1001 	return true;
1002 }
1003 
1004 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1005 {
1006 	char from_dir[PATH_MAX];
1007 	char kcore_dir[PATH_MAX];
1008 	int ret;
1009 
1010 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1011 
1012 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1013 	if (ret)
1014 		return ret;
1015 
1016 	return kcore_copy(from_dir, kcore_dir);
1017 }
1018 
1019 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1020 {
1021 	thread_data->pipes.msg[0] = -1;
1022 	thread_data->pipes.msg[1] = -1;
1023 	thread_data->pipes.ack[0] = -1;
1024 	thread_data->pipes.ack[1] = -1;
1025 }
1026 
1027 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1028 {
1029 	if (pipe(thread_data->pipes.msg))
1030 		return -EINVAL;
1031 
1032 	if (pipe(thread_data->pipes.ack)) {
1033 		close(thread_data->pipes.msg[0]);
1034 		thread_data->pipes.msg[0] = -1;
1035 		close(thread_data->pipes.msg[1]);
1036 		thread_data->pipes.msg[1] = -1;
1037 		return -EINVAL;
1038 	}
1039 
1040 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1041 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1042 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1043 
1044 	return 0;
1045 }
1046 
1047 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1048 {
1049 	if (thread_data->pipes.msg[0] != -1) {
1050 		close(thread_data->pipes.msg[0]);
1051 		thread_data->pipes.msg[0] = -1;
1052 	}
1053 	if (thread_data->pipes.msg[1] != -1) {
1054 		close(thread_data->pipes.msg[1]);
1055 		thread_data->pipes.msg[1] = -1;
1056 	}
1057 	if (thread_data->pipes.ack[0] != -1) {
1058 		close(thread_data->pipes.ack[0]);
1059 		thread_data->pipes.ack[0] = -1;
1060 	}
1061 	if (thread_data->pipes.ack[1] != -1) {
1062 		close(thread_data->pipes.ack[1]);
1063 		thread_data->pipes.ack[1] = -1;
1064 	}
1065 }
1066 
1067 static bool evlist__per_thread(struct evlist *evlist)
1068 {
1069 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1070 }
1071 
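/*
 * Distribute the evlist mmaps to @thread_data: in per-thread mode the thread
 * gets all of them, otherwise only the mmaps whose CPU is set in the
 * thread's maps mask.
 */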
1072 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1073 {
1074 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1075 	struct mmap *mmap = evlist->mmap;
1076 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1077 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1078 	bool per_thread = evlist__per_thread(evlist);
1079 
1080 	if (per_thread)
1081 		thread_data->nr_mmaps = nr_mmaps;
1082 	else
1083 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1084 						      thread_data->mask->maps.nbits);
1085 	if (mmap) {
1086 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1087 		if (!thread_data->maps)
1088 			return -ENOMEM;
1089 	}
1090 	if (overwrite_mmap) {
1091 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1092 		if (!thread_data->overwrite_maps) {
1093 			zfree(&thread_data->maps);
1094 			return -ENOMEM;
1095 		}
1096 	}
1097 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1098 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1099 
1100 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1101 		if (per_thread ||
1102 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1103 			if (thread_data->maps) {
1104 				thread_data->maps[tm] = &mmap[m];
1105 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1106 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1107 			}
1108 			if (thread_data->overwrite_maps) {
1109 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1110 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1111 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1112 			}
1113 			tm++;
1114 		}
1115 	}
1116 
1117 	return 0;
1118 }
1119 
1120 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1121 {
1122 	int f, tm, pos;
1123 	struct mmap *map, *overwrite_map;
1124 
1125 	fdarray__init(&thread_data->pollfd, 64);
1126 
1127 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1128 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1129 		overwrite_map = thread_data->overwrite_maps ?
1130 				thread_data->overwrite_maps[tm] : NULL;
1131 
1132 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1133 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1134 
1135 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1136 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1137 							      &evlist->core.pollfd);
1138 				if (pos < 0)
1139 					return pos;
1140 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1141 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1142 			}
1143 		}
1144 	}
1145 
1146 	return 0;
1147 }
1148 
1149 static void record__free_thread_data(struct record *rec)
1150 {
1151 	int t;
1152 	struct record_thread *thread_data = rec->thread_data;
1153 
1154 	if (thread_data == NULL)
1155 		return;
1156 
1157 	for (t = 0; t < rec->nr_threads; t++) {
1158 		record__thread_data_close_pipes(&thread_data[t]);
1159 		zfree(&thread_data[t].maps);
1160 		zfree(&thread_data[t].overwrite_maps);
1161 		fdarray__exit(&thread_data[t].pollfd);
1162 	}
1163 
1164 	zfree(&rec->thread_data);
1165 }
1166 
1167 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1168 						    int evlist_pollfd_index,
1169 						    int thread_pollfd_index)
1170 {
1171 	size_t x = rec->index_map_cnt;
1172 
1173 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1174 		return -ENOMEM;
1175 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1176 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1177 	rec->index_map_cnt += 1;
1178 	return 0;
1179 }
1180 
1181 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1182 						    struct evlist *evlist,
1183 						    struct record_thread *thread_data)
1184 {
1185 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1186 	struct pollfd *t_entries = thread_data->pollfd.entries;
1187 	int err = 0;
1188 	size_t i;
1189 
1190 	for (i = 0; i < rec->index_map_cnt; i++) {
1191 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1192 		int t_pos = rec->index_map[i].thread_pollfd_index;
1193 
1194 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1195 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1196 			pr_err("Thread and evlist pollfd index mismatch\n");
1197 			err = -EINVAL;
1198 			continue;
1199 		}
1200 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1201 	}
1202 	return err;
1203 }
1204 
1205 static int record__dup_non_perf_events(struct record *rec,
1206 				       struct evlist *evlist,
1207 				       struct record_thread *thread_data)
1208 {
1209 	struct fdarray *fda = &evlist->core.pollfd;
1210 	int i, ret;
1211 
1212 	for (i = 0; i < fda->nr; i++) {
1213 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1214 			continue;
1215 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1216 		if (ret < 0) {
1217 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1218 			return ret;
1219 		}
1220 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1221 			  thread_data, ret, fda->entries[i].fd);
1222 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1223 		if (ret < 0) {
1224 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1225 			return ret;
1226 		}
1227 	}
1228 	return 0;
1229 }
1230 
1231 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1232 {
1233 	int t, ret;
1234 	struct record_thread *thread_data;
1235 
1236 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1237 	if (!rec->thread_data) {
1238 		pr_err("Failed to allocate thread data\n");
1239 		return -ENOMEM;
1240 	}
1241 	thread_data = rec->thread_data;
1242 
1243 	for (t = 0; t < rec->nr_threads; t++)
1244 		record__thread_data_init_pipes(&thread_data[t]);
1245 
1246 	for (t = 0; t < rec->nr_threads; t++) {
1247 		thread_data[t].rec = rec;
1248 		thread_data[t].mask = &rec->thread_masks[t];
1249 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1250 		if (ret) {
1251 			pr_err("Failed to initialize thread[%d] maps\n", t);
1252 			goto out_free;
1253 		}
1254 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1255 		if (ret) {
1256 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1257 			goto out_free;
1258 		}
1259 		if (t) {
1260 			thread_data[t].tid = -1;
1261 			ret = record__thread_data_open_pipes(&thread_data[t]);
1262 			if (ret) {
1263 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1264 				goto out_free;
1265 			}
1266 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1267 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1268 			if (ret < 0) {
1269 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1270 				goto out_free;
1271 			}
1272 			thread_data[t].ctlfd_pos = ret;
1273 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1274 				 thread_data, thread_data[t].ctlfd_pos,
1275 				 thread_data[t].pipes.msg[0]);
1276 		} else {
1277 			thread_data[t].tid = gettid();
1278 
1279 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1280 			if (ret < 0)
1281 				goto out_free;
1282 
1283 			thread_data[t].ctlfd_pos = -1; /* Not used */
1284 		}
1285 	}
1286 
1287 	return 0;
1288 
1289 out_free:
1290 	record__free_thread_data(rec);
1291 
1292 	return ret;
1293 }
1294 
1295 static int record__mmap_evlist(struct record *rec,
1296 			       struct evlist *evlist)
1297 {
1298 	int i, ret;
1299 	struct record_opts *opts = &rec->opts;
1300 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1301 				  opts->auxtrace_sample_mode;
1302 	char msg[512];
1303 
1304 	if (opts->affinity != PERF_AFFINITY_SYS)
1305 		cpu__setup_cpunode_map();
1306 
1307 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1308 				 opts->auxtrace_mmap_pages,
1309 				 auxtrace_overwrite,
1310 				 opts->nr_cblocks, opts->affinity,
1311 				 opts->mmap_flush, opts->comp_level) < 0) {
1312 		if (errno == EPERM) {
1313 			pr_err("Permission error mapping pages.\n"
1314 			       "Consider increasing "
1315 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1316 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1317 			       "(current value: %u,%u)\n",
1318 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1319 			return -errno;
1320 		} else {
1321 			pr_err("failed to mmap with %d (%s)\n", errno,
1322 				str_error_r(errno, msg, sizeof(msg)));
1323 			if (errno)
1324 				return -errno;
1325 			else
1326 				return -EINVAL;
1327 		}
1328 	}
1329 
1330 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1331 		return -1;
1332 
1333 	ret = record__alloc_thread_data(rec, evlist);
1334 	if (ret)
1335 		return ret;
1336 
1337 	if (record__threads_enabled(rec)) {
1338 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1339 		if (ret) {
1340 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1341 			return ret;
1342 		}
1343 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1344 			if (evlist->mmap)
1345 				evlist->mmap[i].file = &rec->data.dir.files[i];
1346 			if (evlist->overwrite_mmap)
1347 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1348 		}
1349 	}
1350 
1351 	return 0;
1352 }
1353 
1354 static int record__mmap(struct record *rec)
1355 {
1356 	return record__mmap_evlist(rec, rec->evlist);
1357 }
1358 
1359 static int record__open(struct record *rec)
1360 {
1361 	char msg[BUFSIZ];
1362 	struct evsel *pos;
1363 	struct evlist *evlist = rec->evlist;
1364 	struct perf_session *session = rec->session;
1365 	struct record_opts *opts = &rec->opts;
1366 	int rc = 0;
1367 
1368 	evlist__for_each_entry(evlist, pos) {
1369 try_again:
1370 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1371 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1372 				if (verbose > 0)
1373 					ui__warning("%s\n", msg);
1374 				goto try_again;
1375 			}
1376 			if ((errno == EINVAL || errno == EBADF) &&
1377 			    pos->core.leader != &pos->core &&
1378 			    pos->weak_group) {
1379 				pos = evlist__reset_weak_group(evlist, pos, true);
1380 				goto try_again;
1381 			}
1382 			rc = -errno;
1383 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1384 			ui__error("%s\n", msg);
1385 			goto out;
1386 		}
1387 
1388 		pos->supported = true;
1389 	}
1390 
1391 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1392 		pr_warning(
1393 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1394 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1395 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1396 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1397 "Samples in kernel modules won't be resolved at all.\n\n"
1398 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1399 "even with a suitable vmlinux or kallsyms file.\n\n");
1400 	}
1401 
1402 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1403 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1404 			pos->filter ?: "BPF", evsel__name(pos), errno,
1405 			str_error_r(errno, msg, sizeof(msg)));
1406 		rc = -1;
1407 		goto out;
1408 	}
1409 
1410 	rc = record__mmap(rec);
1411 	if (rc)
1412 		goto out;
1413 
1414 	session->evlist = evlist;
1415 	perf_session__set_id_hdr_size(session);
1416 out:
1417 	return rc;
1418 }
1419 
1420 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1421 {
1422 	if (rec->evlist->first_sample_time == 0)
1423 		rec->evlist->first_sample_time = sample_time;
1424 
1425 	if (sample_time)
1426 		rec->evlist->last_sample_time = sample_time;
1427 }
1428 
1429 static int process_sample_event(const struct perf_tool *tool,
1430 				union perf_event *event,
1431 				struct perf_sample *sample,
1432 				struct evsel *evsel,
1433 				struct machine *machine)
1434 {
1435 	struct record *rec = container_of(tool, struct record, tool);
1436 
1437 	set_timestamp_boundary(rec, sample->time);
1438 
1439 	if (rec->buildid_all)
1440 		return 0;
1441 
1442 	rec->samples++;
1443 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1444 }
1445 
1446 static int process_buildids(struct record *rec)
1447 {
1448 	struct perf_session *session = rec->session;
1449 
1450 	if (perf_data__size(&rec->data) == 0)
1451 		return 0;
1452 
1453 	/*
1454 	 * During this process, it'll load the kernel map and replace
1455 	 * dso->long_name with a real pathname it found.  In this case
1456 	 * we prefer the vmlinux path like
1457 	 *   /lib/modules/3.16.4/build/vmlinux
1458 	 *
1459 	 * rather than build-id path (in debug directory).
1460 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1461 	 */
1462 	symbol_conf.ignore_vmlinux_buildid = true;
1463 
1464 	/*
1465 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1466 	 * so no need to process samples. But if timestamp_boundary is enabled,
1467 	 * it still needs to walk on all samples to get the timestamps of
1468 	 * first/last samples.
1469 	 */
1470 	if (rec->buildid_all && !rec->timestamp_boundary)
1471 		rec->tool.sample = process_event_sample_stub;
1472 
1473 	return perf_session__process_events(session);
1474 }
1475 
1476 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1477 {
1478 	int err;
1479 	struct perf_tool *tool = data;
1480 	/*
1481 	 * As for the guest kernel, when processing the record & report
1482 	 * subcommands, we arrange the module mmaps prior to the guest kernel
1483 	 * mmap and trigger a DSO preload, because default guest module
1484 	 * symbols are loaded from guest kallsyms instead of
1485 	 * /lib/modules/XXX/XXX. This method avoids missing symbols when the
1486 	 * first address is in a module instead of in the guest kernel.
1487 	 */
1488 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1489 					     machine);
1490 	if (err < 0)
1491 		pr_err("Couldn't record guest kernel [%d]'s reference"
1492 		       " relocation symbol.\n", machine->pid);
1493 
1494 	/*
1495 	 * We use _stext for the guest kernel because the guest kernel's
1496 	 * /proc/kallsyms sometimes has no _text.
1497 	 */
1498 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1499 						 machine);
1500 	if (err < 0)
1501 		pr_err("Couldn't record guest kernel [%d]'s reference"
1502 		       " relocation symbol.\n", machine->pid);
1503 }
1504 
1505 static struct perf_event_header finished_round_event = {
1506 	.size = sizeof(struct perf_event_header),
1507 	.type = PERF_RECORD_FINISHED_ROUND,
1508 };
1509 
1510 static struct perf_event_header finished_init_event = {
1511 	.size = sizeof(struct perf_event_header),
1512 	.type = PERF_RECORD_FINISHED_INIT,
1513 };
1514 
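/*
 * When an affinity mode other than "sys" is requested, migrate the current
 * thread to the CPUs backing @map before reading it, so the buffer is
 * consumed close to where the kernel wrote the data.
 */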
1515 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1516 {
1517 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1518 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1519 			  thread->mask->affinity.nbits)) {
1520 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1521 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1522 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1523 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1524 					(cpu_set_t *)thread->mask->affinity.bits);
1525 		if (verbose == 2) {
1526 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1527 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1528 		}
1529 	}
1530 }
1531 
1532 static size_t process_comp_header(void *record, size_t increment)
1533 {
1534 	struct perf_record_compressed *event = record;
1535 	size_t size = sizeof(*event);
1536 
1537 	if (increment) {
1538 		event->header.size += increment;
1539 		return increment;
1540 	}
1541 
1542 	event->header.type = PERF_RECORD_COMPRESSED;
1543 	event->header.size = size;
1544 
1545 	return size;
1546 }
1547 
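/*
 * Compress @src into @dst as PERF_RECORD_COMPRESSED records, using the
 * per-mmap zstd stream when writing per-thread files and the session-wide
 * stream otherwise; the transferred/compressed byte counters are updated
 * to match.
 */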
1548 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1549 			    void *dst, size_t dst_size, void *src, size_t src_size)
1550 {
1551 	ssize_t compressed;
1552 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1553 	struct zstd_data *zstd_data = &session->zstd_data;
1554 
1555 	if (map && map->file)
1556 		zstd_data = &map->zstd_data;
1557 
1558 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1559 						     max_record_size, process_comp_header);
1560 	if (compressed < 0)
1561 		return compressed;
1562 
1563 	if (map && map->file) {
1564 		thread->bytes_transferred += src_size;
1565 		thread->bytes_compressed  += compressed;
1566 	} else {
1567 		session->bytes_transferred += src_size;
1568 		session->bytes_compressed  += compressed;
1569 	}
1570 
1571 	return compressed;
1572 }
1573 
1574 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1575 				    bool overwrite, bool synch)
1576 {
1577 	u64 bytes_written = rec->bytes_written;
1578 	int i;
1579 	int rc = 0;
1580 	int nr_mmaps;
1581 	struct mmap **maps;
1582 	int trace_fd = rec->data.file.fd;
1583 	off_t off = 0;
1584 
1585 	if (!evlist)
1586 		return 0;
1587 
1588 	nr_mmaps = thread->nr_mmaps;
1589 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1590 
1591 	if (!maps)
1592 		return 0;
1593 
1594 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1595 		return 0;
1596 
1597 	if (record__aio_enabled(rec))
1598 		off = record__aio_get_pos(trace_fd);
1599 
1600 	for (i = 0; i < nr_mmaps; i++) {
1601 		u64 flush = 0;
1602 		struct mmap *map = maps[i];
1603 
1604 		if (map->core.base) {
1605 			record__adjust_affinity(rec, map);
1606 			if (synch) {
1607 				flush = map->core.flush;
1608 				map->core.flush = 1;
1609 			}
1610 			if (!record__aio_enabled(rec)) {
1611 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1612 					if (synch)
1613 						map->core.flush = flush;
1614 					rc = -1;
1615 					goto out;
1616 				}
1617 			} else {
1618 				if (record__aio_push(rec, map, &off) < 0) {
1619 					record__aio_set_pos(trace_fd, off);
1620 					if (synch)
1621 						map->core.flush = flush;
1622 					rc = -1;
1623 					goto out;
1624 				}
1625 			}
1626 			if (synch)
1627 				map->core.flush = flush;
1628 		}
1629 
1630 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1631 		    !rec->opts.auxtrace_sample_mode &&
1632 		    record__auxtrace_mmap_read(rec, map) != 0) {
1633 			rc = -1;
1634 			goto out;
1635 		}
1636 	}
1637 
1638 	if (record__aio_enabled(rec))
1639 		record__aio_set_pos(trace_fd, off);
1640 
1641 	/*
1642 	 * Mark the round finished in case we wrote
1643 	 * at least one event.
1644 	 *
1645 	 * No need for round events in directory mode,
1646 	 * because per-cpu maps and files have their data
1647 	 * sorted by the kernel.
1648 	 */
1649 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1650 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1651 
1652 	if (overwrite)
1653 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1654 out:
1655 	return rc;
1656 }
1657 
1658 static int record__mmap_read_all(struct record *rec, bool synch)
1659 {
1660 	int err;
1661 
1662 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1663 	if (err)
1664 		return err;
1665 
1666 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1667 }
1668 
1669 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1670 					   void *arg __maybe_unused)
1671 {
1672 	struct perf_mmap *map = fda->priv[fd].ptr;
1673 
1674 	if (map)
1675 		perf_mmap__put(map);
1676 }
1677 
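/*
 * Body of a side thread in parallel streaming mode: drain the assigned
 * mmaps, poll when there is nothing to read, and terminate once the main
 * thread closes the message pipe (POLLHUP). Start-up and termination are
 * acknowledged over the ack pipe.
 */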
1678 static void *record__thread(void *arg)
1679 {
1680 	enum thread_msg msg = THREAD_MSG__READY;
1681 	bool terminate = false;
1682 	struct fdarray *pollfd;
1683 	int err, ctlfd_pos;
1684 
1685 	thread = arg;
1686 	thread->tid = gettid();
1687 
1688 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1689 	if (err == -1)
1690 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1691 			   thread->tid, strerror(errno));
1692 
1693 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1694 
1695 	pollfd = &thread->pollfd;
1696 	ctlfd_pos = thread->ctlfd_pos;
1697 
1698 	for (;;) {
1699 		unsigned long long hits = thread->samples;
1700 
1701 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1702 			break;
1703 
1704 		if (hits == thread->samples) {
1705 
1706 			err = fdarray__poll(pollfd, -1);
1707 			/*
1708 			 * Propagate the error only if there is one. Ignore a positive
1709 			 * number of returned events and interrupt errors (EINTR).
1710 			 */
1711 			if (err > 0 || (err < 0 && errno == EINTR))
1712 				err = 0;
1713 			thread->waking++;
1714 
1715 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1716 					    record__thread_munmap_filtered, NULL) == 0)
1717 				break;
1718 		}
1719 
1720 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1721 			terminate = true;
1722 			close(thread->pipes.msg[0]);
1723 			thread->pipes.msg[0] = -1;
1724 			pollfd->entries[ctlfd_pos].fd = -1;
1725 			pollfd->entries[ctlfd_pos].events = 0;
1726 		}
1727 
1728 		pollfd->entries[ctlfd_pos].revents = 0;
1729 	}
1730 	record__mmap_read_all(thread->rec, true);
1731 
1732 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1733 	if (err == -1)
1734 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1735 			   thread->tid, strerror(errno));
1736 
1737 	return NULL;
1738 }
1739 
1740 static void record__init_features(struct record *rec)
1741 {
1742 	struct perf_session *session = rec->session;
1743 	int feat;
1744 
1745 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1746 		perf_header__set_feat(&session->header, feat);
1747 
1748 	if (rec->no_buildid)
1749 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1750 
1751 #ifdef HAVE_LIBTRACEEVENT
1752 	if (!have_tracepoints(&rec->evlist->core.entries))
1753 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1754 #endif
1755 
1756 	if (!rec->opts.branch_stack)
1757 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1758 
1759 	if (!rec->opts.full_auxtrace)
1760 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1761 
1762 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1763 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1764 
1765 	if (!rec->opts.use_clockid)
1766 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1767 
1768 	if (!record__threads_enabled(rec))
1769 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1770 
1771 	if (!record__comp_enabled(rec))
1772 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1773 
1774 	perf_header__clear_feat(&session->header, HEADER_STAT);
1775 }
1776 
1777 static void
1778 record__finish_output(struct record *rec)
1779 {
1780 	int i;
1781 	struct perf_data *data = &rec->data;
1782 	int fd = perf_data__fd(data);
1783 
1784 	if (data->is_pipe) {
1785 		/* Just to display approx. size */
1786 		data->file.size = rec->bytes_written;
1787 		return;
1788 	}
1789 
1790 	rec->session->header.data_size += rec->bytes_written;
1791 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1792 	if (record__threads_enabled(rec)) {
1793 		for (i = 0; i < data->dir.nr; i++)
1794 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1795 	}
1796 
1797 	if (!rec->no_buildid) {
1798 		process_buildids(rec);
1799 
1800 		if (rec->buildid_all)
1801 			perf_session__dsos_hit_all(rec->session);
1802 	}
1803 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1804 
1805 	return;
1806 }
1807 
1808 static int record__synthesize_workload(struct record *rec, bool tail)
1809 {
1810 	int err;
1811 	struct perf_thread_map *thread_map;
1812 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1813 
1814 	if (rec->opts.tail_synthesize != tail)
1815 		return 0;
1816 
1817 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1818 	if (thread_map == NULL)
1819 		return -1;
1820 
1821 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1822 						 process_synthesized_event,
1823 						 &rec->session->machines.host,
1824 						 needs_mmap,
1825 						 rec->opts.sample_address);
1826 	perf_thread_map__put(thread_map);
1827 	return err;
1828 }
1829 
1830 static int write_finished_init(struct record *rec, bool tail)
1831 {
1832 	if (rec->opts.tail_synthesize != tail)
1833 		return 0;
1834 
1835 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1836 }
1837 
1838 static int record__synthesize(struct record *rec, bool tail);
1839 
1840 static int
1841 record__switch_output(struct record *rec, bool at_exit)
1842 {
1843 	struct perf_data *data = &rec->data;
1844 	char *new_filename = NULL;
1845 	int fd, err;
1846 
1847 	/* Same size:      "2015122520103046" */
1848 	char timestamp[] = "InvalidTimestamp";
1849 
1850 	record__aio_mmap_read_sync(rec);
1851 
1852 	write_finished_init(rec, true);
1853 
1854 	record__synthesize(rec, true);
1855 	if (target__none(&rec->opts.target))
1856 		record__synthesize_workload(rec, true);
1857 
1858 	rec->samples = 0;
1859 	record__finish_output(rec);
1860 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1861 	if (err) {
1862 		pr_err("Failed to get current timestamp\n");
1863 		return -EINVAL;
1864 	}
1865 
1866 	fd = perf_data__switch(data, timestamp,
1867 			       rec->session->header.data_offset,
1868 			       at_exit, &new_filename);
1869 	if (fd >= 0 && !at_exit) {
1870 		rec->bytes_written = 0;
1871 		rec->session->header.data_size = 0;
1872 	}
1873 
1874 	if (!quiet) {
1875 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1876 			data->path, timestamp);
1877 	}
1878 
1879 	if (rec->switch_output.num_files) {
1880 		int n = rec->switch_output.cur_file + 1;
1881 
1882 		if (n >= rec->switch_output.num_files)
1883 			n = 0;
1884 		rec->switch_output.cur_file = n;
1885 		if (rec->switch_output.filenames[n]) {
1886 			remove(rec->switch_output.filenames[n]);
1887 			zfree(&rec->switch_output.filenames[n]);
1888 		}
1889 		rec->switch_output.filenames[n] = new_filename;
1890 	} else {
1891 		free(new_filename);
1892 	}
1893 
1894 	/* Output tracking events */
1895 	if (!at_exit) {
1896 		record__synthesize(rec, false);
1897 
1898 		/*
1899 		 * In 'perf record --switch-output' without -a,
1900 		 * record__synthesize() in record__switch_output() won't
1901 		 * generate tracking events because there's no thread_map
1902 		 * in the evlist. This causes the newly created perf.data to
1903 		 * lack map and comm information.
1904 		 * Create a fake thread_map and directly call
1905 		 * perf_event__synthesize_thread_map() for those events.
1906 		 */
1907 		if (target__none(&rec->opts.target))
1908 			record__synthesize_workload(rec, false);
1909 		write_finished_init(rec, false);
1910 	}
1911 	return fd;
1912 }
1913 
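/*
 * Emit a PERF_RECORD_LOST_SAMPLES event for one (cpu, thread) counter,
 * attaching the matching sample ID so the loss can be attributed to the
 * right event when the file is processed.
 */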
1914 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1915 					struct perf_record_lost_samples *lost,
1916 					int cpu_idx, int thread_idx, u64 lost_count,
1917 					u16 misc_flag)
1918 {
1919 	struct perf_sample_id *sid;
1920 	struct perf_sample sample = {};
1921 	int id_hdr_size;
1922 
1923 	lost->lost = lost_count;
1924 	if (evsel->core.ids) {
1925 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1926 		sample.id = sid->id;
1927 	}
1928 
1929 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1930 						       evsel->core.attr.sample_type, &sample);
1931 	lost->header.size = sizeof(*lost) + id_hdr_size;
1932 	lost->header.misc = misc_flag;
1933 	record__write(rec, NULL, lost, lost->header.size);
1934 }
1935 
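/*
 * At the end of the session read the lost-sample counts for every event,
 * plus any samples dropped by BPF filters, and write them out as
 * PERF_RECORD_LOST_SAMPLES events.
 */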
1936 static void record__read_lost_samples(struct record *rec)
1937 {
1938 	struct perf_session *session = rec->session;
1939 	struct perf_record_lost_samples_and_ids lost;
1940 	struct evsel *evsel;
1941 
1942 	/* there was an error during record__open */
1943 	if (session->evlist == NULL)
1944 		return;
1945 
1946 	evlist__for_each_entry(session->evlist, evsel) {
1947 		struct xyarray *xy = evsel->core.sample_id;
1948 		u64 lost_count;
1949 
1950 		if (xy == NULL || evsel->core.fd == NULL)
1951 			continue;
1952 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1953 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1954 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1955 			continue;
1956 		}
1957 
1958 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1959 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1960 				struct perf_counts_values count;
1961 
1962 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1963 					pr_debug("read LOST count failed\n");
1964 					return;
1965 				}
1966 
1967 				if (count.lost) {
1968 					memset(&lost, 0, sizeof(lost));
1969 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1970 					__record__save_lost_samples(rec, evsel, &lost.lost,
1971 								    x, y, count.lost, 0);
1972 				}
1973 			}
1974 		}
1975 
1976 		lost_count = perf_bpf_filter__lost_count(evsel);
1977 		if (lost_count) {
1978 			memset(&lost, 0, sizeof(lost));
1979 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1980 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
1981 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1982 		}
1983 	}
1984 }
1985 
1986 static volatile sig_atomic_t workload_exec_errno;
1987 
1988 /*
1989  * evlist__prepare_workload will send a SIGUSR1
1990  * if the fork fails, since we asked for it by
1991  * setting its want_signal to true.
1992  */
1993 static void workload_exec_failed_signal(int signo __maybe_unused,
1994 					siginfo_t *info,
1995 					void *ucontext __maybe_unused)
1996 {
1997 	workload_exec_errno = info->si_value.sival_int;
1998 	done = 1;
1999 	child_finished = 1;
2000 }
2001 
2002 static void snapshot_sig_handler(int sig);
2003 static void alarm_sig_handler(int sig);
2004 
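/*
 * Pick the control page of any mmaped ring buffer; it is only needed as
 * input for perf_event__synth_time_conv() in record__synthesize() below.
 */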
2005 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2006 {
2007 	if (evlist) {
2008 		if (evlist->mmap && evlist->mmap[0].core.base)
2009 			return evlist->mmap[0].core.base;
2010 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2011 			return evlist->overwrite_mmap[0].core.base;
2012 	}
2013 	return NULL;
2014 }
2015 
2016 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2017 {
2018 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2019 	if (pc)
2020 		return pc;
2021 	return NULL;
2022 }
2023 
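/*
 * Synthesize the non-sample events describing the session: time conversion,
 * id index, auxtrace info, kernel/module mmaps, guest machines, extra
 * attributes, thread and cpu maps, BPF and cgroup events, and the existing
 * threads of the monitored tasks (optionally using several synthesis
 * threads).
 */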
2024 static int record__synthesize(struct record *rec, bool tail)
2025 {
2026 	struct perf_session *session = rec->session;
2027 	struct machine *machine = &session->machines.host;
2028 	struct perf_data *data = &rec->data;
2029 	struct record_opts *opts = &rec->opts;
2030 	struct perf_tool *tool = &rec->tool;
2031 	int err = 0;
2032 	event_op f = process_synthesized_event;
2033 
2034 	if (rec->opts.tail_synthesize != tail)
2035 		return 0;
2036 
2037 	if (data->is_pipe) {
2038 		err = perf_event__synthesize_for_pipe(tool, session, data,
2039 						      process_synthesized_event);
2040 		if (err < 0)
2041 			goto out;
2042 
2043 		rec->bytes_written += err;
2044 	}
2045 
2046 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2047 					  process_synthesized_event, machine);
2048 	if (err)
2049 		goto out;
2050 
2051 	/* Synthesize id_index before auxtrace_info */
2052 	err = perf_event__synthesize_id_index(tool,
2053 					      process_synthesized_event,
2054 					      session->evlist, machine);
2055 	if (err)
2056 		goto out;
2057 
2058 	if (rec->opts.full_auxtrace) {
2059 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2060 					session, process_synthesized_event);
2061 		if (err)
2062 			goto out;
2063 	}
2064 
2065 	if (!evlist__exclude_kernel(rec->evlist)) {
2066 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2067 							 machine);
2068 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2069 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2070 				   "Check /proc/kallsyms permission or run as root.\n");
2071 
2072 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2073 						     machine);
2074 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2075 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2076 				   "Check /proc/modules permission or run as root.\n");
2077 	}
2078 
2079 	if (perf_guest) {
2080 		machines__process_guests(&session->machines,
2081 					 perf_event__synthesize_guest_os, tool);
2082 	}
2083 
2084 	err = perf_event__synthesize_extra_attr(&rec->tool,
2085 						rec->evlist,
2086 						process_synthesized_event,
2087 						data->is_pipe);
2088 	if (err)
2089 		goto out;
2090 
2091 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2092 						 process_synthesized_event,
2093 						NULL);
2094 	if (err < 0) {
2095 		pr_err("Couldn't synthesize thread map.\n");
2096 		return err;
2097 	}
2098 
2099 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2100 					     process_synthesized_event, NULL);
2101 	if (err < 0) {
2102 		pr_err("Couldn't synthesize cpu map.\n");
2103 		return err;
2104 	}
2105 
2106 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2107 						machine, opts);
2108 	if (err < 0) {
2109 		pr_warning("Couldn't synthesize bpf events.\n");
2110 		err = 0;
2111 	}
2112 
2113 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2114 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2115 						     machine);
2116 		if (err < 0) {
2117 			pr_warning("Couldn't synthesize cgroup events.\n");
2118 			err = 0;
2119 		}
2120 	}
2121 
2122 	if (rec->opts.nr_threads_synthesize > 1) {
2123 		mutex_init(&synth_lock);
2124 		perf_set_multithreaded();
2125 		f = process_locked_synthesized_event;
2126 	}
2127 
2128 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2129 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2130 
2131 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2132 						    rec->evlist->core.threads,
2133 						    f, needs_mmap, opts->sample_address,
2134 						    rec->opts.nr_threads_synthesize);
2135 	}
2136 
2137 	if (rec->opts.nr_threads_synthesize > 1) {
2138 		perf_set_singlethreaded();
2139 		mutex_destroy(&synth_lock);
2140 	}
2141 
2142 out:
2143 	return err;
2144 }
2145 
2146 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2147 {
2148 	struct record *rec = data;
2149 	pthread_kill(rec->thread_id, SIGUSR2);
2150 	return 0;
2151 }
2152 
2153 static int record__setup_sb_evlist(struct record *rec)
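/*
 * Set up the side band evlist: events added with --switch-output-event get
 * a callback that sends SIGUSR2 to the main thread, and, when built with
 * libbpf support, an event is added to receive PERF_RECORD_BPF_EVENT so
 * that BPF programs loaded during the session remain annotatable.
 */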
2154 {
2155 	struct record_opts *opts = &rec->opts;
2156 
2157 	if (rec->sb_evlist != NULL) {
2158 		/*
2159 		 * We get here if --switch-output-event populated the
2160 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2161 		 * to the main thread.
2162 		 */
2163 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2164 		rec->thread_id = pthread_self();
2165 	}
2166 #ifdef HAVE_LIBBPF_SUPPORT
2167 	if (!opts->no_bpf_event) {
2168 		if (rec->sb_evlist == NULL) {
2169 			rec->sb_evlist = evlist__new();
2170 
2171 			if (rec->sb_evlist == NULL) {
2172 				pr_err("Couldn't create side band evlist.\n");
2173 				return -1;
2174 			}
2175 		}
2176 
2177 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2178 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2179 			return -1;
2180 		}
2181 	}
2182 #endif
2183 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2184 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2185 		opts->no_bpf_event = true;
2186 	}
2187 
2188 	return 0;
2189 }
2190 
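/*
 * When -k/--clockid is used, record the clockid and a reference pair of
 * timestamps (gettimeofday() TOD vs. the selected clockid) in the header,
 * so the session time can later be related to wall-clock time.
 */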
2191 static int record__init_clock(struct record *rec)
2192 {
2193 	struct perf_session *session = rec->session;
2194 	struct timespec ref_clockid;
2195 	struct timeval ref_tod;
2196 	u64 ref;
2197 
2198 	if (!rec->opts.use_clockid)
2199 		return 0;
2200 
2201 	if (rec->opts.clockid_res_ns)
2202 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2203 
2204 	session->header.env.clock.clockid = rec->opts.clockid;
2205 
2206 	if (gettimeofday(&ref_tod, NULL) != 0) {
2207 		pr_err("gettimeofday failed, cannot set reference time.\n");
2208 		return -1;
2209 	}
2210 
2211 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2212 		pr_err("clock_gettime failed, cannot set reference time.\n");
2213 		return -1;
2214 	}
2215 
2216 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2217 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2218 
2219 	session->header.env.clock.tod_ns = ref;
2220 
2221 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2222 	      (u64) ref_clockid.tv_nsec;
2223 
2224 	session->header.env.clock.clockid_ns = ref;
2225 	return 0;
2226 }
2227 
2228 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2229 {
2230 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2231 		trigger_hit(&auxtrace_snapshot_trigger);
2232 		auxtrace_record__snapshot_started = 1;
2233 		if (auxtrace_record__snapshot_start(rec->itr))
2234 			trigger_error(&auxtrace_snapshot_trigger);
2235 	}
2236 }
2237 
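/*
 * Ask a worker thread to terminate by closing the write end of its message
 * pipe, then wait for its acknowledgement on the ack pipe.
 */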
2238 static int record__terminate_thread(struct record_thread *thread_data)
2239 {
2240 	int err;
2241 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2242 	pid_t tid = thread_data->tid;
2243 
2244 	close(thread_data->pipes.msg[1]);
2245 	thread_data->pipes.msg[1] = -1;
2246 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2247 	if (err > 0)
2248 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2249 	else
2250 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2251 			   thread->tid, tid);
2252 
2253 	return 0;
2254 }
2255 
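/*
 * In --threads mode, block all signals so they are handled by the main
 * thread only, start one detached reader thread per thread_data entry
 * (pinned to its affinity mask where supported) and wait for each of them
 * to report back over its ack pipe.
 */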
2256 static int record__start_threads(struct record *rec)
2257 {
2258 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2259 	struct record_thread *thread_data = rec->thread_data;
2260 	sigset_t full, mask;
2261 	pthread_t handle;
2262 	pthread_attr_t attrs;
2263 
2264 	thread = &thread_data[0];
2265 
2266 	if (!record__threads_enabled(rec))
2267 		return 0;
2268 
2269 	sigfillset(&full);
2270 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2271 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2272 		return -1;
2273 	}
2274 
2275 	pthread_attr_init(&attrs);
2276 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2277 
2278 	for (t = 1; t < nr_threads; t++) {
2279 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2280 
2281 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2282 		pthread_attr_setaffinity_np(&attrs,
2283 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2284 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2285 #endif
2286 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2287 			for (tt = 1; tt < t; tt++)
2288 				record__terminate_thread(&thread_data[tt]);
2289 			pr_err("Failed to start threads: %s\n", strerror(errno));
2290 			ret = -1;
2291 			goto out_err;
2292 		}
2293 
2294 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2295 		if (err > 0)
2296 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2297 				  thread_msg_tags[msg]);
2298 		else
2299 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2300 				   thread->tid, rec->thread_data[t].tid);
2301 	}
2302 
2303 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2304 			(cpu_set_t *)thread->mask->affinity.bits);
2305 
2306 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2307 
2308 out_err:
2309 	pthread_attr_destroy(&attrs);
2310 
2311 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2312 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2313 		ret = -1;
2314 	}
2315 
2316 	return ret;
2317 }
2318 
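/*
 * Terminate the worker threads and fold their per-thread statistics
 * (samples, wakeups, bytes written/transferred/compressed) into the
 * record/session totals.
 */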
2319 static int record__stop_threads(struct record *rec)
2320 {
2321 	int t;
2322 	struct record_thread *thread_data = rec->thread_data;
2323 
2324 	for (t = 1; t < rec->nr_threads; t++)
2325 		record__terminate_thread(&thread_data[t]);
2326 
2327 	for (t = 0; t < rec->nr_threads; t++) {
2328 		rec->samples += thread_data[t].samples;
2329 		if (!record__threads_enabled(rec))
2330 			continue;
2331 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2332 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2333 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2334 			 thread_data[t].samples, thread_data[t].waking);
2335 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2336 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2337 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2338 		else
2339 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2340 	}
2341 
2342 	return 0;
2343 }
2344 
2345 static unsigned long record__waking(struct record *rec)
2346 {
2347 	int t;
2348 	unsigned long waking = 0;
2349 	struct record_thread *thread_data = rec->thread_data;
2350 
2351 	for (t = 0; t < rec->nr_threads; t++)
2352 		waking += thread_data[t].waking;
2353 
2354 	return waking;
2355 }
2356 
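/*
 * Main body of 'perf record': set up signal handlers, the session, the
 * optional forked workload and worker threads, synthesize the initial
 * non-sample events, then loop reading the mmaped ring buffers until the
 * workload exits or recording is stopped, and finally flush everything,
 * synthesize the tail events and finalize the output.
 */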
2357 static int __cmd_record(struct record *rec, int argc, const char **argv)
2358 {
2359 	int err;
2360 	int status = 0;
2361 	const bool forks = argc > 0;
2362 	struct perf_tool *tool = &rec->tool;
2363 	struct record_opts *opts = &rec->opts;
2364 	struct perf_data *data = &rec->data;
2365 	struct perf_session *session;
2366 	bool disabled = false, draining = false;
2367 	int fd;
2368 	float ratio = 0;
2369 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2370 
2371 	atexit(record__sig_exit);
2372 	signal(SIGCHLD, sig_handler);
2373 	signal(SIGINT, sig_handler);
2374 	signal(SIGTERM, sig_handler);
2375 	signal(SIGSEGV, sigsegv_handler);
2376 
2377 	if (rec->opts.record_cgroup) {
2378 #ifndef HAVE_FILE_HANDLE
2379 		pr_err("cgroup tracking is not supported\n");
2380 		return -1;
2381 #endif
2382 	}
2383 
2384 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2385 		signal(SIGUSR2, snapshot_sig_handler);
2386 		if (rec->opts.auxtrace_snapshot_mode)
2387 			trigger_on(&auxtrace_snapshot_trigger);
2388 		if (rec->switch_output.enabled)
2389 			trigger_on(&switch_output_trigger);
2390 	} else {
2391 		signal(SIGUSR2, SIG_IGN);
2392 	}
2393 
2394 	perf_tool__init(tool, /*ordered_events=*/true);
2395 	tool->sample		= process_sample_event;
2396 	tool->fork		= perf_event__process_fork;
2397 	tool->exit		= perf_event__process_exit;
2398 	tool->comm		= perf_event__process_comm;
2399 	tool->namespaces	= perf_event__process_namespaces;
2400 	tool->mmap		= build_id__process_mmap;
2401 	tool->mmap2		= build_id__process_mmap2;
2402 	tool->itrace_start	= process_timestamp_boundary;
2403 	tool->aux		= process_timestamp_boundary;
2404 	tool->namespace_events	= rec->opts.record_namespaces;
2405 	tool->cgroup_events	= rec->opts.record_cgroup;
2406 	session = perf_session__new(data, tool);
2407 	if (IS_ERR(session)) {
2408 		pr_err("Perf session creation failed.\n");
2409 		return PTR_ERR(session);
2410 	}
2411 
2412 	if (record__threads_enabled(rec)) {
2413 		if (perf_data__is_pipe(&rec->data)) {
2414 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2415 			return -1;
2416 		}
2417 		if (rec->opts.full_auxtrace) {
2418 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2419 			return -1;
2420 		}
2421 	}
2422 
2423 	fd = perf_data__fd(data);
2424 	rec->session = session;
2425 
2426 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2427 		pr_err("Compression initialization failed.\n");
2428 		return -1;
2429 	}
2430 #ifdef HAVE_EVENTFD_SUPPORT
2431 	done_fd = eventfd(0, EFD_NONBLOCK);
2432 	if (done_fd < 0) {
2433 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2434 		status = -1;
2435 		goto out_delete_session;
2436 	}
2437 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2438 	if (err < 0) {
2439 		pr_err("Failed to add wakeup eventfd to poll list\n");
2440 		status = err;
2441 		goto out_delete_session;
2442 	}
2443 #endif // HAVE_EVENTFD_SUPPORT
2444 
2445 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2446 	session->header.env.comp_level = rec->opts.comp_level;
2447 
2448 	if (rec->opts.kcore &&
2449 	    !record__kcore_readable(&session->machines.host)) {
2450 		pr_err("ERROR: kcore is not readable.\n");
2451 		return -1;
2452 	}
2453 
2454 	if (record__init_clock(rec))
2455 		return -1;
2456 
2457 	record__init_features(rec);
2458 
2459 	if (forks) {
2460 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2461 					       workload_exec_failed_signal);
2462 		if (err < 0) {
2463 			pr_err("Couldn't run the workload!\n");
2464 			status = err;
2465 			goto out_delete_session;
2466 		}
2467 	}
2468 
2469 	/*
2470 	 * If we have just a single event and are sending data
2471 	 * through a pipe, we need to force ID allocation,
2472 	 * because we synthesize the event name through the pipe
2473 	 * and need the ID for that.
2474 	 */
2475 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2476 		rec->opts.sample_id = true;
2477 
2478 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2479 		rec->timestamp_filename = false;
2480 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2481 	}
2482 
2483 	evlist__uniquify_name(rec->evlist);
2484 
2485 	evlist__config(rec->evlist, opts, &callchain_param);
2486 
2487 	/* Debug message used by test scripts */
2488 	pr_debug3("perf record opening and mmapping events\n");
2489 	if (record__open(rec) != 0) {
2490 		err = -1;
2491 		goto out_free_threads;
2492 	}
2493 	/* Debug message used by test scripts */
2494 	pr_debug3("perf record done opening and mmapping events\n");
2495 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2496 
2497 	if (rec->opts.kcore) {
2498 		err = record__kcore_copy(&session->machines.host, data);
2499 		if (err) {
2500 			pr_err("ERROR: Failed to copy kcore\n");
2501 			goto out_free_threads;
2502 		}
2503 	}
2504 
2505 	/*
2506 	 * Normally perf_session__new would do this, but it doesn't have the
2507 	 * evlist.
2508 	 */
2509 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2510 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2511 		rec->tool.ordered_events = false;
2512 	}
2513 
2514 	if (evlist__nr_groups(rec->evlist) == 0)
2515 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2516 
2517 	if (data->is_pipe) {
2518 		err = perf_header__write_pipe(fd);
2519 		if (err < 0)
2520 			goto out_free_threads;
2521 	} else {
2522 		err = perf_session__write_header(session, rec->evlist, fd, false);
2523 		if (err < 0)
2524 			goto out_free_threads;
2525 	}
2526 
2527 	err = -1;
2528 	if (!rec->no_buildid
2529 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2530 		pr_err("Couldn't generate buildids. "
2531 		       "Use --no-buildid to profile anyway.\n");
2532 		goto out_free_threads;
2533 	}
2534 
2535 	err = record__setup_sb_evlist(rec);
2536 	if (err)
2537 		goto out_free_threads;
2538 
2539 	err = record__synthesize(rec, false);
2540 	if (err < 0)
2541 		goto out_free_threads;
2542 
2543 	if (rec->realtime_prio) {
2544 		struct sched_param param;
2545 
2546 		param.sched_priority = rec->realtime_prio;
2547 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2548 			pr_err("Could not set realtime priority.\n");
2549 			err = -1;
2550 			goto out_free_threads;
2551 		}
2552 	}
2553 
2554 	if (record__start_threads(rec))
2555 		goto out_free_threads;
2556 
2557 	/*
2558 	 * When perf is starting the traced process, all the events
2559 	 * (apart from group members) have enable_on_exec=1 set,
2560 	 * so don't spoil it by prematurely enabling them.
2561 	 */
2562 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2563 		evlist__enable(rec->evlist);
2564 
2565 	/*
2566 	 * Let the child rip
2567 	 */
2568 	if (forks) {
2569 		struct machine *machine = &session->machines.host;
2570 		union perf_event *event;
2571 		pid_t tgid;
2572 
2573 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2574 		if (event == NULL) {
2575 			err = -ENOMEM;
2576 			goto out_child;
2577 		}
2578 
2579 		/*
2580 		 * Some H/W events are generated before the COMM event,
2581 		 * which is emitted during exec(), so perf script
2582 		 * cannot see the correct process name for those events.
2583 		 * Synthesize a COMM event to prevent that.
2584 		 */
2585 		tgid = perf_event__synthesize_comm(tool, event,
2586 						   rec->evlist->workload.pid,
2587 						   process_synthesized_event,
2588 						   machine);
2589 		free(event);
2590 
2591 		if (tgid == -1)
2592 			goto out_child;
2593 
2594 		event = malloc(sizeof(event->namespaces) +
2595 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2596 			       machine->id_hdr_size);
2597 		if (event == NULL) {
2598 			err = -ENOMEM;
2599 			goto out_child;
2600 		}
2601 
2602 		/*
2603 		 * Synthesize NAMESPACES event for the command specified.
2604 		 */
2605 		perf_event__synthesize_namespaces(tool, event,
2606 						  rec->evlist->workload.pid,
2607 						  tgid, process_synthesized_event,
2608 						  machine);
2609 		free(event);
2610 
2611 		evlist__start_workload(rec->evlist);
2612 	}
2613 
2614 	if (opts->target.initial_delay) {
2615 		pr_info(EVLIST_DISABLED_MSG);
2616 		if (opts->target.initial_delay > 0) {
2617 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2618 			evlist__enable(rec->evlist);
2619 			pr_info(EVLIST_ENABLED_MSG);
2620 		}
2621 	}
2622 
2623 	err = event_enable_timer__start(rec->evlist->eet);
2624 	if (err)
2625 		goto out_child;
2626 
2627 	/* Debug message used by test scripts */
2628 	pr_debug3("perf record has started\n");
2629 	fflush(stderr);
2630 
2631 	trigger_ready(&auxtrace_snapshot_trigger);
2632 	trigger_ready(&switch_output_trigger);
2633 	perf_hooks__invoke_record_start();
2634 
2635 	/*
2636 	 * Must write FINISHED_INIT so it will be seen after all other
2637 	 * synthesized user events, but before any regular events.
2638 	 */
2639 	err = write_finished_init(rec, false);
2640 	if (err < 0)
2641 		goto out_child;
2642 
2643 	for (;;) {
2644 		unsigned long long hits = thread->samples;
2645 
2646 		/*
2647 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2648 		 * here: when done == true and hits != rec->samples
2649 		 * in the previous round.
2650 		 *
2651 		 * evlist__toggle_bkw_mmap ensures we never convert
2652 		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2653 		 */
2654 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2655 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2656 
2657 		if (record__mmap_read_all(rec, false) < 0) {
2658 			trigger_error(&auxtrace_snapshot_trigger);
2659 			trigger_error(&switch_output_trigger);
2660 			err = -1;
2661 			goto out_child;
2662 		}
2663 
2664 		if (auxtrace_record__snapshot_started) {
2665 			auxtrace_record__snapshot_started = 0;
2666 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2667 				record__read_auxtrace_snapshot(rec, false);
2668 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2669 				pr_err("AUX area tracing snapshot failed\n");
2670 				err = -1;
2671 				goto out_child;
2672 			}
2673 		}
2674 
2675 		if (trigger_is_hit(&switch_output_trigger)) {
2676 			/*
2677 			 * If switch_output_trigger is hit, the data in
2678 			 * overwritable ring buffer should have been collected,
2679 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2680 			 *
2681 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2682 			 * record__mmap_read_all() didn't collect data from the
2683 			 * overwritable ring buffer. Read again.
2684 			 */
2685 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2686 				continue;
2687 			trigger_ready(&switch_output_trigger);
2688 
2689 			/*
2690 			 * Reenable events in overwrite ring buffer after
2691 			 * record__mmap_read_all(): we should have collected
2692 			 * data from it.
2693 			 */
2694 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2695 
2696 			if (!quiet)
2697 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2698 					record__waking(rec));
2699 			thread->waking = 0;
2700 			fd = record__switch_output(rec, false);
2701 			if (fd < 0) {
2702 				pr_err("Failed to switch to new file\n");
2703 				trigger_error(&switch_output_trigger);
2704 				err = fd;
2705 				goto out_child;
2706 			}
2707 
2708 			/* re-arm the alarm */
2709 			if (rec->switch_output.time)
2710 				alarm(rec->switch_output.time);
2711 		}
2712 
2713 		if (hits == thread->samples) {
2714 			if (done || draining)
2715 				break;
2716 			err = fdarray__poll(&thread->pollfd, -1);
2717 			/*
2718 			 * Propagate an error only if there is one: a positive number
2719 			 * of returned events and an interrupt error (EINTR) are ignored.
2720 			 */
2721 			if (err > 0 || (err < 0 && errno == EINTR))
2722 				err = 0;
2723 			thread->waking++;
2724 
2725 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2726 					    record__thread_munmap_filtered, NULL) == 0)
2727 				draining = true;
2728 
2729 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2730 			if (err)
2731 				goto out_child;
2732 		}
2733 
2734 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2735 			switch (cmd) {
2736 			case EVLIST_CTL_CMD_SNAPSHOT:
2737 				hit_auxtrace_snapshot_trigger(rec);
2738 				evlist__ctlfd_ack(rec->evlist);
2739 				break;
2740 			case EVLIST_CTL_CMD_STOP:
2741 				done = 1;
2742 				break;
2743 			case EVLIST_CTL_CMD_ACK:
2744 			case EVLIST_CTL_CMD_UNSUPPORTED:
2745 			case EVLIST_CTL_CMD_ENABLE:
2746 			case EVLIST_CTL_CMD_DISABLE:
2747 			case EVLIST_CTL_CMD_EVLIST:
2748 			case EVLIST_CTL_CMD_PING:
2749 			default:
2750 				break;
2751 			}
2752 		}
2753 
2754 		err = event_enable_timer__process(rec->evlist->eet);
2755 		if (err < 0)
2756 			goto out_child;
2757 		if (err) {
2758 			err = 0;
2759 			done = 1;
2760 		}
2761 
2762 		/*
2763 		 * When perf is starting the traced process, the events die
2764 		 * with the process at the end and we wait for that. Thus
2765 		 * there is no need to disable events in this case.
2766 		 */
2767 		if (done && !disabled && !target__none(&opts->target)) {
2768 			trigger_off(&auxtrace_snapshot_trigger);
2769 			evlist__disable(rec->evlist);
2770 			disabled = true;
2771 		}
2772 	}
2773 
2774 	trigger_off(&auxtrace_snapshot_trigger);
2775 	trigger_off(&switch_output_trigger);
2776 
2777 	if (opts->auxtrace_snapshot_on_exit)
2778 		record__auxtrace_snapshot_exit(rec);
2779 
2780 	if (forks && workload_exec_errno) {
2781 		char msg[STRERR_BUFSIZE], strevsels[2048];
2782 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2783 
2784 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2785 
2786 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2787 			strevsels, argv[0], emsg);
2788 		err = -1;
2789 		goto out_child;
2790 	}
2791 
2792 	if (!quiet)
2793 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2794 			record__waking(rec));
2795 
2796 	write_finished_init(rec, true);
2797 
2798 	if (target__none(&rec->opts.target))
2799 		record__synthesize_workload(rec, true);
2800 
2801 out_child:
2802 	record__stop_threads(rec);
2803 	record__mmap_read_all(rec, true);
2804 out_free_threads:
2805 	record__free_thread_data(rec);
2806 	evlist__finalize_ctlfd(rec->evlist);
2807 	record__aio_mmap_read_sync(rec);
2808 
2809 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2810 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2811 		session->header.env.comp_ratio = ratio + 0.5;
2812 	}
2813 
2814 	if (forks) {
2815 		int exit_status;
2816 
2817 		if (!child_finished)
2818 			kill(rec->evlist->workload.pid, SIGTERM);
2819 
2820 		wait(&exit_status);
2821 
2822 		if (err < 0)
2823 			status = err;
2824 		else if (WIFEXITED(exit_status))
2825 			status = WEXITSTATUS(exit_status);
2826 		else if (WIFSIGNALED(exit_status))
2827 			signr = WTERMSIG(exit_status);
2828 	} else
2829 		status = err;
2830 
2831 	if (rec->off_cpu)
2832 		rec->bytes_written += off_cpu_write(rec->session);
2833 
2834 	record__read_lost_samples(rec);
2835 	record__synthesize(rec, true);
2836 	/* this will be recalculated during process_buildids() */
2837 	rec->samples = 0;
2838 
2839 	if (!err) {
2840 		if (!rec->timestamp_filename) {
2841 			record__finish_output(rec);
2842 		} else {
2843 			fd = record__switch_output(rec, true);
2844 			if (fd < 0) {
2845 				status = fd;
2846 				goto out_delete_session;
2847 			}
2848 		}
2849 	}
2850 
2851 	perf_hooks__invoke_record_end();
2852 
2853 	if (!err && !quiet) {
2854 		char samples[128];
2855 		const char *postfix = rec->timestamp_filename ?
2856 					".<timestamp>" : "";
2857 
2858 		if (rec->samples && !rec->opts.full_auxtrace)
2859 			scnprintf(samples, sizeof(samples),
2860 				  " (%" PRIu64 " samples)", rec->samples);
2861 		else
2862 			samples[0] = '\0';
2863 
2864 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2865 			perf_data__size(data) / 1024.0 / 1024.0,
2866 			data->path, postfix, samples);
2867 		if (ratio) {
2868 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2869 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2870 					ratio);
2871 		}
2872 		fprintf(stderr, " ]\n");
2873 	}
2874 
2875 out_delete_session:
2876 #ifdef HAVE_EVENTFD_SUPPORT
2877 	if (done_fd >= 0) {
2878 		fd = done_fd;
2879 		done_fd = -1;
2880 
2881 		close(fd);
2882 	}
2883 #endif
2884 	zstd_fini(&session->zstd_data);
2885 	if (!opts->no_bpf_event)
2886 		evlist__stop_sb_thread(rec->sb_evlist);
2887 
2888 	perf_session__delete(session);
2889 	return status;
2890 }
2891 
2892 static void callchain_debug(struct callchain_param *callchain)
2893 {
2894 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2895 
2896 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2897 
2898 	if (callchain->record_mode == CALLCHAIN_DWARF)
2899 		pr_debug("callchain: stack dump size %d\n",
2900 			 callchain->dump_size);
2901 }
2902 
2903 int record_opts__parse_callchain(struct record_opts *record,
2904 				 struct callchain_param *callchain,
2905 				 const char *arg, bool unset)
2906 {
2907 	int ret;
2908 	callchain->enabled = !unset;
2909 
2910 	/* --no-call-graph */
2911 	if (unset) {
2912 		callchain->record_mode = CALLCHAIN_NONE;
2913 		pr_debug("callchain: disabled\n");
2914 		return 0;
2915 	}
2916 
2917 	ret = parse_callchain_record_opt(arg, callchain);
2918 	if (!ret) {
2919 		/* Enable data address sampling for DWARF unwind. */
2920 		if (callchain->record_mode == CALLCHAIN_DWARF)
2921 			record->sample_address = true;
2922 		callchain_debug(callchain);
2923 	}
2924 
2925 	return ret;
2926 }
2927 
2928 int record_parse_callchain_opt(const struct option *opt,
2929 			       const char *arg,
2930 			       int unset)
2931 {
2932 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2933 }
2934 
2935 int record_callchain_opt(const struct option *opt,
2936 			 const char *arg __maybe_unused,
2937 			 int unset __maybe_unused)
2938 {
2939 	struct callchain_param *callchain = opt->value;
2940 
2941 	callchain->enabled = true;
2942 
2943 	if (callchain->record_mode == CALLCHAIN_NONE)
2944 		callchain->record_mode = CALLCHAIN_FP;
2945 
2946 	callchain_debug(callchain);
2947 	return 0;
2948 }
2949 
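/*
 * Handle the 'perf config' keys consumed below; e.g. with illustrative
 * values in a perfconfig file:
 *
 *   [record]
 *       build-id = mmap      # cache | no-cache | skip | mmap
 *       call-graph = dwarf   # forwarded as call-graph.record-mode
 *       aio = 4              # nr of AIO control blocks (with HAVE_AIO_SUPPORT)
 *       debuginfod = <urls>  # enable debuginfod downloads from these URLs
 */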
2950 static int perf_record_config(const char *var, const char *value, void *cb)
2951 {
2952 	struct record *rec = cb;
2953 
2954 	if (!strcmp(var, "record.build-id")) {
2955 		if (!strcmp(value, "cache"))
2956 			rec->no_buildid_cache = false;
2957 		else if (!strcmp(value, "no-cache"))
2958 			rec->no_buildid_cache = true;
2959 		else if (!strcmp(value, "skip"))
2960 			rec->no_buildid = true;
2961 		else if (!strcmp(value, "mmap"))
2962 			rec->buildid_mmap = true;
2963 		else
2964 			return -1;
2965 		return 0;
2966 	}
2967 	if (!strcmp(var, "record.call-graph")) {
2968 		var = "call-graph.record-mode";
2969 		return perf_default_config(var, value, cb);
2970 	}
2971 #ifdef HAVE_AIO_SUPPORT
2972 	if (!strcmp(var, "record.aio")) {
2973 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2974 		if (!rec->opts.nr_cblocks)
2975 			rec->opts.nr_cblocks = nr_cblocks_default;
2976 	}
2977 #endif
2978 	if (!strcmp(var, "record.debuginfod")) {
2979 		rec->debuginfod.urls = strdup(value);
2980 		if (!rec->debuginfod.urls)
2981 			return -ENOMEM;
2982 		rec->debuginfod.set = true;
2983 	}
2984 
2985 	return 0;
2986 }
2987 
2988 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2989 {
2990 	struct record *rec = (struct record *)opt->value;
2991 
2992 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2993 }
2994 
2995 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2996 {
2997 	struct record_opts *opts = (struct record_opts *)opt->value;
2998 
2999 	if (unset || !str)
3000 		return 0;
3001 
3002 	if (!strcasecmp(str, "node"))
3003 		opts->affinity = PERF_AFFINITY_NODE;
3004 	else if (!strcasecmp(str, "cpu"))
3005 		opts->affinity = PERF_AFFINITY_CPU;
3006 
3007 	return 0;
3008 }
3009 
3010 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3011 {
3012 	mask->nbits = nr_bits;
3013 	mask->bits = bitmap_zalloc(mask->nbits);
3014 	if (!mask->bits)
3015 		return -ENOMEM;
3016 
3017 	return 0;
3018 }
3019 
3020 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3021 {
3022 	bitmap_free(mask->bits);
3023 	mask->nbits = 0;
3024 }
3025 
3026 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3027 {
3028 	int ret;
3029 
3030 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3031 	if (ret) {
3032 		mask->affinity.bits = NULL;
3033 		return ret;
3034 	}
3035 
3036 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3037 	if (ret) {
3038 		record__mmap_cpu_mask_free(&mask->maps);
3039 		mask->maps.bits = NULL;
3040 	}
3041 
3042 	return ret;
3043 }
3044 
3045 static void record__thread_mask_free(struct thread_mask *mask)
3046 {
3047 	record__mmap_cpu_mask_free(&mask->maps);
3048 	record__mmap_cpu_mask_free(&mask->affinity);
3049 }
3050 
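/*
 * Parse the --threads argument: an empty value defaults to one thread per
 * CPU; "cpu", "core", "package" and "numa" select a predefined layout;
 * anything else is kept verbatim as a user defined spec (THREAD_SPEC__USER)
 * to be resolved later.
 */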
3051 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3052 {
3053 	int s;
3054 	struct record_opts *opts = opt->value;
3055 
3056 	if (unset || !str || !strlen(str)) {
3057 		opts->threads_spec = THREAD_SPEC__CPU;
3058 	} else {
3059 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3060 			if (s == THREAD_SPEC__USER) {
3061 				opts->threads_user_spec = strdup(str);
3062 				if (!opts->threads_user_spec)
3063 					return -ENOMEM;
3064 				opts->threads_spec = THREAD_SPEC__USER;
3065 				break;
3066 			}
3067 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3068 				opts->threads_spec = s;
3069 				break;
3070 			}
3071 		}
3072 	}
3073 
3074 	if (opts->threads_spec == THREAD_SPEC__USER)
3075 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3076 	else
3077 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3078 
3079 	return 0;
3080 }
3081 
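/*
 * Parse --max-size values such as "100M" or "2G" into bytes, using the
 * binary suffixes B, K (2^10), M (2^20) and G (2^30).
 */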
3082 static int parse_output_max_size(const struct option *opt,
3083 				 const char *str, int unset)
3084 {
3085 	unsigned long *s = (unsigned long *)opt->value;
3086 	static struct parse_tag tags_size[] = {
3087 		{ .tag  = 'B', .mult = 1       },
3088 		{ .tag  = 'K', .mult = 1 << 10 },
3089 		{ .tag  = 'M', .mult = 1 << 20 },
3090 		{ .tag  = 'G', .mult = 1 << 30 },
3091 		{ .tag  = 0 },
3092 	};
3093 	unsigned long val;
3094 
3095 	if (unset) {
3096 		*s = 0;
3097 		return 0;
3098 	}
3099 
3100 	val = parse_tag_value(str, tags_size);
3101 	if (val != (unsigned long) -1) {
3102 		*s = val;
3103 		return 0;
3104 	}
3105 
3106 	return -1;
3107 }
3108 
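/*
 * Parse -m/--mmap-pages: the first value sets the data mmap size and an
 * optional second value after a comma sets the AUX area mmap size,
 * e.g. "-m 512,128" for 512 data pages and 128 AUX area pages.
 */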
3109 static int record__parse_mmap_pages(const struct option *opt,
3110 				    const char *str,
3111 				    int unset __maybe_unused)
3112 {
3113 	struct record_opts *opts = opt->value;
3114 	char *s, *p;
3115 	unsigned int mmap_pages;
3116 	int ret;
3117 
3118 	if (!str)
3119 		return -EINVAL;
3120 
3121 	s = strdup(str);
3122 	if (!s)
3123 		return -ENOMEM;
3124 
3125 	p = strchr(s, ',');
3126 	if (p)
3127 		*p = '\0';
3128 
3129 	if (*s) {
3130 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3131 		if (ret)
3132 			goto out_free;
3133 		opts->mmap_pages = mmap_pages;
3134 	}
3135 
3136 	if (!p) {
3137 		ret = 0;
3138 		goto out_free;
3139 	}
3140 
3141 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3142 	if (ret)
3143 		goto out_free;
3144 
3145 	opts->auxtrace_mmap_pages = mmap_pages;
3146 
3147 out_free:
3148 	free(s);
3149 	return ret;
3150 }
3151 
3152 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3153 {
3154 }
3155 
3156 static int parse_control_option(const struct option *opt,
3157 				const char *str,
3158 				int unset __maybe_unused)
3159 {
3160 	struct record_opts *opts = opt->value;
3161 
3162 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3163 }
3164 
3165 static void switch_output_size_warn(struct record *rec)
3166 {
3167 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3168 	struct switch_output *s = &rec->switch_output;
3169 
3170 	wakeup_size /= 2;
3171 
3172 	if (s->size < wakeup_size) {
3173 		char buf[100];
3174 
3175 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3176 		pr_warning("WARNING: switch-output data size is lower than the "
3177 			   "wakeup kernel buffer size (%s), "
3178 			   "expect bigger perf.data sizes\n", buf);
3179 	}
3180 }
3181 
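/*
 * Configure --switch-output: "signal" rotates the output on SIGUSR2, a
 * size such as "1G" rotates once that much data has been written, and a
 * time such as "30s" or "5m" rotates periodically.  --switch-output-event
 * implies "signal".  Not available in parallel streaming mode.
 */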
3182 static int switch_output_setup(struct record *rec)
3183 {
3184 	struct switch_output *s = &rec->switch_output;
3185 	static struct parse_tag tags_size[] = {
3186 		{ .tag  = 'B', .mult = 1       },
3187 		{ .tag  = 'K', .mult = 1 << 10 },
3188 		{ .tag  = 'M', .mult = 1 << 20 },
3189 		{ .tag  = 'G', .mult = 1 << 30 },
3190 		{ .tag  = 0 },
3191 	};
3192 	static struct parse_tag tags_time[] = {
3193 		{ .tag  = 's', .mult = 1        },
3194 		{ .tag  = 'm', .mult = 60       },
3195 		{ .tag  = 'h', .mult = 60*60    },
3196 		{ .tag  = 'd', .mult = 60*60*24 },
3197 		{ .tag  = 0 },
3198 	};
3199 	unsigned long val;
3200 
3201 	/*
3202 	 * If we're using --switch-output-event, then we imply
3203 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3204 	 * thread to its parent.
3205 	 */
3206 	if (rec->switch_output_event_set) {
3207 		if (record__threads_enabled(rec)) {
3208 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3209 			return 0;
3210 		}
3211 		goto do_signal;
3212 	}
3213 
3214 	if (!s->set)
3215 		return 0;
3216 
3217 	if (record__threads_enabled(rec)) {
3218 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3219 		return 0;
3220 	}
3221 
3222 	if (!strcmp(s->str, "signal")) {
3223 do_signal:
3224 		s->signal = true;
3225 		pr_debug("switch-output with SIGUSR2 signal\n");
3226 		goto enabled;
3227 	}
3228 
3229 	val = parse_tag_value(s->str, tags_size);
3230 	if (val != (unsigned long) -1) {
3231 		s->size = val;
3232 		pr_debug("switch-output with %s size threshold\n", s->str);
3233 		goto enabled;
3234 	}
3235 
3236 	val = parse_tag_value(s->str, tags_time);
3237 	if (val != (unsigned long) -1) {
3238 		s->time = val;
3239 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3240 			 s->str, s->time);
3241 		goto enabled;
3242 	}
3243 
3244 	return -1;
3245 
3246 enabled:
3247 	rec->timestamp_filename = true;
3248 	s->enabled              = true;
3249 
3250 	if (s->size && !rec->opts.no_buffering)
3251 		switch_output_size_warn(rec);
3252 
3253 	return 0;
3254 }
3255 
3256 static const char * const __record_usage[] = {
3257 	"perf record [<options>] [<command>]",
3258 	"perf record [<options>] -- <command> [<options>]",
3259 	NULL
3260 };
3261 const char * const *record_usage = __record_usage;
3262 
3263 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3264 				  struct perf_sample *sample, struct machine *machine)
3265 {
3266 	/*
3267 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3268 	 * so there is no need to add them twice.
3269 	 */
3270 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3271 		return 0;
3272 	return perf_event__process_mmap(tool, event, sample, machine);
3273 }
3274 
3275 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3276 				   struct perf_sample *sample, struct machine *machine)
3277 {
3278 	/*
3279 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3280 	 * so there is no need to add them twice.
3281 	 */
3282 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3283 		return 0;
3284 
3285 	return perf_event__process_mmap2(tool, event, sample, machine);
3286 }
3287 
3288 static int process_timestamp_boundary(const struct perf_tool *tool,
3289 				      union perf_event *event __maybe_unused,
3290 				      struct perf_sample *sample,
3291 				      struct machine *machine __maybe_unused)
3292 {
3293 	struct record *rec = container_of(tool, struct record, tool);
3294 
3295 	set_timestamp_boundary(rec, sample->time);
3296 	return 0;
3297 }
3298 
3299 static int parse_record_synth_option(const struct option *opt,
3300 				     const char *str,
3301 				     int unset __maybe_unused)
3302 {
3303 	struct record_opts *opts = opt->value;
3304 	char *p = strdup(str);
3305 
3306 	if (p == NULL)
3307 		return -1;
3308 
3309 	opts->synth = parse_synth_opt(p);
3310 	free(p);
3311 
3312 	if (opts->synth < 0) {
3313 		pr_err("Invalid synth option: %s\n", str);
3314 		return -1;
3315 	}
3316 	return 0;
3317 }
3318 
3319 /*
3320  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
3321  * because we need to have access to it in record__exit, which is called
3322  * after cmd_record() exits, but since record_options needs to be accessible to
3323  * builtin-script, leave it here.
3324  *
3325  * At least we don't touch it in all the other functions here directly.
3326  *
3327  * Just say no to tons of global variables, sigh.
3328  */
3329 static struct record record = {
3330 	.opts = {
3331 		.sample_time	     = true,
3332 		.mmap_pages	     = UINT_MAX,
3333 		.user_freq	     = UINT_MAX,
3334 		.user_interval	     = ULLONG_MAX,
3335 		.freq		     = 4000,
3336 		.target		     = {
3337 			.uses_mmap   = true,
3338 			.default_per_cpu = true,
3339 		},
3340 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3341 		.nr_threads_synthesize = 1,
3342 		.ctl_fd              = -1,
3343 		.ctl_fd_ack          = -1,
3344 		.synth               = PERF_SYNTH_ALL,
3345 	},
3346 };
3347 
3348 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3349 	"\n\t\t\t\tDefault: fp";
3350 
3351 static bool dry_run;
3352 
3353 static struct parse_events_option_args parse_events_option_args = {
3354 	.evlistp = &record.evlist,
3355 };
3356 
3357 static struct parse_events_option_args switch_output_parse_events_option_args = {
3358 	.evlistp = &record.sb_evlist,
3359 };
3360 
3361 /*
3362  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3363  * with it and switch to use the library functions in perf_evlist that came
3364  * from builtin-record.c, i.e. use record_opts,
3365  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3366  * using pipes, etc.
3367  */
3368 static struct option __record_options[] = {
3369 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3370 		     "event selector. use 'perf list' to list available events",
3371 		     parse_events_option),
3372 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3373 		     "event filter", parse_filter),
3374 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3375 			   NULL, "don't record events from perf itself",
3376 			   exclude_perf),
3377 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3378 		    "record events on existing process id"),
3379 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3380 		    "record events on existing thread id"),
3381 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3382 		    "collect data with this RT SCHED_FIFO priority"),
3383 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3384 		    "collect data without buffering"),
3385 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3386 		    "collect raw sample records from all opened counters"),
3387 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3388 			    "system-wide collection from all CPUs"),
3389 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3390 		    "list of cpus to monitor"),
3391 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3392 	OPT_STRING('o', "output", &record.data.path, "file",
3393 		    "output file name"),
3394 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3395 			&record.opts.no_inherit_set,
3396 			"child tasks do not inherit counters"),
3397 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3398 		    "synthesize non-sample events at the end of output"),
3399 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3400 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3401 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3402 		    "Fail if the specified frequency can't be used"),
3403 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3404 		     "profile at this frequency",
3405 		      record__parse_freq),
3406 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3407 		     "number of mmap data pages and AUX area tracing mmap pages",
3408 		     record__parse_mmap_pages),
3409 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3410 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3411 		     record__mmap_flush_parse),
3412 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3413 			   NULL, "enables call-graph recording" ,
3414 			   &record_callchain_opt),
3415 	OPT_CALLBACK(0, "call-graph", &record.opts,
3416 		     "record_mode[,record_size]", record_callchain_help,
3417 		     &record_parse_callchain_opt),
3418 	OPT_INCR('v', "verbose", &verbose,
3419 		    "be more verbose (show counter open errors, etc)"),
3420 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3421 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3422 		    "per thread counts"),
3423 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3424 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3425 		    "Record the sample physical addresses"),
3426 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3427 		    "Record the sampled data address data page size"),
3428 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3429 		    "Record the sampled code address (ip) page size"),
3430 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3431 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3432 		    "Record the sample identifier"),
3433 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3434 			&record.opts.sample_time_set,
3435 			"Record the sample timestamps"),
3436 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3437 			"Record the sample period"),
3438 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3439 		    "don't sample"),
3440 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3441 			&record.no_buildid_cache_set,
3442 			"do not update the buildid cache"),
3443 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3444 			&record.no_buildid_set,
3445 			"do not collect buildids in perf.data"),
3446 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3447 		     "monitor event in cgroup name only",
3448 		     parse_cgroups),
3449 	OPT_CALLBACK('D', "delay", &record, "ms",
3450 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3451 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3452 		     record__parse_event_enable_time),
3453 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3454 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3455 		   "user to profile"),
3456 
3457 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3458 		     "branch any", "sample any taken branches",
3459 		     parse_branch_stack),
3460 
3461 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3462 		     "branch filter mask", "branch stack filter modes",
3463 		     parse_branch_stack),
3464 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3465 		    "sample by weight (on special events only)"),
3466 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3467 		    "sample transaction flags (special events only)"),
3468 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3469 		    "use per-thread mmaps"),
3470 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3471 		    "sample selected machine registers on interrupt,"
3472 		    " use '-I?' to list register names", parse_intr_regs),
3473 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3474 		    "sample selected machine registers on interrupt,"
3475 		    " use '--user-regs=?' to list register names", parse_user_regs),
3476 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3477 		    "Record running/enabled time of read (:S) events"),
3478 	OPT_CALLBACK('k', "clockid", &record.opts,
3479 	"clockid", "clockid to use for events, see clock_gettime()",
3480 	parse_clockid),
3481 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3482 			  "opts", "AUX area tracing Snapshot Mode", ""),
3483 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3484 			  "opts", "sample AUX area", ""),
3485 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3486 			"per thread proc mmap processing timeout in ms"),
3487 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3488 		    "Record namespaces events"),
3489 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3490 		    "Record cgroup events"),
3491 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3492 			&record.opts.record_switch_events_set,
3493 			"Record context switch events"),
3494 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3495 			 "Configure all used events to run in kernel space.",
3496 			 PARSE_OPT_EXCLUSIVE),
3497 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3498 			 "Configure all used events to run in user space.",
3499 			 PARSE_OPT_EXCLUSIVE),
3500 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3501 		    "collect kernel callchains"),
3502 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3503 		    "collect user callchains"),
3504 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3505 		   "file", "vmlinux pathname"),
3506 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3507 		    "Record build-id of all DSOs regardless of hits"),
3508 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3509 		    "Record build-id in map events"),
3510 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3511 		    "append timestamp to output filename"),
3512 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3513 		    "Record timestamp boundary (time of first/last samples)"),
3514 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3515 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3516 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3517 			  "signal"),
3518 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3519 			 &record.switch_output_event_set, "switch output event",
3520 			 "switch output event selector. use 'perf list' to list available events",
3521 			 parse_events_option_new_evlist),
3522 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3523 		   "Limit number of switch output generated files"),
3524 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3525 		    "Parse options then exit"),
3526 #ifdef HAVE_AIO_SUPPORT
3527 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3528 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3529 		     record__aio_parse),
3530 #endif
3531 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3532 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3533 		     record__parse_affinity),
3534 #ifdef HAVE_ZSTD_SUPPORT
3535 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3536 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3537 			    record__parse_comp_level),
3538 #endif
3539 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3540 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3541 	OPT_UINTEGER(0, "num-thread-synthesize",
3542 		     &record.opts.nr_threads_synthesize,
3543 		     "number of threads to run for event synthesis"),
3544 #ifdef HAVE_LIBPFM
3545 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3546 		"libpfm4 event selector. use 'perf list' to list available events",
3547 		parse_libpfm_events_option),
3548 #endif
3549 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3550 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3551 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3552 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3553 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3554 		      parse_control_option),
3555 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3556 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3557 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3558 			  &record.debuginfod.set, "debuginfod urls",
3559 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3560 			  "system"),
3561 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3562 			    "write collected trace data into several data files using parallel threads",
3563 			    record__parse_threads),
3564 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3565 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3566 		   "BPF filter action"),
3567 	OPT_END()
3568 };
3569 
3570 struct option *record_options = __record_options;
3571 
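/*
 * Fill @mask with one bit per CPU in @cpus. A dummy (per-thread) CPU map
 * leaves the mask untouched; a CPU number larger than the mask size is
 * rejected with -ENODEV.
 */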
3572 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3573 {
3574 	struct perf_cpu cpu;
3575 	int idx;
3576 
3577 	if (cpu_map__is_dummy(cpus))
3578 		return 0;
3579 
3580 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3581 		/* Return -ENODEV if the input cpu is greater than the max cpu */
3582 		if ((unsigned long)cpu.cpu > mask->nbits)
3583 			return -ENODEV;
3584 		__set_bit(cpu.cpu, mask->bits);
3585 	}
3586 
3587 	return 0;
3588 }
3589 
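/*
 * Initialize @mask from a CPU list string such as "0-3,8", parsed by
 * perf_cpu_map__new().
 */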
3590 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3591 {
3592 	struct perf_cpu_map *cpus;
3593 	int ret = 0;
3594 
3595 	cpus = perf_cpu_map__new(mask_spec);
3596 	if (!cpus)
3597 		return -ENOMEM;
3598 
3599 	bitmap_zero(mask->bits, mask->nbits);
3600 	/* Put the CPU map even if the init fails, to avoid leaking it. */
3601 	if (record__mmap_cpu_mask_init(mask, cpus))
3602 		ret = -ENODEV;
3603 	perf_cpu_map__put(cpus);
3604 	return ret;
3605 }
3606 
3607 static void record__free_thread_masks(struct record *rec, int nr_threads)
3608 {
3609 	int t;
3610 
3611 	if (rec->thread_masks)
3612 		for (t = 0; t < nr_threads; t++)
3613 			record__thread_mask_free(&rec->thread_masks[t]);
3614 
3615 	zfree(&rec->thread_masks);
3616 }
3617 
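/* Allocate @nr_threads thread masks, each wide enough to hold @nr_bits CPU bits. */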
3618 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3619 {
3620 	int t, ret;
3621 
3622 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3623 	if (!rec->thread_masks) {
3624 		pr_err("Failed to allocate thread masks\n");
3625 		return -ENOMEM;
3626 	}
3627 
3628 	for (t = 0; t < nr_threads; t++) {
3629 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3630 		if (ret) {
3631 			pr_err("Failed to allocate thread masks[%d]\n", t);
3632 			goto out_free;
3633 		}
3634 	}
3635 
3636 	return 0;
3637 
3638 out_free:
3639 	record__free_thread_masks(rec, nr_threads);
3640 
3641 	return ret;
3642 }
3643 
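/*
 * --threads=cpu: one data streaming thread per recorded CPU, with each
 * thread reading from and pinned to its own CPU. For example, recording
 * CPUs 0-3 creates 4 threads, thread 0 handling CPU 0, and so on.
 */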
3644 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3645 {
3646 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3647 
3648 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3649 	if (ret)
3650 		return ret;
3651 
3652 	rec->nr_threads = nr_cpus;
3653 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3654 
3655 	for (t = 0; t < rec->nr_threads; t++) {
3656 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3657 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3658 		if (verbose > 0) {
3659 			pr_debug("thread_masks[%d]: ", t);
3660 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3661 			pr_debug("thread_masks[%d]: ", t);
3662 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3663 		}
3664 	}
3665 
3666 	return 0;
3667 }
3668 
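/*
 * Build one thread mask per (maps_spec[s], affinity_spec[s]) pair. Each
 * spec is intersected with the set of recorded CPUs, must not end up
 * empty and must not overlap with the masks accepted so far.
 */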
3669 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3670 					  const char **maps_spec, const char **affinity_spec,
3671 					  u32 nr_spec)
3672 {
3673 	u32 s;
3674 	int ret = 0, t = 0;
3675 	struct mmap_cpu_mask cpus_mask;
3676 	struct thread_mask thread_mask, full_mask, *thread_masks;
3677 
3678 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3679 	if (ret) {
3680 		pr_err("Failed to allocate CPUs mask\n");
3681 		return ret;
3682 	}
3683 
3684 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3685 	if (ret) {
3686 		pr_err("Failed to init cpu mask\n");
3687 		goto out_free_cpu_mask;
3688 	}
3689 
3690 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3691 	if (ret) {
3692 		pr_err("Failed to allocate full mask\n");
3693 		goto out_free_cpu_mask;
3694 	}
3695 
3696 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3697 	if (ret) {
3698 		pr_err("Failed to allocate thread mask\n");
3699 		goto out_free_full_and_cpu_masks;
3700 	}
3701 
3702 	for (s = 0; s < nr_spec; s++) {
3703 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3704 		if (ret) {
3705 			pr_err("Failed to initialize maps thread mask\n");
3706 			goto out_free;
3707 		}
3708 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3709 		if (ret) {
3710 			pr_err("Failed to initialize affinity thread mask\n");
3711 			goto out_free;
3712 		}
3713 
3714 		/* ignore invalid CPUs but do not allow empty masks */
3715 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3716 				cpus_mask.bits, thread_mask.maps.nbits)) {
3717 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3718 			ret = -EINVAL;
3719 			goto out_free;
3720 		}
3721 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3722 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3723 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3724 			ret = -EINVAL;
3725 			goto out_free;
3726 		}
3727 
3728 		/* do not allow intersection with other masks (full_mask) */
3729 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3730 				      thread_mask.maps.nbits)) {
3731 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3732 			ret = -EINVAL;
3733 			goto out_free;
3734 		}
3735 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3736 				      thread_mask.affinity.nbits)) {
3737 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3738 			ret = -EINVAL;
3739 			goto out_free;
3740 		}
3741 
3742 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3743 			  thread_mask.maps.bits, full_mask.maps.nbits);
3744 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3745 			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3746 
3747 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3748 		if (!thread_masks) {
3749 			pr_err("Failed to reallocate thread masks\n");
3750 			ret = -ENOMEM;
3751 			goto out_free;
3752 		}
3753 		rec->thread_masks = thread_masks;
3754 		rec->thread_masks[t] = thread_mask;
3755 		if (verbose > 0) {
3756 			pr_debug("thread_masks[%d]: ", t);
3757 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3758 			pr_debug("thread_masks[%d]: ", t);
3759 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3760 		}
3761 		t++;
3762 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3763 		if (ret) {
3764 			pr_err("Failed to allocate thread mask\n");
3765 			goto out_free_full_and_cpu_masks;
3766 		}
3767 	}
3768 	rec->nr_threads = t;
3769 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3770 	if (!rec->nr_threads)
3771 		ret = -EINVAL;
3772 
3773 out_free:
3774 	record__thread_mask_free(&thread_mask);
3775 out_free_full_and_cpu_masks:
3776 	record__thread_mask_free(&full_mask);
3777 out_free_cpu_mask:
3778 	record__mmap_cpu_mask_free(&cpus_mask);
3779 
3780 	return ret;
3781 }
3782 
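/*
 * --threads=core: one thread per core; the core's CPU list serves as both
 * the maps and the affinity spec.
 */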
3783 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3784 {
3785 	int ret;
3786 	struct cpu_topology *topo;
3787 
3788 	topo = cpu_topology__new();
3789 	if (!topo) {
3790 		pr_err("Failed to allocate CPU topology\n");
3791 		return -ENOMEM;
3792 	}
3793 
3794 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3795 					     topo->core_cpus_list, topo->core_cpus_lists);
3796 	cpu_topology__delete(topo);
3797 
3798 	return ret;
3799 }
3800 
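/* --threads=package: one thread per processor package (socket). */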
3801 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3802 {
3803 	int ret;
3804 	struct cpu_topology *topo;
3805 
3806 	topo = cpu_topology__new();
3807 	if (!topo) {
3808 		pr_err("Failed to allocate CPU topology\n");
3809 		return -ENOMEM;
3810 	}
3811 
3812 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3813 					     topo->package_cpus_list, topo->package_cpus_lists);
3814 	cpu_topology__delete(topo);
3815 
3816 	return ret;
3817 }
3818 
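/* --threads=numa: one thread per NUMA node, using the node's CPU list for both masks. */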
3819 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3820 {
3821 	u32 s;
3822 	int ret;
3823 	const char **spec;
3824 	struct numa_topology *topo;
3825 
3826 	topo = numa_topology__new();
3827 	if (!topo) {
3828 		pr_err("Failed to allocate NUMA topology\n");
3829 		return -ENOMEM;
3830 	}
3831 
3832 	spec = zalloc(topo->nr * sizeof(char *));
3833 	if (!spec) {
3834 		pr_err("Failed to allocate NUMA spec\n");
3835 		ret = -ENOMEM;
3836 		goto out_delete_topo;
3837 	}
3838 	for (s = 0; s < topo->nr; s++)
3839 		spec[s] = topo->nodes[s].cpus;
3840 
3841 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3842 
3843 	zfree(&spec);
3844 
3845 out_delete_topo:
3846 	numa_topology__delete(topo);
3847 
3848 	return ret;
3849 }
3850 
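/*
 * --threads=<spec>: user defined masks, written as
 *
 *   <maps cpus 1>/<affinity cpus 1>:<maps cpus 2>/<affinity cpus 2>:...
 *
 * For example, --threads=0-3/0:4-7/4 streams CPUs 0-3 from a thread
 * allowed to run on CPU 0, and CPUs 4-7 from a thread allowed to run on
 * CPU 4.
 */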
3851 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3852 {
3853 	int t, ret;
3854 	u32 s, nr_spec = 0;
3855 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3856 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3857 
3858 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3859 		spec = strtok_r(user_spec, ":", &spec_ptr);
3860 		if (spec == NULL)
3861 			break;
3862 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3863 		mask = strtok_r(spec, "/", &mask_ptr);
3864 		if (mask == NULL)
3865 			break;
3866 		pr_debug2("  maps mask: %s\n", mask);
3867 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3868 		if (!tmp_spec) {
3869 			pr_err("Failed to reallocate maps spec\n");
3870 			ret = -ENOMEM;
3871 			goto out_free;
3872 		}
3873 		maps_spec = tmp_spec;
3874 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3875 		if (!maps_spec[nr_spec]) {
3876 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3877 			ret = -ENOMEM;
3878 			goto out_free;
3879 		}
3880 		mask = strtok_r(NULL, "/", &mask_ptr);
3881 		if (mask == NULL) {
3882 			pr_err("Invalid thread maps or affinity specs\n");
3883 			ret = -EINVAL;
3884 			goto out_free;
3885 		}
3886 		pr_debug2("  affinity mask: %s\n", mask);
3887 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3888 		if (!tmp_spec) {
3889 			pr_err("Failed to reallocate affinity spec\n");
3890 			ret = -ENOMEM;
3891 			goto out_free;
3892 		}
3893 		affinity_spec = tmp_spec;
3894 		affinity_spec[nr_spec] = strdup(mask);
3895 		if (!affinity_spec[nr_spec]) {
3896 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3897 			ret = -ENOMEM;
3898 			goto out_free;
3899 		}
3900 		dup_mask = NULL;
3901 		nr_spec++;
3902 	}
3903 
3904 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3905 					     (const char **)affinity_spec, nr_spec);
3906 
3907 out_free:
3908 	free(dup_mask);
3909 	for (s = 0; s < nr_spec; s++) {
3910 		if (maps_spec)
3911 			free(maps_spec[s]);
3912 		if (affinity_spec)
3913 			free(affinity_spec[s]);
3914 	}
3915 	free(affinity_spec);
3916 	free(maps_spec);
3917 
3918 	return ret;
3919 }
3920 
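/*
 * Default (non-parallel) mode: a single mask covering all recorded CPUs,
 * with the affinity mask left empty.
 */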
3921 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3922 {
3923 	int ret;
3924 
3925 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3926 	if (ret)
3927 		return ret;
3928 
3929 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3930 		return -ENODEV;
3931 
3932 	rec->nr_threads = 1;
3933 
3934 	return 0;
3935 }
3936 
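/* Pick the thread mask layout according to the --threads specification. */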
3937 static int record__init_thread_masks(struct record *rec)
3938 {
3939 	int ret = 0;
3940 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3941 
3942 	if (!record__threads_enabled(rec))
3943 		return record__init_thread_default_masks(rec, cpus);
3944 
3945 	if (evlist__per_thread(rec->evlist)) {
3946 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3947 		return -EINVAL;
3948 	}
3949 
3950 	switch (rec->opts.threads_spec) {
3951 	case THREAD_SPEC__CPU:
3952 		ret = record__init_thread_cpu_masks(rec, cpus);
3953 		break;
3954 	case THREAD_SPEC__CORE:
3955 		ret = record__init_thread_core_masks(rec, cpus);
3956 		break;
3957 	case THREAD_SPEC__PACKAGE:
3958 		ret = record__init_thread_package_masks(rec, cpus);
3959 		break;
3960 	case THREAD_SPEC__NUMA:
3961 		ret = record__init_thread_numa_masks(rec, cpus);
3962 		break;
3963 	case THREAD_SPEC__USER:
3964 		ret = record__init_thread_user_masks(rec, cpus);
3965 		break;
3966 	default:
3967 		break;
3968 	}
3969 
3970 	return ret;
3971 }
3972 
3973 int cmd_record(int argc, const char **argv)
3974 {
3975 	int err;
3976 	struct record *rec = &record;
3977 	char errbuf[BUFSIZ];
3978 
3979 	setlocale(LC_ALL, "");
3980 
3981 #ifndef HAVE_BPF_SKEL
3982 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3983 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3984 # undef set_nobuild
3985 #endif
3986 
3987 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3988 	/* Disable eager loading of kernel symbols, which adds overhead to perf record. */
3989 	rec->opts.affinity = PERF_AFFINITY_SYS;
3990 
3991 	rec->evlist = evlist__new();
3992 	if (rec->evlist == NULL)
3993 		return -ENOMEM;
3994 
3995 	err = perf_config(perf_record_config, rec);
3996 	if (err)
3997 		return err;
3998 
3999 	argc = parse_options(argc, argv, record_options, record_usage,
4000 			    PARSE_OPT_STOP_AT_NON_OPTION);
4001 	if (quiet)
4002 		perf_quiet_option();
4003 
4004 	err = symbol__validate_sym_arguments();
4005 	if (err)
4006 		return err;
4007 
4008 	perf_debuginfod_setup(&record.debuginfod);
4009 
4010 	/* Make system wide (-a) the default target. */
4011 	if (!argc && target__none(&rec->opts.target))
4012 		rec->opts.target.system_wide = true;
4013 
4014 	if (nr_cgroups && !rec->opts.target.system_wide) {
4015 		usage_with_options_msg(record_usage, record_options,
4016 			"cgroup monitoring only available in system-wide mode");
4017 	}
4019 
4020 	if (rec->buildid_mmap) {
4021 		if (!perf_can_record_build_id()) {
4022 			pr_err("Failed: kernel does not support recording build id in mmap events, update your kernel.\n");
4023 			err = -EINVAL;
4024 			goto out_opts;
4025 		}
4026 		pr_debug("Enabling build id in mmap2 events.\n");
4027 		/* Enable mmap build id synthesizing. */
4028 		symbol_conf.buildid_mmap2 = true;
4029 		/* Enable perf_event_attr::build_id bit. */
4030 		rec->opts.build_id = true;
4031 		/* Disable build id cache. */
4032 		rec->no_buildid = true;
4033 	}
4034 
4035 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4036 		pr_err("Kernel has no cgroup sampling support.\n");
4037 		err = -EINVAL;
4038 		goto out_opts;
4039 	}
4040 
4041 	if (rec->opts.kcore)
4042 		rec->opts.text_poke = true;
4043 
4044 	if (rec->opts.kcore || record__threads_enabled(rec))
4045 		rec->data.is_dir = true;
4046 
4047 	if (record__threads_enabled(rec)) {
4048 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4049 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4050 			goto out_opts;
4051 		}
4052 		if (record__aio_enabled(rec)) {
4053 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4054 			goto out_opts;
4055 		}
4056 	}
4057 
4058 	if (rec->opts.comp_level != 0) {
4059 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4060 		rec->no_buildid = true;
4061 	}
4062 
4063 	if (rec->opts.record_switch_events &&
4064 	    !perf_can_record_switch_events()) {
4065 		ui__error("kernel does not support recording context switch events\n");
4066 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4067 		err = -EINVAL;
4068 		goto out_opts;
4069 	}
4070 
4071 	if (switch_output_setup(rec)) {
4072 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4073 		err = -EINVAL;
4074 		goto out_opts;
4075 	}
4076 
4077 	if (rec->switch_output.time) {
4078 		signal(SIGALRM, alarm_sig_handler);
4079 		alarm(rec->switch_output.time);
4080 	}
4081 
4082 	if (rec->switch_output.num_files) {
4083 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4084 						      sizeof(char *));
4085 		if (!rec->switch_output.filenames) {
4086 			err = -EINVAL;
4087 			goto out_opts;
4088 		}
4089 	}
4090 
4091 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4092 		rec->timestamp_filename = false;
4093 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4094 	}
4095 
4096 	if (rec->filter_action) {
4097 		if (!strcmp(rec->filter_action, "pin"))
4098 			err = perf_bpf_filter__pin();
4099 		else if (!strcmp(rec->filter_action, "unpin"))
4100 			err = perf_bpf_filter__unpin();
4101 		else {
4102 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4103 			err = -EINVAL;
4104 		}
4105 		goto out_opts;
4106 	}
4107 
4108 	/*
4109 	 * Allow aliases to facilitate the lookup of symbols for address
4110 	 * filters. Refer to auxtrace_parse_filters().
4111 	 */
4112 	symbol_conf.allow_aliases = true;
4113 
4114 	symbol__init(NULL);
4115 
4116 	err = record__auxtrace_init(rec);
4117 	if (err)
4118 		goto out;
4119 
4120 	if (dry_run)
4121 		goto out;
4122 
4123 	err = -ENOMEM;
4124 
4125 	if (rec->no_buildid_cache || rec->no_buildid) {
4126 		disable_buildid_cache();
4127 	} else if (rec->switch_output.enabled) {
4128 		/*
4129 		 * In 'perf record --switch-output', disable buildid
4130 		 * generation by default to reduce data file switching
4131 		 * overhead. Still generate build-ids if they are explicitly
4132 		 * requested using
4133 		 *
4134 		 *  perf record --switch-output --no-no-buildid \
4135 		 *              --no-no-buildid-cache
4136 		 *
4137 		 * The following code is equivalent to:
4138 		 *
4139 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4140 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4141 		 *         disable_buildid_cache();
4142 		 */
4143 		bool disable = true;
4144 
4145 		if (rec->no_buildid_set && !rec->no_buildid)
4146 			disable = false;
4147 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4148 			disable = false;
4149 		if (disable) {
4150 			rec->no_buildid = true;
4151 			rec->no_buildid_cache = true;
4152 			disable_buildid_cache();
4153 		}
4154 	}
4155 
4156 	if (record.opts.overwrite)
4157 		record.opts.tail_synthesize = true;
4158 
4159 	if (rec->evlist->core.nr_entries == 0) {
4160 		err = parse_event(rec->evlist, "cycles:P");
4161 		if (err)
4162 			goto out;
4163 	}
4164 
4165 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4166 		rec->opts.no_inherit = true;
4167 
4168 	err = target__validate(&rec->opts.target);
4169 	if (err) {
4170 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4171 		ui__warning("%s\n", errbuf);
4172 	}
4173 
4174 	err = target__parse_uid(&rec->opts.target);
4175 	if (err) {
4176 		int saved_errno = errno;
4177 
4178 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4179 		ui__error("%s", errbuf);
4180 
4181 		err = -saved_errno;
4182 		goto out;
4183 	}
4184 
4185 	/* Enable ignoring missing threads when -u/-p option is defined. */
4186 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4187 
4188 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4189 
4190 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4191 		arch__add_leaf_frame_record_opts(&rec->opts);
4192 
4193 	err = -ENOMEM;
4194 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4195 		if (rec->opts.target.pid != NULL) {
4196 			pr_err("Couldn't create thread/CPU maps: %s\n",
4197 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4198 			goto out;
4199 		} else
4201 			usage_with_options(record_usage, record_options);
4202 	}
4203 
4204 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4205 	if (err)
4206 		goto out;
4207 
4208 	/*
4209 	 * Take all build-ids when the file contains AUX area tracing data,
4210 	 * because the trace is not decoded and so we cannot tell which DSOs
4211 	 * were hit; decoding it just for that would take too long.
4212 	 */
4213 	if (rec->opts.full_auxtrace)
4214 		rec->buildid_all = true;
4215 
4216 	if (rec->opts.text_poke) {
4217 		err = record__config_text_poke(rec->evlist);
4218 		if (err) {
4219 			pr_err("record__config_text_poke failed, error %d\n", err);
4220 			goto out;
4221 		}
4222 	}
4223 
4224 	if (rec->off_cpu) {
4225 		err = record__config_off_cpu(rec);
4226 		if (err) {
4227 			pr_err("record__config_off_cpu failed, error %d\n", err);
4228 			goto out;
4229 		}
4230 	}
4231 
4232 	if (record_opts__config(&rec->opts)) {
4233 		err = -EINVAL;
4234 		goto out;
4235 	}
4236 
4237 	err = record__config_tracking_events(rec);
4238 	if (err) {
4239 		pr_err("record__config_tracking_events failed, error %d\n", err);
4240 		goto out;
4241 	}
4242 
4243 	err = record__init_thread_masks(rec);
4244 	if (err) {
4245 		pr_err("Failed to initialize parallel data streaming masks\n");
4246 		goto out;
4247 	}
4248 
4249 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4250 		rec->opts.nr_cblocks = nr_cblocks_max;
4251 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4252 
4253 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4254 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4255 
4256 	if (rec->opts.comp_level > comp_level_max)
4257 		rec->opts.comp_level = comp_level_max;
4258 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4259 
4260 	err = __cmd_record(&record, argc, argv);
4261 out:
4262 	record__free_thread_masks(rec, rec->nr_threads);
4263 	rec->nr_threads = 0;
4264 	symbol__exit();
4265 	auxtrace_record__free(rec->itr);
4266 out_opts:
4267 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4268 	evlist__delete(rec->evlist);
4269 	return err;
4270 }
4271 
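/*
 * SIGUSR2 handler: take an AUX area tracing snapshot and, with
 * --switch-output=signal, rotate the output file.
 */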
4272 static void snapshot_sig_handler(int sig __maybe_unused)
4273 {
4274 	struct record *rec = &record;
4275 
4276 	hit_auxtrace_snapshot_trigger(rec);
4277 
4278 	if (switch_output_signal(rec))
4279 		trigger_hit(&switch_output_trigger);
4280 }
4281 
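/* SIGALRM handler: rotate the output file when the --switch-output time interval expires. */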
4282 static void alarm_sig_handler(int sig __maybe_unused)
4283 {
4284 	struct record *rec = &record;
4285 
4286 	if (switch_output_time(rec))
4287 		trigger_hit(&switch_output_trigger);
4288 }
4289