xref: /linux/tools/perf/builtin-record.c (revision 9eef41014fe01287dae79fe208b9b433b13040bb)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
84 struct switch_output {
85 	bool		 enabled;
86 	bool		 signal;
87 	unsigned long	 size;
88 	unsigned long	 time;
89 	const char	*str;
90 	bool		 set;
91 	char		 **filenames;
92 	int		 num_files;
93 	int		 cur_file;
94 };
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124 	THREAD_MSG__UNDEFINED = 0,
125 	THREAD_MSG__READY,
126 	THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130 	"UNDEFINED", "READY"
131 };
132 
133 enum thread_spec {
134 	THREAD_SPEC__UNDEFINED = 0,
135 	THREAD_SPEC__CPU,
136 	THREAD_SPEC__CORE,
137 	THREAD_SPEC__PACKAGE,
138 	THREAD_SPEC__NUMA,
139 	THREAD_SPEC__USER,
140 	THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 	"undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
147 struct pollfd_index_map {
148 	int evlist_pollfd_index;
149 	int thread_pollfd_index;
150 };
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199 	return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202 
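/*
 * Parallel (threaded) trace streaming is considered enabled whenever a
 * thread specification was parsed into opts.threads_spec.
 */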
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
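/*
 * Write @size bytes from @bf to the output: to the per-CPU file of @map in
 * threaded (directory) mode, otherwise to the single perf.data file. The
 * byte counters are updated accordingly, recording is stopped once the
 * configured output_max_size is exceeded, and the switch-output trigger is
 * hit when the switch_output.size threshold is reached.
 */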
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274 			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278 		void *buf, size_t size, off_t off)
279 {
280 	int rc;
281 
282 	cblock->aio_fildes = trace_fd;
283 	cblock->aio_buf    = buf;
284 	cblock->aio_nbytes = size;
285 	cblock->aio_offset = off;
286 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287 
288 	do {
289 		rc = aio_write(cblock);
290 		if (rc == 0) {
291 			break;
292 		} else if (errno != EAGAIN) {
293 			cblock->aio_fildes = -1;
294 			pr_err("failed to queue perf data, error: %m\n");
295 			break;
296 		}
297 	} while (1);
298 
299 	return rc;
300 }
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * The aio write request may need to be restarted with the
335 		 * remainder if the kernel didn't write the whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
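/*
 * Wait for in-flight aio write requests on @md. With sync_all == false,
 * return the index of the first control block that is free or has just
 * completed; with sync_all == true, keep suspending until all requests
 * have completed and then return -1.
 */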
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * The started aio write is not complete yet,
366 				 * so it has to be waited for before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
383 struct record_aio {
384 	struct record	*rec;
385 	void		*data;
386 	size_t		size;
387 };
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
395 	 * buffer to release space in the kernel buffer as fast as possible, by calling
396 	 * perf_mmap__consume() from the perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Copying can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of the data from map->start up to the upper bound and then the remainder
404 	 * from the beginning of the kernel buffer to the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 						   mmap__mmap_len(map) - aio->size,
410 						   buf, size);
411 		if (compressed < 0)
412 			return (int)compressed;
413 
414 		size = compressed;
415 	} else {
416 		memcpy(aio->data + aio->size, buf, size);
417 	}
418 
419 	if (!aio->size) {
420 		/*
421 		 * Increment map->refcount to guard the map->aio.data[] buffer
422 		 * from premature deallocation, because the map object can be
423 		 * released before the aio write request started on the
424 		 * map->aio.data[] buffer is complete.
425 		 *
426 		 * perf_mmap__put() is done at record__aio_complete()
427 		 * after the started aio request completes, or at record__aio_push()
428 		 * if the request failed to start.
429 		 */
430 		perf_mmap__get(&map->core);
431 	}
432 
433 	aio->size += size;
434 
435 	return size;
436 }
437 
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440 	int ret, idx;
441 	int trace_fd = rec->session->data->file.fd;
442 	struct record_aio aio = { .rec = rec, .size = 0 };
443 
444 	/*
445 	 * Call record__aio_sync() to wait until the map->aio.data[] buffer
446 	 * becomes available after the previous aio write operation.
447 	 */
448 
449 	idx = record__aio_sync(map, false);
450 	aio.data = map->aio.data[idx];
451 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 		return ret;
454 
455 	rec->samples++;
456 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457 	if (!ret) {
458 		*off += aio.size;
459 		rec->bytes_written += aio.size;
460 		if (switch_output_size(rec))
461 			trigger_hit(&switch_output_trigger);
462 	} else {
463 		/*
464 		 * Decrement the map->refcount incremented in record__aio_pushfn()
465 		 * if the record__aio_write() operation failed to start; otherwise
466 		 * map->refcount is decremented in record__aio_complete() after
467 		 * the aio write operation finishes successfully.
468 		 */
469 		perf_mmap__put(&map->core);
470 	}
471 
472 	return ret;
473 }
474 
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477 	return lseek(trace_fd, 0, SEEK_CUR);
478 }
479 
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482 	lseek(trace_fd, pos, SEEK_SET);
483 }
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487 	int i;
488 	struct evlist *evlist = rec->evlist;
489 	struct mmap *maps = evlist->mmap;
490 
491 	if (!record__aio_enabled(rec))
492 		return;
493 
494 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
495 		struct mmap *map = &maps[i];
496 
497 		if (map->core.base)
498 			record__aio_sync(map, true);
499 	}
500 }
501 
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506 			     const char *str,
507 			     int unset)
508 {
509 	struct record_opts *opts = (struct record_opts *)opt->value;
510 
511 	if (unset) {
512 		opts->nr_cblocks = 0;
513 	} else {
514 		if (str)
515 			opts->nr_cblocks = strtol(str, NULL, 0);
516 		if (!opts->nr_cblocks)
517 			opts->nr_cblocks = nr_cblocks_default;
518 	}
519 
520 	return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524 
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526 			    off_t *off __maybe_unused)
527 {
528 	return -1;
529 }
530 
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533 	return -1;
534 }
535 
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539 
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547 	return rec->opts.nr_cblocks > 0;
548 }
549 
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552 				    const char *str,
553 				    int unset)
554 {
555 	int flush_max;
556 	struct record_opts *opts = (struct record_opts *)opt->value;
557 	static struct parse_tag tags[] = {
558 			{ .tag  = 'B', .mult = 1       },
559 			{ .tag  = 'K', .mult = 1 << 10 },
560 			{ .tag  = 'M', .mult = 1 << 20 },
561 			{ .tag  = 'G', .mult = 1 << 30 },
562 			{ .tag  = 0 },
563 	};
564 
565 	if (unset)
566 		return 0;
567 
568 	if (str) {
569 		opts->mmap_flush = parse_tag_value(str, tags);
570 		if (opts->mmap_flush == (int)-1)
571 			opts->mmap_flush = strtol(str, NULL, 0);
572 	}
573 
574 	if (!opts->mmap_flush)
575 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577 	flush_max = evlist__mmap_size(opts->mmap_pages);
578 	flush_max /= 4;
579 	if (opts->mmap_flush > flush_max)
580 		opts->mmap_flush = flush_max;
581 
582 	return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590 	struct record_opts *opts = opt->value;
591 
592 	if (unset) {
593 		opts->comp_level = 0;
594 	} else {
595 		if (str)
596 			opts->comp_level = strtol(str, NULL, 0);
597 		if (!opts->comp_level)
598 			opts->comp_level = comp_level_default;
599 	}
600 
601 	return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608 	return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	struct record *rec = container_of(tool, struct record, tool);
617 	return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623 				     union perf_event *event,
624 				     struct perf_sample *sample __maybe_unused,
625 				     struct machine *machine __maybe_unused)
626 {
627 	int ret;
628 
629 	mutex_lock(&synth_lock);
630 	ret = process_synthesized_event(tool, event, sample, machine);
631 	mutex_unlock(&synth_lock);
632 	return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637 	struct record *rec = to;
638 
639 	if (record__comp_enabled(rec)) {
640 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
641 						   mmap__mmap_len(map), bf, size);
642 
643 		if (compressed < 0)
644 			return (int)compressed;
645 
646 		size = compressed;
647 		bf   = map->data;
648 	}
649 
650 	thread->samples++;
651 	return record__write(rec, map, bf, size);
652 }
653 
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659 
660 static void sig_handler(int sig)
661 {
662 	if (sig == SIGCHLD)
663 		child_finished = 1;
664 	else
665 		signr = sig;
666 
667 	done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 	if (done_fd >= 0) {
670 		u64 tmp = 1;
671 		int orig_errno = errno;
672 
673 		/*
674 		 * It is possible for this signal handler to run after done is
675 		 * checked in the main loop, but before the perf counter fds are
676 		 * polled. If this happens, the poll() will continue to wait
677 		 * even though done is set, and will only break out if either
678 		 * another signal is received, or the counters are ready for
679 		 * read. To ensure the poll() doesn't sleep when done is set,
680 		 * use an eventfd (done_fd) to wake up the poll().
681 		 */
682 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683 			pr_err("failed to signal wakeup fd, error: %m\n");
684 
685 		errno = orig_errno;
686 	}
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
690 static void sigsegv_handler(int sig)
691 {
692 	perf_hooks__recover();
693 	sighandler_dump_stack(sig);
694 }
695 
696 static void record__sig_exit(void)
697 {
698 	if (signr == -1)
699 		return;
700 
701 	signal(signr, SIG_DFL);
702 	raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708 				    struct mmap *map,
709 				    union perf_event *event, void *data1,
710 				    size_t len1, void *data2, size_t len2)
711 {
712 	struct record *rec = container_of(tool, struct record, tool);
713 	struct perf_data *data = &rec->data;
714 	size_t padding;
715 	u8 pad[8] = {0};
716 
717 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718 		off_t file_offset;
719 		int fd = perf_data__fd(data);
720 		int err;
721 
722 		file_offset = lseek(fd, 0, SEEK_CUR);
723 		if (file_offset == -1)
724 			return -1;
725 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 						     event, file_offset);
727 		if (err)
728 			return err;
729 	}
730 
731 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732 	padding = (len1 + len2) & 7;
733 	if (padding)
734 		padding = 8 - padding;
735 
736 	record__write(rec, map, event, event->header.size);
737 	record__write(rec, map, data1, len1);
738 	if (len2)
739 		record__write(rec, map, data2, len2);
740 	record__write(rec, map, &pad, padding);
741 
742 	return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746 				      struct mmap *map)
747 {
748 	int ret;
749 
750 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751 				  record__process_auxtrace);
752 	if (ret < 0)
753 		return ret;
754 
755 	if (ret)
756 		rec->samples++;
757 
758 	return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762 					       struct mmap *map)
763 {
764 	int ret;
765 
766 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767 					   record__process_auxtrace,
768 					   rec->opts.auxtrace_snapshot_size);
769 	if (ret < 0)
770 		return ret;
771 
772 	if (ret)
773 		rec->samples++;
774 
775 	return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780 	int i;
781 	int rc = 0;
782 
783 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784 		struct mmap *map = &rec->evlist->mmap[i];
785 
786 		if (!map->auxtrace_mmap.base)
787 			continue;
788 
789 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790 			rc = -1;
791 			goto out;
792 		}
793 	}
794 out:
795 	return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800 	pr_debug("Recording AUX area tracing snapshot\n");
801 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
802 		trigger_error(&auxtrace_snapshot_trigger);
803 	} else {
804 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805 			trigger_error(&auxtrace_snapshot_trigger);
806 		else
807 			trigger_ready(&auxtrace_snapshot_trigger);
808 	}
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return 0;
815 
816 	if (!auxtrace_record__snapshot_started &&
817 	    auxtrace_record__snapshot_start(rec->itr))
818 		return -1;
819 
820 	record__read_auxtrace_snapshot(rec, true);
821 	if (trigger_is_error(&auxtrace_snapshot_trigger))
822 		return -1;
823 
824 	return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829 	int err;
830 
831 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832 	    && record__threads_enabled(rec)) {
833 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834 		return -EINVAL;
835 	}
836 
837 	if (!rec->itr) {
838 		rec->itr = auxtrace_record__init(rec->evlist, &err);
839 		if (err)
840 			return err;
841 	}
842 
843 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844 					      rec->opts.auxtrace_snapshot_opts);
845 	if (err)
846 		return err;
847 
848 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849 					    rec->opts.auxtrace_sample_opts);
850 	if (err)
851 		return err;
852 
853 	auxtrace_regroup_aux_output(rec->evlist);
854 
855 	return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862 			       struct mmap *map __maybe_unused)
863 {
864 	return 0;
865 }
866 
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869 				    bool on_exit __maybe_unused)
870 {
871 }
872 
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876 	return 0;
877 }
878 
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887 	return 0;
888 }
889 
890 #endif
891 
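/*
 * Make sure kernel text modifications are captured: if no event has
 * attr.text_poke set yet, add a dummy event on all CPUs with text_poke
 * and ksymbol enabled, opened immediately and sampling time.
 */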
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If a non-dummy evsel exists, system_wide sideband is needed to
926 	 * help parse sample information.
927 	 * For example, the PERF_EVENT_MMAP event helps parse symbols,
928 	 * and the PERF_EVENT_COMM event helps parse the task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * a tracking event so that we can track PERF_RECORD_MMAP to cover the
948 	 * delay of waiting or of event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 
953 		/*
954 		 * User space tasks can migrate between CPUs, so when tracing
955 		 * selected CPUs, sideband for all CPUs is still needed.
956 		 */
957 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958 			system_wide = true;
959 
960 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
961 		if (!evsel)
962 			return -ENOMEM;
963 
964 		/*
965 		 * Enable the tracking event when the process is forked for
966 		 * initial_delay, immediately for system wide.
967 		 */
968 		if (opts->target.initial_delay && !evsel->immediate &&
969 		    !target__has_cpu(&opts->target))
970 			evsel->core.attr.enable_on_exec = 1;
971 		else
972 			evsel->immediate = 1;
973 	}
974 
975 	return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980 	char kcore[PATH_MAX];
981 	int fd;
982 
983 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985 	fd = open(kcore, O_RDONLY);
986 	if (fd < 0)
987 		return false;
988 
989 	close(fd);
990 
991 	return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996 	char from_dir[PATH_MAX];
997 	char kcore_dir[PATH_MAX];
998 	int ret;
999 
1000 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003 	if (ret)
1004 		return ret;
1005 
1006 	return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011 	thread_data->pipes.msg[0] = -1;
1012 	thread_data->pipes.msg[1] = -1;
1013 	thread_data->pipes.ack[0] = -1;
1014 	thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019 	if (pipe(thread_data->pipes.msg))
1020 		return -EINVAL;
1021 
1022 	if (pipe(thread_data->pipes.ack)) {
1023 		close(thread_data->pipes.msg[0]);
1024 		thread_data->pipes.msg[0] = -1;
1025 		close(thread_data->pipes.msg[1]);
1026 		thread_data->pipes.msg[1] = -1;
1027 		return -EINVAL;
1028 	}
1029 
1030 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034 	return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039 	if (thread_data->pipes.msg[0] != -1) {
1040 		close(thread_data->pipes.msg[0]);
1041 		thread_data->pipes.msg[0] = -1;
1042 	}
1043 	if (thread_data->pipes.msg[1] != -1) {
1044 		close(thread_data->pipes.msg[1]);
1045 		thread_data->pipes.msg[1] = -1;
1046 	}
1047 	if (thread_data->pipes.ack[0] != -1) {
1048 		close(thread_data->pipes.ack[0]);
1049 		thread_data->pipes.ack[0] = -1;
1050 	}
1051 	if (thread_data->pipes.ack[1] != -1) {
1052 		close(thread_data->pipes.ack[1]);
1053 		thread_data->pipes.ack[1] = -1;
1054 	}
1055 }
1056 
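/*
 * Buffers are mapped per thread rather than per CPU when the user requested
 * CPU map is the dummy map, e.g. in --per-thread mode.
 */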
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
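/*
 * Distribute the evlist mmaps among the recording threads: a thread gets
 * every mmap whose CPU is set in its maps mask, or all of them in the
 * per-thread case.
 */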
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065 	struct mmap *mmap = evlist->mmap;
1066 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068 	bool per_thread = evlist__per_thread(evlist);
1069 
1070 	if (per_thread)
1071 		thread_data->nr_mmaps = nr_mmaps;
1072 	else
1073 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074 						      thread_data->mask->maps.nbits);
1075 	if (mmap) {
1076 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077 		if (!thread_data->maps)
1078 			return -ENOMEM;
1079 	}
1080 	if (overwrite_mmap) {
1081 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082 		if (!thread_data->overwrite_maps) {
1083 			zfree(&thread_data->maps);
1084 			return -ENOMEM;
1085 		}
1086 	}
1087 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091 		if (per_thread ||
1092 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093 			if (thread_data->maps) {
1094 				thread_data->maps[tm] = &mmap[m];
1095 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097 			}
1098 			if (thread_data->overwrite_maps) {
1099 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102 			}
1103 			tm++;
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
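/*
 * Build the thread's private pollfd array by duplicating, from the evlist
 * pollfd, the entries whose private pointer refers to one of the mmaps
 * assigned to this thread.
 */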
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112 	int f, tm, pos;
1113 	struct mmap *map, *overwrite_map;
1114 
1115 	fdarray__init(&thread_data->pollfd, 64);
1116 
1117 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119 		overwrite_map = thread_data->overwrite_maps ?
1120 				thread_data->overwrite_maps[tm] : NULL;
1121 
1122 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127 							      &evlist->core.pollfd);
1128 				if (pos < 0)
1129 					return pos;
1130 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132 			}
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141 	int t;
1142 	struct record_thread *thread_data = rec->thread_data;
1143 
1144 	if (thread_data == NULL)
1145 		return;
1146 
1147 	for (t = 0; t < rec->nr_threads; t++) {
1148 		record__thread_data_close_pipes(&thread_data[t]);
1149 		zfree(&thread_data[t].maps);
1150 		zfree(&thread_data[t].overwrite_maps);
1151 		fdarray__exit(&thread_data[t].pollfd);
1152 	}
1153 
1154 	zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158 						    int evlist_pollfd_index,
1159 						    int thread_pollfd_index)
1160 {
1161 	size_t x = rec->index_map_cnt;
1162 
1163 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164 		return -ENOMEM;
1165 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167 	rec->index_map_cnt += 1;
1168 	return 0;
1169 }
1170 
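/*
 * Copy revents collected by the main recording thread back into the evlist
 * pollfd entries, using the index map built by
 * record__map_thread_evlist_pollfd_indexes().
 */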
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172 						    struct evlist *evlist,
1173 						    struct record_thread *thread_data)
1174 {
1175 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1176 	struct pollfd *t_entries = thread_data->pollfd.entries;
1177 	int err = 0;
1178 	size_t i;
1179 
1180 	for (i = 0; i < rec->index_map_cnt; i++) {
1181 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1182 		int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1186 			pr_err("Thread and evlist pollfd index mismatch\n");
1187 			err = -EINVAL;
1188 			continue;
1189 		}
1190 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1191 	}
1192 	return err;
1193 }
1194 
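/*
 * Duplicate the non-perf-event descriptors (e.g. control fds) from the
 * evlist pollfd into the main thread's pollfd and remember the evlist <->
 * thread index mapping so their revents can be propagated back later.
 */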
1195 static int record__dup_non_perf_events(struct record *rec,
1196 				       struct evlist *evlist,
1197 				       struct record_thread *thread_data)
1198 {
1199 	struct fdarray *fda = &evlist->core.pollfd;
1200 	int i, ret;
1201 
1202 	for (i = 0; i < fda->nr; i++) {
1203 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204 			continue;
1205 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206 		if (ret < 0) {
1207 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208 			return ret;
1209 		}
1210 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211 			  thread_data, ret, fda->entries[i].fd);
1212 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213 		if (ret < 0) {
1214 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1215 			return ret;
1216 		}
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223 	int t, ret;
1224 	struct record_thread *thread_data;
1225 
1226 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227 	if (!rec->thread_data) {
1228 		pr_err("Failed to allocate thread data\n");
1229 		return -ENOMEM;
1230 	}
1231 	thread_data = rec->thread_data;
1232 
1233 	for (t = 0; t < rec->nr_threads; t++)
1234 		record__thread_data_init_pipes(&thread_data[t]);
1235 
1236 	for (t = 0; t < rec->nr_threads; t++) {
1237 		thread_data[t].rec = rec;
1238 		thread_data[t].mask = &rec->thread_masks[t];
1239 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240 		if (ret) {
1241 			pr_err("Failed to initialize thread[%d] maps\n", t);
1242 			goto out_free;
1243 		}
1244 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245 		if (ret) {
1246 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247 			goto out_free;
1248 		}
1249 		if (t) {
1250 			thread_data[t].tid = -1;
1251 			ret = record__thread_data_open_pipes(&thread_data[t]);
1252 			if (ret) {
1253 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1254 				goto out_free;
1255 			}
1256 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258 			if (ret < 0) {
1259 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260 				goto out_free;
1261 			}
1262 			thread_data[t].ctlfd_pos = ret;
1263 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264 				 thread_data, thread_data[t].ctlfd_pos,
1265 				 thread_data[t].pipes.msg[0]);
1266 		} else {
1267 			thread_data[t].tid = gettid();
1268 
1269 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270 			if (ret < 0)
1271 				goto out_free;
1272 
1273 			thread_data[t].ctlfd_pos = -1; /* Not used */
1274 		}
1275 	}
1276 
1277 	return 0;
1278 
1279 out_free:
1280 	record__free_thread_data(rec);
1281 
1282 	return ret;
1283 }
1284 
1285 static int record__mmap_evlist(struct record *rec,
1286 			       struct evlist *evlist)
1287 {
1288 	int i, ret;
1289 	struct record_opts *opts = &rec->opts;
1290 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291 				  opts->auxtrace_sample_mode;
1292 	char msg[512];
1293 
1294 	if (opts->affinity != PERF_AFFINITY_SYS)
1295 		cpu__setup_cpunode_map();
1296 
1297 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298 				 opts->auxtrace_mmap_pages,
1299 				 auxtrace_overwrite,
1300 				 opts->nr_cblocks, opts->affinity,
1301 				 opts->mmap_flush, opts->comp_level) < 0) {
1302 		if (errno == EPERM) {
1303 			pr_err("Permission error mapping pages.\n"
1304 			       "Consider increasing "
1305 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1307 			       "(current value: %u,%u)\n",
1308 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1309 			return -errno;
1310 		} else {
1311 			pr_err("failed to mmap with %d (%s)\n", errno,
1312 				str_error_r(errno, msg, sizeof(msg)));
1313 			if (errno)
1314 				return -errno;
1315 			else
1316 				return -EINVAL;
1317 		}
1318 	}
1319 
1320 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321 		return -1;
1322 
1323 	ret = record__alloc_thread_data(rec, evlist);
1324 	if (ret)
1325 		return ret;
1326 
1327 	if (record__threads_enabled(rec)) {
1328 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329 		if (ret) {
1330 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331 			return ret;
1332 		}
1333 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334 			if (evlist->mmap)
1335 				evlist->mmap[i].file = &rec->data.dir.files[i];
1336 			if (evlist->overwrite_mmap)
1337 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338 		}
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346 	return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
1349 static int record__open(struct record *rec)
1350 {
1351 	char msg[BUFSIZ];
1352 	struct evsel *pos;
1353 	struct evlist *evlist = rec->evlist;
1354 	struct perf_session *session = rec->session;
1355 	struct record_opts *opts = &rec->opts;
1356 	int rc = 0;
1357 
1358 	evlist__config(evlist, opts, &callchain_param);
1359 
1360 	evlist__for_each_entry(evlist, pos) {
1361 try_again:
1362 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1363 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1364 				if (verbose > 0)
1365 					ui__warning("%s\n", msg);
1366 				goto try_again;
1367 			}
1368 			if ((errno == EINVAL || errno == EBADF) &&
1369 			    pos->core.leader != &pos->core &&
1370 			    pos->weak_group) {
1371 				pos = evlist__reset_weak_group(evlist, pos, true);
1372 				goto try_again;
1373 			}
1374 			rc = -errno;
1375 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1376 			ui__error("%s\n", msg);
1377 			goto out;
1378 		}
1379 
1380 		pos->supported = true;
1381 	}
1382 
1383 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1384 		pr_warning(
1385 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1386 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1387 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1388 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1389 "Samples in kernel modules won't be resolved at all.\n\n"
1390 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1391 "even with a suitable vmlinux or kallsyms file.\n\n");
1392 	}
1393 
1394 	if (evlist__apply_filters(evlist, &pos)) {
1395 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1396 			pos->filter ?: "BPF", evsel__name(pos), errno,
1397 			str_error_r(errno, msg, sizeof(msg)));
1398 		rc = -1;
1399 		goto out;
1400 	}
1401 
1402 	rc = record__mmap(rec);
1403 	if (rc)
1404 		goto out;
1405 
1406 	session->evlist = evlist;
1407 	perf_session__set_id_hdr_size(session);
1408 out:
1409 	return rc;
1410 }
1411 
1412 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1413 {
1414 	if (rec->evlist->first_sample_time == 0)
1415 		rec->evlist->first_sample_time = sample_time;
1416 
1417 	if (sample_time)
1418 		rec->evlist->last_sample_time = sample_time;
1419 }
1420 
1421 static int process_sample_event(struct perf_tool *tool,
1422 				union perf_event *event,
1423 				struct perf_sample *sample,
1424 				struct evsel *evsel,
1425 				struct machine *machine)
1426 {
1427 	struct record *rec = container_of(tool, struct record, tool);
1428 
1429 	set_timestamp_boundary(rec, sample->time);
1430 
1431 	if (rec->buildid_all)
1432 		return 0;
1433 
1434 	rec->samples++;
1435 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1436 }
1437 
1438 static int process_buildids(struct record *rec)
1439 {
1440 	struct perf_session *session = rec->session;
1441 
1442 	if (perf_data__size(&rec->data) == 0)
1443 		return 0;
1444 
1445 	/*
1446 	 * During this process, it'll load the kernel map and replace the
1447 	 * dso->long_name with a real pathname it found.  In this case
1448 	 * we prefer the vmlinux path like
1449 	 *   /lib/modules/3.16.4/build/vmlinux
1450 	 *
1451 	 * rather than build-id path (in debug directory).
1452 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1453 	 */
1454 	symbol_conf.ignore_vmlinux_buildid = true;
1455 
1456 	/*
1457 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1458 	 * so there is no need to process samples. But if timestamp_boundary is enabled,
1459 	 * it still needs to walk all samples to get the timestamps of the
1460 	 * first/last samples.
1461 	 */
1462 	if (rec->buildid_all && !rec->timestamp_boundary)
1463 		rec->tool.sample = NULL;
1464 
1465 	return perf_session__process_events(session);
1466 }
1467 
1468 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1469 {
1470 	int err;
1471 	struct perf_tool *tool = data;
1472 	/*
1473 	 * As for the guest kernel, when processing the record & report subcommands,
1474 	 * we arrange the module mmap prior to the guest kernel mmap and trigger
1475 	 * a DSO preload, because by default guest module symbols are loaded
1476 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1477 	 * method avoids missing symbols when the first address is
1478 	 * in a module instead of in the guest kernel.
1479 	 */
1480 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1481 					     machine);
1482 	if (err < 0)
1483 		pr_err("Couldn't record guest kernel [%d]'s reference"
1484 		       " relocation symbol.\n", machine->pid);
1485 
1486 	/*
1487 	 * We use _stext for the guest kernel because the guest kernel's /proc/kallsyms
1488 	 * sometimes has no _text.
1489 	 */
1490 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1491 						 machine);
1492 	if (err < 0)
1493 		pr_err("Couldn't record guest kernel [%d]'s reference"
1494 		       " relocation symbol.\n", machine->pid);
1495 }
1496 
1497 static struct perf_event_header finished_round_event = {
1498 	.size = sizeof(struct perf_event_header),
1499 	.type = PERF_RECORD_FINISHED_ROUND,
1500 };
1501 
1502 static struct perf_event_header finished_init_event = {
1503 	.size = sizeof(struct perf_event_header),
1504 	.type = PERF_RECORD_FINISHED_INIT,
1505 };
1506 
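/*
 * When the affinity mode is not "sys", migrate the current recording thread
 * onto the CPUs backing @map (its affinity mask) before reading it, updating
 * the thread's affinity mask only when it actually changes.
 */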
1507 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1508 {
1509 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1510 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1511 			  thread->mask->affinity.nbits)) {
1512 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1513 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1514 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1515 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1516 					(cpu_set_t *)thread->mask->affinity.bits);
1517 		if (verbose == 2) {
1518 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1519 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1520 		}
1521 	}
1522 }
1523 
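/*
 * Callback passed to zstd_compress_stream_to_records(): with a zero
 * increment it initializes a PERF_RECORD_COMPRESSED header and returns its
 * size, otherwise it grows the header size by the produced increment.
 */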
1524 static size_t process_comp_header(void *record, size_t increment)
1525 {
1526 	struct perf_record_compressed *event = record;
1527 	size_t size = sizeof(*event);
1528 
1529 	if (increment) {
1530 		event->header.size += increment;
1531 		return increment;
1532 	}
1533 
1534 	event->header.type = PERF_RECORD_COMPRESSED;
1535 	event->header.size = size;
1536 
1537 	return size;
1538 }
1539 
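/*
 * Compress @src into @dst as PERF_RECORD_COMPRESSED records, using the
 * per-mmap zstd state in threaded mode (map->file set) and the session-wide
 * state otherwise, and account transferred/compressed bytes accordingly.
 */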
1540 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1541 			    void *dst, size_t dst_size, void *src, size_t src_size)
1542 {
1543 	ssize_t compressed;
1544 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1545 	struct zstd_data *zstd_data = &session->zstd_data;
1546 
1547 	if (map && map->file)
1548 		zstd_data = &map->zstd_data;
1549 
1550 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1551 						     max_record_size, process_comp_header);
1552 	if (compressed < 0)
1553 		return compressed;
1554 
1555 	if (map && map->file) {
1556 		thread->bytes_transferred += src_size;
1557 		thread->bytes_compressed  += compressed;
1558 	} else {
1559 		session->bytes_transferred += src_size;
1560 		session->bytes_compressed  += compressed;
1561 	}
1562 
1563 	return compressed;
1564 }
1565 
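/*
 * Drain this thread's (overwrite) mmaps, pushing the data either directly
 * via record__pushfn() or through the AIO path. With @synch the flush
 * threshold is temporarily forced to 1 so everything is flushed. AUX area
 * data is read as well, and in non-threaded mode a FINISHED_ROUND event is
 * emitted if anything was written.
 */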
1566 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1567 				    bool overwrite, bool synch)
1568 {
1569 	u64 bytes_written = rec->bytes_written;
1570 	int i;
1571 	int rc = 0;
1572 	int nr_mmaps;
1573 	struct mmap **maps;
1574 	int trace_fd = rec->data.file.fd;
1575 	off_t off = 0;
1576 
1577 	if (!evlist)
1578 		return 0;
1579 
1580 	nr_mmaps = thread->nr_mmaps;
1581 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1582 
1583 	if (!maps)
1584 		return 0;
1585 
1586 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1587 		return 0;
1588 
1589 	if (record__aio_enabled(rec))
1590 		off = record__aio_get_pos(trace_fd);
1591 
1592 	for (i = 0; i < nr_mmaps; i++) {
1593 		u64 flush = 0;
1594 		struct mmap *map = maps[i];
1595 
1596 		if (map->core.base) {
1597 			record__adjust_affinity(rec, map);
1598 			if (synch) {
1599 				flush = map->core.flush;
1600 				map->core.flush = 1;
1601 			}
1602 			if (!record__aio_enabled(rec)) {
1603 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1604 					if (synch)
1605 						map->core.flush = flush;
1606 					rc = -1;
1607 					goto out;
1608 				}
1609 			} else {
1610 				if (record__aio_push(rec, map, &off) < 0) {
1611 					record__aio_set_pos(trace_fd, off);
1612 					if (synch)
1613 						map->core.flush = flush;
1614 					rc = -1;
1615 					goto out;
1616 				}
1617 			}
1618 			if (synch)
1619 				map->core.flush = flush;
1620 		}
1621 
1622 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1623 		    !rec->opts.auxtrace_sample_mode &&
1624 		    record__auxtrace_mmap_read(rec, map) != 0) {
1625 			rc = -1;
1626 			goto out;
1627 		}
1628 	}
1629 
1630 	if (record__aio_enabled(rec))
1631 		record__aio_set_pos(trace_fd, off);
1632 
1633 	/*
1634 	 * Mark the round finished in case we wrote
1635 	 * at least one event.
1636 	 *
1637 	 * No need for round events in directory mode,
1638 	 * because per-cpu maps and files have data
1639 	 * sorted by the kernel.
1640 	 */
1641 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1642 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1643 
1644 	if (overwrite)
1645 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1646 out:
1647 	return rc;
1648 }
1649 
1650 static int record__mmap_read_all(struct record *rec, bool synch)
1651 {
1652 	int err;
1653 
1654 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1655 	if (err)
1656 		return err;
1657 
1658 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1659 }
1660 
1661 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1662 					   void *arg __maybe_unused)
1663 {
1664 	struct perf_mmap *map = fda->priv[fd].ptr;
1665 
1666 	if (map)
1667 		perf_mmap__put(map);
1668 }
1669 
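/*
 * Body of a parallel recording thread: acknowledge start over the ack pipe,
 * then loop draining the thread's mmaps and polling its descriptors until
 * the message pipe is closed (POLLHUP), do a final synchronous flush and
 * acknowledge termination.
 */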
1670 static void *record__thread(void *arg)
1671 {
1672 	enum thread_msg msg = THREAD_MSG__READY;
1673 	bool terminate = false;
1674 	struct fdarray *pollfd;
1675 	int err, ctlfd_pos;
1676 
1677 	thread = arg;
1678 	thread->tid = gettid();
1679 
1680 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681 	if (err == -1)
1682 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1683 			   thread->tid, strerror(errno));
1684 
1685 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1686 
1687 	pollfd = &thread->pollfd;
1688 	ctlfd_pos = thread->ctlfd_pos;
1689 
1690 	for (;;) {
1691 		unsigned long long hits = thread->samples;
1692 
1693 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1694 			break;
1695 
1696 		if (hits == thread->samples) {
1697 
1698 			err = fdarray__poll(pollfd, -1);
1699 			/*
1700 			 * Propagate the error only if there is one. Ignore a positive
1701 			 * number of returned events and interrupt errors.
1702 			 */
1703 			if (err > 0 || (err < 0 && errno == EINTR))
1704 				err = 0;
1705 			thread->waking++;
1706 
1707 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1708 					    record__thread_munmap_filtered, NULL) == 0)
1709 				break;
1710 		}
1711 
1712 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1713 			terminate = true;
1714 			close(thread->pipes.msg[0]);
1715 			thread->pipes.msg[0] = -1;
1716 			pollfd->entries[ctlfd_pos].fd = -1;
1717 			pollfd->entries[ctlfd_pos].events = 0;
1718 		}
1719 
1720 		pollfd->entries[ctlfd_pos].revents = 0;
1721 	}
1722 	record__mmap_read_all(thread->rec, true);
1723 
1724 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1725 	if (err == -1)
1726 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1727 			   thread->tid, strerror(errno));
1728 
1729 	return NULL;
1730 }
1731 
1732 static void record__init_features(struct record *rec)
1733 {
1734 	struct perf_session *session = rec->session;
1735 	int feat;
1736 
1737 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1738 		perf_header__set_feat(&session->header, feat);
1739 
1740 	if (rec->no_buildid)
1741 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1742 
1743 #ifdef HAVE_LIBTRACEEVENT
1744 	if (!have_tracepoints(&rec->evlist->core.entries))
1745 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1746 #endif
1747 
1748 	if (!rec->opts.branch_stack)
1749 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1750 
1751 	if (!rec->opts.full_auxtrace)
1752 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1753 
1754 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1755 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1756 
1757 	if (!rec->opts.use_clockid)
1758 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1759 
1760 	if (!record__threads_enabled(rec))
1761 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1762 
1763 	if (!record__comp_enabled(rec))
1764 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1765 
1766 	perf_header__clear_feat(&session->header, HEADER_STAT);
1767 }
1768 
1769 static void
1770 record__finish_output(struct record *rec)
1771 {
1772 	int i;
1773 	struct perf_data *data = &rec->data;
1774 	int fd = perf_data__fd(data);
1775 
1776 	if (data->is_pipe)
1777 		return;
1778 
1779 	rec->session->header.data_size += rec->bytes_written;
1780 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1781 	if (record__threads_enabled(rec)) {
1782 		for (i = 0; i < data->dir.nr; i++)
1783 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1784 	}
1785 
1786 	if (!rec->no_buildid) {
1787 		process_buildids(rec);
1788 
1789 		if (rec->buildid_all)
1790 			dsos__hit_all(rec->session);
1791 	}
1792 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1793 
1794 	return;
1795 }
1796 
1797 static int record__synthesize_workload(struct record *rec, bool tail)
1798 {
1799 	int err;
1800 	struct perf_thread_map *thread_map;
1801 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1802 
1803 	if (rec->opts.tail_synthesize != tail)
1804 		return 0;
1805 
1806 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1807 	if (thread_map == NULL)
1808 		return -1;
1809 
1810 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1811 						 process_synthesized_event,
1812 						 &rec->session->machines.host,
1813 						 needs_mmap,
1814 						 rec->opts.sample_address);
1815 	perf_thread_map__put(thread_map);
1816 	return err;
1817 }
1818 
1819 static int write_finished_init(struct record *rec, bool tail)
1820 {
1821 	if (rec->opts.tail_synthesize != tail)
1822 		return 0;
1823 
1824 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1825 }
1826 
1827 static int record__synthesize(struct record *rec, bool tail);
1828 
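/*
 * Finish the current output file and switch to a new, timestamped one,
 * honoring the switch_output.num_files rotation limit when set, then
 * re-synthesize the side-band/tracking events into the new file unless this
 * is the final switch at exit.
 */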
1829 static int
1830 record__switch_output(struct record *rec, bool at_exit)
1831 {
1832 	struct perf_data *data = &rec->data;
1833 	int fd, err;
1834 	char *new_filename;
1835 
1836 	/* Same size as a real timestamp, e.g. "2015122520103046" */
1837 	char timestamp[] = "InvalidTimestamp";
1838 
1839 	record__aio_mmap_read_sync(rec);
1840 
1841 	write_finished_init(rec, true);
1842 
1843 	record__synthesize(rec, true);
1844 	if (target__none(&rec->opts.target))
1845 		record__synthesize_workload(rec, true);
1846 
1847 	rec->samples = 0;
1848 	record__finish_output(rec);
1849 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1850 	if (err) {
1851 		pr_err("Failed to get current timestamp\n");
1852 		return -EINVAL;
1853 	}
1854 
1855 	fd = perf_data__switch(data, timestamp,
1856 				    rec->session->header.data_offset,
1857 				    at_exit, &new_filename);
1858 	if (fd >= 0 && !at_exit) {
1859 		rec->bytes_written = 0;
1860 		rec->session->header.data_size = 0;
1861 	}
1862 
1863 	if (!quiet)
1864 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1865 			data->path, timestamp);
1866 
1867 	if (rec->switch_output.num_files) {
1868 		int n = rec->switch_output.cur_file + 1;
1869 
1870 		if (n >= rec->switch_output.num_files)
1871 			n = 0;
1872 		rec->switch_output.cur_file = n;
1873 		if (rec->switch_output.filenames[n]) {
1874 			remove(rec->switch_output.filenames[n]);
1875 			zfree(&rec->switch_output.filenames[n]);
1876 		}
1877 		rec->switch_output.filenames[n] = new_filename;
1878 	} else {
1879 		free(new_filename);
1880 	}
1881 
1882 	/* Output tracking events */
1883 	if (!at_exit) {
1884 		record__synthesize(rec, false);
1885 
1886 		/*
1887 		 * In 'perf record --switch-output' without -a,
1888 		 * record__synthesize() in record__switch_output() won't
1889 		 * generate tracking events because there's no thread_map
1890 		 * in the evlist. As a result, the newly created perf.data
1891 		 * doesn't contain map and comm information.
1892 		 * Create a fake thread_map and directly call
1893 		 * perf_event__synthesize_thread_map() for those events.
1894 		 */
1895 		if (target__none(&rec->opts.target))
1896 			record__synthesize_workload(rec, false);
1897 		write_finished_init(rec, false);
1898 	}
1899 	return fd;
1900 }
1901 
1902 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1903 					struct perf_record_lost_samples *lost,
1904 					int cpu_idx, int thread_idx, u64 lost_count,
1905 					u16 misc_flag)
1906 {
1907 	struct perf_sample_id *sid;
1908 	struct perf_sample sample = {};
1909 	int id_hdr_size;
1910 
1911 	lost->lost = lost_count;
1912 	if (evsel->core.ids) {
1913 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1914 		sample.id = sid->id;
1915 	}
1916 
1917 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1918 						       evsel->core.attr.sample_type, &sample);
1919 	lost->header.size = sizeof(*lost) + id_hdr_size;
1920 	lost->header.misc = misc_flag;
1921 	record__write(rec, NULL, lost, lost->header.size);
1922 }
1923 
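/*
 * Read the per-cpu/per-thread lost-sample counts of every event and emit
 * PERF_RECORD_LOST_SAMPLES records for them, including samples dropped
 * by BPF filters.
 */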
1924 static void record__read_lost_samples(struct record *rec)
1925 {
1926 	struct perf_session *session = rec->session;
1927 	struct perf_record_lost_samples *lost;
1928 	struct evsel *evsel;
1929 
1930 	/* there was an error during record__open */
1931 	if (session->evlist == NULL)
1932 		return;
1933 
1934 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1935 	if (lost == NULL) {
1936 		pr_debug("Memory allocation failed\n");
1937 		return;
1938 	}
1939 
1940 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1941 
1942 	evlist__for_each_entry(session->evlist, evsel) {
1943 		struct xyarray *xy = evsel->core.sample_id;
1944 		u64 lost_count;
1945 
1946 		if (xy == NULL || evsel->core.fd == NULL)
1947 			continue;
1948 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1949 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1950 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1951 			continue;
1952 		}
1953 
1954 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1955 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1956 				struct perf_counts_values count;
1957 
1958 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1959 					pr_debug("read LOST count failed\n");
1960 					goto out;
1961 				}
1962 
1963 				if (count.lost) {
1964 					__record__save_lost_samples(rec, evsel, lost,
1965 								    x, y, count.lost, 0);
1966 				}
1967 			}
1968 		}
1969 
1970 		lost_count = perf_bpf_filter__lost_count(evsel);
1971 		if (lost_count)
1972 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1973 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1974 	}
1975 out:
1976 	free(lost);
1977 }
1978 
1979 static volatile sig_atomic_t workload_exec_errno;
1980 
1981 /*
1982  * evlist__prepare_workload will send a SIGUSR1
1983  * if the fork fails, since we asked for it by
1984  * setting its want_signal to true.
1985  */
1986 static void workload_exec_failed_signal(int signo __maybe_unused,
1987 					siginfo_t *info,
1988 					void *ucontext __maybe_unused)
1989 {
1990 	workload_exec_errno = info->si_value.sival_int;
1991 	done = 1;
1992 	child_finished = 1;
1993 }
1994 
1995 static void snapshot_sig_handler(int sig);
1996 static void alarm_sig_handler(int sig);
1997 
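/*
 * Pick any mmap'ed event control page: it carries the timekeeping fields
 * used below when synthesizing the time conversion (TIME_CONV) event.
 */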
1998 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1999 {
2000 	if (evlist) {
2001 		if (evlist->mmap && evlist->mmap[0].core.base)
2002 			return evlist->mmap[0].core.base;
2003 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2004 			return evlist->overwrite_mmap[0].core.base;
2005 	}
2006 	return NULL;
2007 }
2008 
2009 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2010 {
2011 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2012 	if (pc)
2013 		return pc;
2014 	return NULL;
2015 }
2016 
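/*
 * Synthesize the non-sample events (time conversion, id index, kernel and
 * module mmaps, thread/cpu maps, bpf, cgroup and existing task events)
 * that later analysis needs in addition to the recorded samples.
 */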
2017 static int record__synthesize(struct record *rec, bool tail)
2018 {
2019 	struct perf_session *session = rec->session;
2020 	struct machine *machine = &session->machines.host;
2021 	struct perf_data *data = &rec->data;
2022 	struct record_opts *opts = &rec->opts;
2023 	struct perf_tool *tool = &rec->tool;
2024 	int err = 0;
2025 	event_op f = process_synthesized_event;
2026 
2027 	if (rec->opts.tail_synthesize != tail)
2028 		return 0;
2029 
2030 	if (data->is_pipe) {
2031 		err = perf_event__synthesize_for_pipe(tool, session, data,
2032 						      process_synthesized_event);
2033 		if (err < 0)
2034 			goto out;
2035 
2036 		rec->bytes_written += err;
2037 	}
2038 
2039 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2040 					  process_synthesized_event, machine);
2041 	if (err)
2042 		goto out;
2043 
2044 	/* Synthesize id_index before auxtrace_info */
2045 	err = perf_event__synthesize_id_index(tool,
2046 					      process_synthesized_event,
2047 					      session->evlist, machine);
2048 	if (err)
2049 		goto out;
2050 
2051 	if (rec->opts.full_auxtrace) {
2052 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2053 					session, process_synthesized_event);
2054 		if (err)
2055 			goto out;
2056 	}
2057 
2058 	if (!evlist__exclude_kernel(rec->evlist)) {
2059 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2060 							 machine);
2061 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2062 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2063 				   "Check /proc/kallsyms permission or run as root.\n");
2064 
2065 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2066 						     machine);
2067 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2068 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2069 				   "Check /proc/modules permission or run as root.\n");
2070 	}
2071 
2072 	if (perf_guest) {
2073 		machines__process_guests(&session->machines,
2074 					 perf_event__synthesize_guest_os, tool);
2075 	}
2076 
2077 	err = perf_event__synthesize_extra_attr(&rec->tool,
2078 						rec->evlist,
2079 						process_synthesized_event,
2080 						data->is_pipe);
2081 	if (err)
2082 		goto out;
2083 
2084 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2085 						 process_synthesized_event,
2086 						NULL);
2087 	if (err < 0) {
2088 		pr_err("Couldn't synthesize thread map.\n");
2089 		return err;
2090 	}
2091 
2092 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2093 					     process_synthesized_event, NULL);
2094 	if (err < 0) {
2095 		pr_err("Couldn't synthesize cpu map.\n");
2096 		return err;
2097 	}
2098 
2099 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2100 						machine, opts);
2101 	if (err < 0) {
2102 		pr_warning("Couldn't synthesize bpf events.\n");
2103 		err = 0;
2104 	}
2105 
2106 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2107 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2108 						     machine);
2109 		if (err < 0) {
2110 			pr_warning("Couldn't synthesize cgroup events.\n");
2111 			err = 0;
2112 		}
2113 	}
2114 
2115 	if (rec->opts.nr_threads_synthesize > 1) {
2116 		mutex_init(&synth_lock);
2117 		perf_set_multithreaded();
2118 		f = process_locked_synthesized_event;
2119 	}
2120 
2121 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2122 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2123 
2124 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2125 						    rec->evlist->core.threads,
2126 						    f, needs_mmap, opts->sample_address,
2127 						    rec->opts.nr_threads_synthesize);
2128 	}
2129 
2130 	if (rec->opts.nr_threads_synthesize > 1) {
2131 		perf_set_singlethreaded();
2132 		mutex_destroy(&synth_lock);
2133 	}
2134 
2135 out:
2136 	return err;
2137 }
2138 
2139 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2140 {
2141 	struct record *rec = data;
2142 	pthread_kill(rec->thread_id, SIGUSR2);
2143 	return 0;
2144 }
2145 
2146 static int record__setup_sb_evlist(struct record *rec)
2147 {
2148 	struct record_opts *opts = &rec->opts;
2149 
2150 	if (rec->sb_evlist != NULL) {
2151 		/*
2152 		 * We get here if --switch-output-event populated the
2153 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2154 		 * to the main thread.
2155 		 */
2156 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2157 		rec->thread_id = pthread_self();
2158 	}
2159 #ifdef HAVE_LIBBPF_SUPPORT
2160 	if (!opts->no_bpf_event) {
2161 		if (rec->sb_evlist == NULL) {
2162 			rec->sb_evlist = evlist__new();
2163 
2164 			if (rec->sb_evlist == NULL) {
2165 				pr_err("Couldn't create side band evlist.\n");
2166 				return -1;
2167 			}
2168 		}
2169 
2170 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2171 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2172 			return -1;
2173 		}
2174 	}
2175 #endif
2176 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2177 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2178 		opts->no_bpf_event = true;
2179 	}
2180 
2181 	return 0;
2182 }
2183 
2184 static int record__init_clock(struct record *rec)
2185 {
2186 	struct perf_session *session = rec->session;
2187 	struct timespec ref_clockid;
2188 	struct timeval ref_tod;
2189 	u64 ref;
2190 
2191 	if (!rec->opts.use_clockid)
2192 		return 0;
2193 
2194 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2195 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2196 
2197 	session->header.env.clock.clockid = rec->opts.clockid;
2198 
2199 	if (gettimeofday(&ref_tod, NULL) != 0) {
2200 		pr_err("gettimeofday failed, cannot set reference time.\n");
2201 		return -1;
2202 	}
2203 
2204 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2205 		pr_err("clock_gettime failed, cannot set reference time.\n");
2206 		return -1;
2207 	}
2208 
2209 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2210 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2211 
2212 	session->header.env.clock.tod_ns = ref;
2213 
2214 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2215 	      (u64) ref_clockid.tv_nsec;
2216 
2217 	session->header.env.clock.clockid_ns = ref;
2218 	return 0;
2219 }
2220 
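/* Start an AUX area snapshot if the snapshot trigger is armed. */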
2221 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2222 {
2223 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2224 		trigger_hit(&auxtrace_snapshot_trigger);
2225 		auxtrace_record__snapshot_started = 1;
2226 		if (auxtrace_record__snapshot_start(rec->itr))
2227 			trigger_error(&auxtrace_snapshot_trigger);
2228 	}
2229 }
2230 
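/*
 * On hybrid systems the same event name can be opened on several core
 * PMUs, so prefix it with its PMU name ("pmu/event/") to keep the names
 * unique, unless the name already carries a PMU prefix.
 */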
2231 static void record__uniquify_name(struct record *rec)
2232 {
2233 	struct evsel *pos;
2234 	struct evlist *evlist = rec->evlist;
2235 	char *new_name;
2236 	int ret;
2237 
2238 	if (perf_pmus__num_core_pmus() == 1)
2239 		return;
2240 
2241 	evlist__for_each_entry(evlist, pos) {
2242 		if (!evsel__is_hybrid(pos))
2243 			continue;
2244 
2245 		if (strchr(pos->name, '/'))
2246 			continue;
2247 
2248 		ret = asprintf(&new_name, "%s/%s/",
2249 			       pos->pmu_name, pos->name);
2250 		if (ret >= 0) {
2251 			free(pos->name);
2252 			pos->name = new_name;
2253 		}
2254 	}
2255 }
2256 
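/*
 * Ask a reader thread to finish by closing the write end of its message
 * pipe, then wait for the acknowledgement on its ack pipe.
 */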
2257 static int record__terminate_thread(struct record_thread *thread_data)
2258 {
2259 	int err;
2260 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2261 	pid_t tid = thread_data->tid;
2262 
2263 	close(thread_data->pipes.msg[1]);
2264 	thread_data->pipes.msg[1] = -1;
2265 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2266 	if (err > 0)
2267 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2268 	else
2269 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2270 			   thread->tid, tid);
2271 
2272 	return 0;
2273 }
2274 
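/*
 * Start the per-thread trace readers with all signals blocked, pin each
 * one to its affinity mask and wait for its READY message before
 * continuing.
 */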
2275 static int record__start_threads(struct record *rec)
2276 {
2277 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2278 	struct record_thread *thread_data = rec->thread_data;
2279 	sigset_t full, mask;
2280 	pthread_t handle;
2281 	pthread_attr_t attrs;
2282 
2283 	thread = &thread_data[0];
2284 
2285 	if (!record__threads_enabled(rec))
2286 		return 0;
2287 
2288 	sigfillset(&full);
2289 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2290 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2291 		return -1;
2292 	}
2293 
2294 	pthread_attr_init(&attrs);
2295 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2296 
2297 	for (t = 1; t < nr_threads; t++) {
2298 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2299 
2300 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2301 		pthread_attr_setaffinity_np(&attrs,
2302 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2303 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2304 #endif
2305 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2306 			for (tt = 1; tt < t; tt++)
2307 				record__terminate_thread(&thread_data[tt]);
2308 			pr_err("Failed to start threads: %s\n", strerror(errno));
2309 			ret = -1;
2310 			goto out_err;
2311 		}
2312 
2313 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2314 		if (err > 0)
2315 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2316 				  thread_msg_tags[msg]);
2317 		else
2318 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2319 				   thread->tid, rec->thread_data[t].tid);
2320 	}
2321 
2322 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2323 			(cpu_set_t *)thread->mask->affinity.bits);
2324 
2325 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2326 
2327 out_err:
2328 	pthread_attr_destroy(&attrs);
2329 
2330 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2331 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2332 		ret = -1;
2333 	}
2334 
2335 	return ret;
2336 }
2337 
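/*
 * Terminate the reader threads and fold their per-thread sample and byte
 * counters back into the record/session totals.
 */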
2338 static int record__stop_threads(struct record *rec)
2339 {
2340 	int t;
2341 	struct record_thread *thread_data = rec->thread_data;
2342 
2343 	for (t = 1; t < rec->nr_threads; t++)
2344 		record__terminate_thread(&thread_data[t]);
2345 
2346 	for (t = 0; t < rec->nr_threads; t++) {
2347 		rec->samples += thread_data[t].samples;
2348 		if (!record__threads_enabled(rec))
2349 			continue;
2350 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2351 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2352 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2353 			 thread_data[t].samples, thread_data[t].waking);
2354 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2355 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2356 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2357 		else
2358 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2359 	}
2360 
2361 	return 0;
2362 }
2363 
2364 static unsigned long record__waking(struct record *rec)
2365 {
2366 	int t;
2367 	unsigned long waking = 0;
2368 	struct record_thread *thread_data = rec->thread_data;
2369 
2370 	for (t = 0; t < rec->nr_threads; t++)
2371 		waking += thread_data[t].waking;
2372 
2373 	return waking;
2374 }
2375 
2376 static int __cmd_record(struct record *rec, int argc, const char **argv)
2377 {
2378 	int err;
2379 	int status = 0;
2380 	const bool forks = argc > 0;
2381 	struct perf_tool *tool = &rec->tool;
2382 	struct record_opts *opts = &rec->opts;
2383 	struct perf_data *data = &rec->data;
2384 	struct perf_session *session;
2385 	bool disabled = false, draining = false;
2386 	int fd;
2387 	float ratio = 0;
2388 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2389 
2390 	atexit(record__sig_exit);
2391 	signal(SIGCHLD, sig_handler);
2392 	signal(SIGINT, sig_handler);
2393 	signal(SIGTERM, sig_handler);
2394 	signal(SIGSEGV, sigsegv_handler);
2395 
2396 	if (rec->opts.record_namespaces)
2397 		tool->namespace_events = true;
2398 
2399 	if (rec->opts.record_cgroup) {
2400 #ifdef HAVE_FILE_HANDLE
2401 		tool->cgroup_events = true;
2402 #else
2403 		pr_err("cgroup tracking is not supported\n");
2404 		return -1;
2405 #endif
2406 	}
2407 
2408 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2409 		signal(SIGUSR2, snapshot_sig_handler);
2410 		if (rec->opts.auxtrace_snapshot_mode)
2411 			trigger_on(&auxtrace_snapshot_trigger);
2412 		if (rec->switch_output.enabled)
2413 			trigger_on(&switch_output_trigger);
2414 	} else {
2415 		signal(SIGUSR2, SIG_IGN);
2416 	}
2417 
2418 	session = perf_session__new(data, tool);
2419 	if (IS_ERR(session)) {
2420 		pr_err("Perf session creation failed.\n");
2421 		return PTR_ERR(session);
2422 	}
2423 
2424 	if (record__threads_enabled(rec)) {
2425 		if (perf_data__is_pipe(&rec->data)) {
2426 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2427 			return -1;
2428 		}
2429 		if (rec->opts.full_auxtrace) {
2430 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2431 			return -1;
2432 		}
2433 	}
2434 
2435 	fd = perf_data__fd(data);
2436 	rec->session = session;
2437 
2438 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2439 		pr_err("Compression initialization failed.\n");
2440 		return -1;
2441 	}
2442 #ifdef HAVE_EVENTFD_SUPPORT
2443 	done_fd = eventfd(0, EFD_NONBLOCK);
2444 	if (done_fd < 0) {
2445 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2446 		status = -1;
2447 		goto out_delete_session;
2448 	}
2449 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2450 	if (err < 0) {
2451 		pr_err("Failed to add wakeup eventfd to poll list\n");
2452 		status = err;
2453 		goto out_delete_session;
2454 	}
2455 #endif // HAVE_EVENTFD_SUPPORT
2456 
2457 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2458 	session->header.env.comp_level = rec->opts.comp_level;
2459 
2460 	if (rec->opts.kcore &&
2461 	    !record__kcore_readable(&session->machines.host)) {
2462 		pr_err("ERROR: kcore is not readable.\n");
2463 		return -1;
2464 	}
2465 
2466 	if (record__init_clock(rec))
2467 		return -1;
2468 
2469 	record__init_features(rec);
2470 
2471 	if (forks) {
2472 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2473 					       workload_exec_failed_signal);
2474 		if (err < 0) {
2475 			pr_err("Couldn't run the workload!\n");
2476 			status = err;
2477 			goto out_delete_session;
2478 		}
2479 	}
2480 
2481 	/*
2482 	 * If we have just a single event and are sending data
2483 	 * through a pipe, we need to force sample id allocation,
2484 	 * because we synthesize the event name through the pipe
2485 	 * and need the id for that.
2486 	 */
2487 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2488 		rec->opts.sample_id = true;
2489 
2490 	record__uniquify_name(rec);
2491 
2492 	/* Debug message used by test scripts */
2493 	pr_debug3("perf record opening and mmapping events\n");
2494 	if (record__open(rec) != 0) {
2495 		err = -1;
2496 		goto out_free_threads;
2497 	}
2498 	/* Debug message used by test scripts */
2499 	pr_debug3("perf record done opening and mmapping events\n");
2500 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2501 
2502 	if (rec->opts.kcore) {
2503 		err = record__kcore_copy(&session->machines.host, data);
2504 		if (err) {
2505 			pr_err("ERROR: Failed to copy kcore\n");
2506 			goto out_free_threads;
2507 		}
2508 	}
2509 
2510 	/*
2511 	 * Normally perf_session__new would do this, but it doesn't have the
2512 	 * evlist.
2513 	 */
2514 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2515 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2516 		rec->tool.ordered_events = false;
2517 	}
2518 
2519 	if (evlist__nr_groups(rec->evlist) == 0)
2520 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2521 
2522 	if (data->is_pipe) {
2523 		err = perf_header__write_pipe(fd);
2524 		if (err < 0)
2525 			goto out_free_threads;
2526 	} else {
2527 		err = perf_session__write_header(session, rec->evlist, fd, false);
2528 		if (err < 0)
2529 			goto out_free_threads;
2530 	}
2531 
2532 	err = -1;
2533 	if (!rec->no_buildid
2534 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2535 		pr_err("Couldn't generate buildids. "
2536 		       "Use --no-buildid to profile anyway.\n");
2537 		goto out_free_threads;
2538 	}
2539 
2540 	err = record__setup_sb_evlist(rec);
2541 	if (err)
2542 		goto out_free_threads;
2543 
2544 	err = record__synthesize(rec, false);
2545 	if (err < 0)
2546 		goto out_free_threads;
2547 
2548 	if (rec->realtime_prio) {
2549 		struct sched_param param;
2550 
2551 		param.sched_priority = rec->realtime_prio;
2552 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2553 			pr_err("Could not set realtime priority.\n");
2554 			err = -1;
2555 			goto out_free_threads;
2556 		}
2557 	}
2558 
2559 	if (record__start_threads(rec))
2560 		goto out_free_threads;
2561 
2562 	/*
2563 	 * When perf is starting the traced process, all the events
2564 	 * (apart from group members) have enable_on_exec=1 set,
2565 	 * so don't spoil it by prematurely enabling them.
2566 	 */
2567 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2568 		evlist__enable(rec->evlist);
2569 
2570 	/*
2571 	 * Let the child rip
2572 	 */
2573 	if (forks) {
2574 		struct machine *machine = &session->machines.host;
2575 		union perf_event *event;
2576 		pid_t tgid;
2577 
2578 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2579 		if (event == NULL) {
2580 			err = -ENOMEM;
2581 			goto out_child;
2582 		}
2583 
2584 		/*
2585 		 * Some H/W events are generated before the COMM event,
2586 		 * which is emitted during exec(), so perf script
2587 		 * cannot see a correct process name for those events.
2588 		 * Synthesize a COMM event here to prevent that.
2589 		 */
2590 		tgid = perf_event__synthesize_comm(tool, event,
2591 						   rec->evlist->workload.pid,
2592 						   process_synthesized_event,
2593 						   machine);
2594 		free(event);
2595 
2596 		if (tgid == -1)
2597 			goto out_child;
2598 
2599 		event = malloc(sizeof(event->namespaces) +
2600 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2601 			       machine->id_hdr_size);
2602 		if (event == NULL) {
2603 			err = -ENOMEM;
2604 			goto out_child;
2605 		}
2606 
2607 		/*
2608 		 * Synthesize NAMESPACES event for the command specified.
2609 		 */
2610 		perf_event__synthesize_namespaces(tool, event,
2611 						  rec->evlist->workload.pid,
2612 						  tgid, process_synthesized_event,
2613 						  machine);
2614 		free(event);
2615 
2616 		evlist__start_workload(rec->evlist);
2617 	}
2618 
2619 	if (opts->target.initial_delay) {
2620 		pr_info(EVLIST_DISABLED_MSG);
2621 		if (opts->target.initial_delay > 0) {
2622 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2623 			evlist__enable(rec->evlist);
2624 			pr_info(EVLIST_ENABLED_MSG);
2625 		}
2626 	}
2627 
2628 	err = event_enable_timer__start(rec->evlist->eet);
2629 	if (err)
2630 		goto out_child;
2631 
2632 	/* Debug message used by test scripts */
2633 	pr_debug3("perf record has started\n");
2634 	fflush(stderr);
2635 
2636 	trigger_ready(&auxtrace_snapshot_trigger);
2637 	trigger_ready(&switch_output_trigger);
2638 	perf_hooks__invoke_record_start();
2639 
2640 	/*
2641 	 * Must write FINISHED_INIT so it will be seen after all other
2642 	 * synthesized user events, but before any regular events.
2643 	 */
2644 	err = write_finished_init(rec, false);
2645 	if (err < 0)
2646 		goto out_child;
2647 
2648 	for (;;) {
2649 		unsigned long long hits = thread->samples;
2650 
2651 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2652 		 * here: when done == true and hits != rec->samples
2653 		 * in the previous round.
2654 		 *
2655 		 * evlist__toggle_bkw_mmap() ensures we never convert
2656 		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2657 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2658 		 */
2659 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2660 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2661 
2662 		if (record__mmap_read_all(rec, false) < 0) {
2663 			trigger_error(&auxtrace_snapshot_trigger);
2664 			trigger_error(&switch_output_trigger);
2665 			err = -1;
2666 			goto out_child;
2667 		}
2668 
2669 		if (auxtrace_record__snapshot_started) {
2670 			auxtrace_record__snapshot_started = 0;
2671 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2672 				record__read_auxtrace_snapshot(rec, false);
2673 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2674 				pr_err("AUX area tracing snapshot failed\n");
2675 				err = -1;
2676 				goto out_child;
2677 			}
2678 		}
2679 
2680 		if (trigger_is_hit(&switch_output_trigger)) {
2681 			/*
2682 			 * If switch_output_trigger is hit, the data in
2683 			 * overwritable ring buffer should have been collected,
2684 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2685 			 *
2686 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
2687 			 * record__mmap_read_all() didn't collect data from the
2688 			 * overwritable ring buffer. Read it again.
2689 			 */
2690 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2691 				continue;
2692 			trigger_ready(&switch_output_trigger);
2693 
2694 			/*
2695 			 * Re-enable events in the overwrite ring buffer after
2696 			 * record__mmap_read_all(): we should have collected
2697 			 * data from it.
2698 			 */
2699 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2700 
2701 			if (!quiet)
2702 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2703 					record__waking(rec));
2704 			thread->waking = 0;
2705 			fd = record__switch_output(rec, false);
2706 			if (fd < 0) {
2707 				pr_err("Failed to switch to new file\n");
2708 				trigger_error(&switch_output_trigger);
2709 				err = fd;
2710 				goto out_child;
2711 			}
2712 
2713 			/* re-arm the alarm */
2714 			if (rec->switch_output.time)
2715 				alarm(rec->switch_output.time);
2716 		}
2717 
2718 		if (hits == thread->samples) {
2719 			if (done || draining)
2720 				break;
2721 			err = fdarray__poll(&thread->pollfd, -1);
2722 			/*
2723 			 * Propagate an error only if there is one. Ignore a positive
2724 			 * number of returned events and the EINTR error.
2725 			 */
2726 			if (err > 0 || (err < 0 && errno == EINTR))
2727 				err = 0;
2728 			thread->waking++;
2729 
2730 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2731 					    record__thread_munmap_filtered, NULL) == 0)
2732 				draining = true;
2733 
2734 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2735 			if (err)
2736 				goto out_child;
2737 		}
2738 
2739 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2740 			switch (cmd) {
2741 			case EVLIST_CTL_CMD_SNAPSHOT:
2742 				hit_auxtrace_snapshot_trigger(rec);
2743 				evlist__ctlfd_ack(rec->evlist);
2744 				break;
2745 			case EVLIST_CTL_CMD_STOP:
2746 				done = 1;
2747 				break;
2748 			case EVLIST_CTL_CMD_ACK:
2749 			case EVLIST_CTL_CMD_UNSUPPORTED:
2750 			case EVLIST_CTL_CMD_ENABLE:
2751 			case EVLIST_CTL_CMD_DISABLE:
2752 			case EVLIST_CTL_CMD_EVLIST:
2753 			case EVLIST_CTL_CMD_PING:
2754 			default:
2755 				break;
2756 			}
2757 		}
2758 
2759 		err = event_enable_timer__process(rec->evlist->eet);
2760 		if (err < 0)
2761 			goto out_child;
2762 		if (err) {
2763 			err = 0;
2764 			done = 1;
2765 		}
2766 
2767 		/*
2768 		 * When perf is starting the traced process, the events die
2769 		 * with the process at the end and we wait for that. Thus there
2770 		 * is no need to disable events in this case.
2771 		 */
2772 		if (done && !disabled && !target__none(&opts->target)) {
2773 			trigger_off(&auxtrace_snapshot_trigger);
2774 			evlist__disable(rec->evlist);
2775 			disabled = true;
2776 		}
2777 	}
2778 
2779 	trigger_off(&auxtrace_snapshot_trigger);
2780 	trigger_off(&switch_output_trigger);
2781 
2782 	if (opts->auxtrace_snapshot_on_exit)
2783 		record__auxtrace_snapshot_exit(rec);
2784 
2785 	if (forks && workload_exec_errno) {
2786 		char msg[STRERR_BUFSIZE], strevsels[2048];
2787 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2788 
2789 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2790 
2791 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2792 			strevsels, argv[0], emsg);
2793 		err = -1;
2794 		goto out_child;
2795 	}
2796 
2797 	if (!quiet)
2798 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2799 			record__waking(rec));
2800 
2801 	write_finished_init(rec, true);
2802 
2803 	if (target__none(&rec->opts.target))
2804 		record__synthesize_workload(rec, true);
2805 
2806 out_child:
2807 	record__stop_threads(rec);
2808 	record__mmap_read_all(rec, true);
2809 out_free_threads:
2810 	record__free_thread_data(rec);
2811 	evlist__finalize_ctlfd(rec->evlist);
2812 	record__aio_mmap_read_sync(rec);
2813 
2814 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2815 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2816 		session->header.env.comp_ratio = ratio + 0.5;
2817 	}
2818 
2819 	if (forks) {
2820 		int exit_status;
2821 
2822 		if (!child_finished)
2823 			kill(rec->evlist->workload.pid, SIGTERM);
2824 
2825 		wait(&exit_status);
2826 
2827 		if (err < 0)
2828 			status = err;
2829 		else if (WIFEXITED(exit_status))
2830 			status = WEXITSTATUS(exit_status);
2831 		else if (WIFSIGNALED(exit_status))
2832 			signr = WTERMSIG(exit_status);
2833 	} else
2834 		status = err;
2835 
2836 	if (rec->off_cpu)
2837 		rec->bytes_written += off_cpu_write(rec->session);
2838 
2839 	record__read_lost_samples(rec);
2840 	record__synthesize(rec, true);
2841 	/* this will be recalculated during process_buildids() */
2842 	rec->samples = 0;
2843 
2844 	if (!err) {
2845 		if (!rec->timestamp_filename) {
2846 			record__finish_output(rec);
2847 		} else {
2848 			fd = record__switch_output(rec, true);
2849 			if (fd < 0) {
2850 				status = fd;
2851 				goto out_delete_session;
2852 			}
2853 		}
2854 	}
2855 
2856 	perf_hooks__invoke_record_end();
2857 
2858 	if (!err && !quiet) {
2859 		char samples[128];
2860 		const char *postfix = rec->timestamp_filename ?
2861 					".<timestamp>" : "";
2862 
2863 		if (rec->samples && !rec->opts.full_auxtrace)
2864 			scnprintf(samples, sizeof(samples),
2865 				  " (%" PRIu64 " samples)", rec->samples);
2866 		else
2867 			samples[0] = '\0';
2868 
2869 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2870 			perf_data__size(data) / 1024.0 / 1024.0,
2871 			data->path, postfix, samples);
2872 		if (ratio) {
2873 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2874 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2875 					ratio);
2876 		}
2877 		fprintf(stderr, " ]\n");
2878 	}
2879 
2880 out_delete_session:
2881 #ifdef HAVE_EVENTFD_SUPPORT
2882 	if (done_fd >= 0) {
2883 		fd = done_fd;
2884 		done_fd = -1;
2885 
2886 		close(fd);
2887 	}
2888 #endif
2889 	zstd_fini(&session->zstd_data);
2890 	perf_session__delete(session);
2891 
2892 	if (!opts->no_bpf_event)
2893 		evlist__stop_sb_thread(rec->sb_evlist);
2894 	return status;
2895 }
2896 
2897 static void callchain_debug(struct callchain_param *callchain)
2898 {
2899 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2900 
2901 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2902 
2903 	if (callchain->record_mode == CALLCHAIN_DWARF)
2904 		pr_debug("callchain: stack dump size %d\n",
2905 			 callchain->dump_size);
2906 }
2907 
2908 int record_opts__parse_callchain(struct record_opts *record,
2909 				 struct callchain_param *callchain,
2910 				 const char *arg, bool unset)
2911 {
2912 	int ret;
2913 	callchain->enabled = !unset;
2914 
2915 	/* --no-call-graph */
2916 	if (unset) {
2917 		callchain->record_mode = CALLCHAIN_NONE;
2918 		pr_debug("callchain: disabled\n");
2919 		return 0;
2920 	}
2921 
2922 	ret = parse_callchain_record_opt(arg, callchain);
2923 	if (!ret) {
2924 		/* Enable data address sampling for DWARF unwind. */
2925 		if (callchain->record_mode == CALLCHAIN_DWARF)
2926 			record->sample_address = true;
2927 		callchain_debug(callchain);
2928 	}
2929 
2930 	return ret;
2931 }
2932 
2933 int record_parse_callchain_opt(const struct option *opt,
2934 			       const char *arg,
2935 			       int unset)
2936 {
2937 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2938 }
2939 
2940 int record_callchain_opt(const struct option *opt,
2941 			 const char *arg __maybe_unused,
2942 			 int unset __maybe_unused)
2943 {
2944 	struct callchain_param *callchain = opt->value;
2945 
2946 	callchain->enabled = true;
2947 
2948 	if (callchain->record_mode == CALLCHAIN_NONE)
2949 		callchain->record_mode = CALLCHAIN_FP;
2950 
2951 	callchain_debug(callchain);
2952 	return 0;
2953 }
2954 
2955 static int perf_record_config(const char *var, const char *value, void *cb)
2956 {
2957 	struct record *rec = cb;
2958 
2959 	if (!strcmp(var, "record.build-id")) {
2960 		if (!strcmp(value, "cache"))
2961 			rec->no_buildid_cache = false;
2962 		else if (!strcmp(value, "no-cache"))
2963 			rec->no_buildid_cache = true;
2964 		else if (!strcmp(value, "skip"))
2965 			rec->no_buildid = true;
2966 		else if (!strcmp(value, "mmap"))
2967 			rec->buildid_mmap = true;
2968 		else
2969 			return -1;
2970 		return 0;
2971 	}
2972 	if (!strcmp(var, "record.call-graph")) {
2973 		var = "call-graph.record-mode";
2974 		return perf_default_config(var, value, cb);
2975 	}
2976 #ifdef HAVE_AIO_SUPPORT
2977 	if (!strcmp(var, "record.aio")) {
2978 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2979 		if (!rec->opts.nr_cblocks)
2980 			rec->opts.nr_cblocks = nr_cblocks_default;
2981 	}
2982 #endif
2983 	if (!strcmp(var, "record.debuginfod")) {
2984 		rec->debuginfod.urls = strdup(value);
2985 		if (!rec->debuginfod.urls)
2986 			return -ENOMEM;
2987 		rec->debuginfod.set = true;
2988 	}
2989 
2990 	return 0;
2991 }
2992 
2993 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2994 {
2995 	struct record *rec = (struct record *)opt->value;
2996 
2997 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2998 }
2999 
3000 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3001 {
3002 	struct record_opts *opts = (struct record_opts *)opt->value;
3003 
3004 	if (unset || !str)
3005 		return 0;
3006 
3007 	if (!strcasecmp(str, "node"))
3008 		opts->affinity = PERF_AFFINITY_NODE;
3009 	else if (!strcasecmp(str, "cpu"))
3010 		opts->affinity = PERF_AFFINITY_CPU;
3011 
3012 	return 0;
3013 }
3014 
3015 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3016 {
3017 	mask->nbits = nr_bits;
3018 	mask->bits = bitmap_zalloc(mask->nbits);
3019 	if (!mask->bits)
3020 		return -ENOMEM;
3021 
3022 	return 0;
3023 }
3024 
3025 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3026 {
3027 	bitmap_free(mask->bits);
3028 	mask->nbits = 0;
3029 }
3030 
3031 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3032 {
3033 	int ret;
3034 
3035 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3036 	if (ret) {
3037 		mask->affinity.bits = NULL;
3038 		return ret;
3039 	}
3040 
3041 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3042 	if (ret) {
3043 		record__mmap_cpu_mask_free(&mask->maps);
3044 		mask->maps.bits = NULL;
3045 	}
3046 
3047 	return ret;
3048 }
3049 
3050 static void record__thread_mask_free(struct thread_mask *mask)
3051 {
3052 	record__mmap_cpu_mask_free(&mask->maps);
3053 	record__mmap_cpu_mask_free(&mask->affinity);
3054 }
3055 
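/*
 * Parse the --threads spec: with no argument one reader thread per CPU is
 * used, otherwise the spec selects a predefined layout (per CPU, core,
 * package or NUMA node) or is taken as a user supplied list of masks.
 */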
3056 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3057 {
3058 	int s;
3059 	struct record_opts *opts = opt->value;
3060 
3061 	if (unset || !str || !strlen(str)) {
3062 		opts->threads_spec = THREAD_SPEC__CPU;
3063 	} else {
3064 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3065 			if (s == THREAD_SPEC__USER) {
3066 				opts->threads_user_spec = strdup(str);
3067 				if (!opts->threads_user_spec)
3068 					return -ENOMEM;
3069 				opts->threads_spec = THREAD_SPEC__USER;
3070 				break;
3071 			}
3072 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3073 				opts->threads_spec = s;
3074 				break;
3075 			}
3076 		}
3077 	}
3078 
3079 	if (opts->threads_spec == THREAD_SPEC__USER)
3080 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3081 	else
3082 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3083 
3084 	return 0;
3085 }
3086 
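/* Parse --max-size values such as "200M" or "2G" into bytes. */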
3087 static int parse_output_max_size(const struct option *opt,
3088 				 const char *str, int unset)
3089 {
3090 	unsigned long *s = (unsigned long *)opt->value;
3091 	static struct parse_tag tags_size[] = {
3092 		{ .tag  = 'B', .mult = 1       },
3093 		{ .tag  = 'K', .mult = 1 << 10 },
3094 		{ .tag  = 'M', .mult = 1 << 20 },
3095 		{ .tag  = 'G', .mult = 1 << 30 },
3096 		{ .tag  = 0 },
3097 	};
3098 	unsigned long val;
3099 
3100 	if (unset) {
3101 		*s = 0;
3102 		return 0;
3103 	}
3104 
3105 	val = parse_tag_value(str, tags_size);
3106 	if (val != (unsigned long) -1) {
3107 		*s = val;
3108 		return 0;
3109 	}
3110 
3111 	return -1;
3112 }
3113 
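/*
 * Parse -m/--mmap-pages as "<pages>[,<AUX pages>]", e.g. "-m 512,2048"
 * sizes the data and AUX area tracing buffers separately.
 */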
3114 static int record__parse_mmap_pages(const struct option *opt,
3115 				    const char *str,
3116 				    int unset __maybe_unused)
3117 {
3118 	struct record_opts *opts = opt->value;
3119 	char *s, *p;
3120 	unsigned int mmap_pages;
3121 	int ret;
3122 
3123 	if (!str)
3124 		return -EINVAL;
3125 
3126 	s = strdup(str);
3127 	if (!s)
3128 		return -ENOMEM;
3129 
3130 	p = strchr(s, ',');
3131 	if (p)
3132 		*p = '\0';
3133 
3134 	if (*s) {
3135 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3136 		if (ret)
3137 			goto out_free;
3138 		opts->mmap_pages = mmap_pages;
3139 	}
3140 
3141 	if (!p) {
3142 		ret = 0;
3143 		goto out_free;
3144 	}
3145 
3146 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3147 	if (ret)
3148 		goto out_free;
3149 
3150 	opts->auxtrace_mmap_pages = mmap_pages;
3151 
3152 out_free:
3153 	free(s);
3154 	return ret;
3155 }
3156 
3157 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3158 {
3159 }
3160 
3161 static int parse_control_option(const struct option *opt,
3162 				const char *str,
3163 				int unset __maybe_unused)
3164 {
3165 	struct record_opts *opts = opt->value;
3166 
3167 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3168 }
3169 
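/*
 * Warn when the --switch-output size threshold is below the wakeup buffer
 * size (half of the mmap buffer): the size check only runs at wakeups, so
 * output files can overshoot the requested size.
 */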
3170 static void switch_output_size_warn(struct record *rec)
3171 {
3172 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3173 	struct switch_output *s = &rec->switch_output;
3174 
3175 	wakeup_size /= 2;
3176 
3177 	if (s->size < wakeup_size) {
3178 		char buf[100];
3179 
3180 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3181 		pr_warning("WARNING: switch-output data size lower than "
3182 			   "wakeup kernel buffer size (%s), "
3183 			   "expect bigger perf.data sizes\n", buf);
3184 	}
3185 }
3186 
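/*
 * Configure --switch-output: "signal" (rotate on SIGUSR2), a size such as
 * "100M" or a time such as "30s". Any of these also implies timestamped
 * output file names.
 */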
3187 static int switch_output_setup(struct record *rec)
3188 {
3189 	struct switch_output *s = &rec->switch_output;
3190 	static struct parse_tag tags_size[] = {
3191 		{ .tag  = 'B', .mult = 1       },
3192 		{ .tag  = 'K', .mult = 1 << 10 },
3193 		{ .tag  = 'M', .mult = 1 << 20 },
3194 		{ .tag  = 'G', .mult = 1 << 30 },
3195 		{ .tag  = 0 },
3196 	};
3197 	static struct parse_tag tags_time[] = {
3198 		{ .tag  = 's', .mult = 1        },
3199 		{ .tag  = 'm', .mult = 60       },
3200 		{ .tag  = 'h', .mult = 60*60    },
3201 		{ .tag  = 'd', .mult = 60*60*24 },
3202 		{ .tag  = 0 },
3203 	};
3204 	unsigned long val;
3205 
3206 	/*
3207 	 * If we're using --switch-output-event, then we imply
3208 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3209 	 * thread to its parent.
3210 	 */
3211 	if (rec->switch_output_event_set) {
3212 		if (record__threads_enabled(rec)) {
3213 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3214 			return 0;
3215 		}
3216 		goto do_signal;
3217 	}
3218 
3219 	if (!s->set)
3220 		return 0;
3221 
3222 	if (record__threads_enabled(rec)) {
3223 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3224 		return 0;
3225 	}
3226 
3227 	if (!strcmp(s->str, "signal")) {
3228 do_signal:
3229 		s->signal = true;
3230 		pr_debug("switch-output with SIGUSR2 signal\n");
3231 		goto enabled;
3232 	}
3233 
3234 	val = parse_tag_value(s->str, tags_size);
3235 	if (val != (unsigned long) -1) {
3236 		s->size = val;
3237 		pr_debug("switch-output with %s size threshold\n", s->str);
3238 		goto enabled;
3239 	}
3240 
3241 	val = parse_tag_value(s->str, tags_time);
3242 	if (val != (unsigned long) -1) {
3243 		s->time = val;
3244 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3245 			 s->str, s->time);
3246 		goto enabled;
3247 	}
3248 
3249 	return -1;
3250 
3251 enabled:
3252 	rec->timestamp_filename = true;
3253 	s->enabled              = true;
3254 
3255 	if (s->size && !rec->opts.no_buffering)
3256 		switch_output_size_warn(rec);
3257 
3258 	return 0;
3259 }
3260 
3261 static const char * const __record_usage[] = {
3262 	"perf record [<options>] [<command>]",
3263 	"perf record [<options>] -- <command> [<options>]",
3264 	NULL
3265 };
3266 const char * const *record_usage = __record_usage;
3267 
3268 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3269 				  struct perf_sample *sample, struct machine *machine)
3270 {
3271 	/*
3272 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3273 	 * so there is no need to add them twice.
3274 	 */
3275 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3276 		return 0;
3277 	return perf_event__process_mmap(tool, event, sample, machine);
3278 }
3279 
3280 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3281 				   struct perf_sample *sample, struct machine *machine)
3282 {
3283 	/*
3284 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3285 	 * so there is no need to add them twice.
3286 	 */
3287 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3288 		return 0;
3289 
3290 	return perf_event__process_mmap2(tool, event, sample, machine);
3291 }
3292 
3293 static int process_timestamp_boundary(struct perf_tool *tool,
3294 				      union perf_event *event __maybe_unused,
3295 				      struct perf_sample *sample,
3296 				      struct machine *machine __maybe_unused)
3297 {
3298 	struct record *rec = container_of(tool, struct record, tool);
3299 
3300 	set_timestamp_boundary(rec, sample->time);
3301 	return 0;
3302 }
3303 
3304 static int parse_record_synth_option(const struct option *opt,
3305 				     const char *str,
3306 				     int unset __maybe_unused)
3307 {
3308 	struct record_opts *opts = opt->value;
3309 	char *p = strdup(str);
3310 
3311 	if (p == NULL)
3312 		return -1;
3313 
3314 	opts->synth = parse_synth_opt(p);
3315 	free(p);
3316 
3317 	if (opts->synth < 0) {
3318 		pr_err("Invalid synth option: %s\n", str);
3319 		return -1;
3320 	}
3321 	return 0;
3322 }
3323 
3324 /*
3325  * XXX Ideally this would be local to cmd_record() and passed to a record__new
3326  * because we need to have access to it in record__exit, which is called
3327  * after cmd_record() exits, but since record_options needs to be accessible to
3328  * builtin-script, leave it here.
3329  *
3330  * At least we don't touch it in all the other functions here directly.
3331  *
3332  * Just say no to tons of global variables, sigh.
3333  */
3334 static struct record record = {
3335 	.opts = {
3336 		.sample_time	     = true,
3337 		.mmap_pages	     = UINT_MAX,
3338 		.user_freq	     = UINT_MAX,
3339 		.user_interval	     = ULLONG_MAX,
3340 		.freq		     = 4000,
3341 		.target		     = {
3342 			.uses_mmap   = true,
3343 			.default_per_cpu = true,
3344 		},
3345 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3346 		.nr_threads_synthesize = 1,
3347 		.ctl_fd              = -1,
3348 		.ctl_fd_ack          = -1,
3349 		.synth               = PERF_SYNTH_ALL,
3350 	},
3351 	.tool = {
3352 		.sample		= process_sample_event,
3353 		.fork		= perf_event__process_fork,
3354 		.exit		= perf_event__process_exit,
3355 		.comm		= perf_event__process_comm,
3356 		.namespaces	= perf_event__process_namespaces,
3357 		.mmap		= build_id__process_mmap,
3358 		.mmap2		= build_id__process_mmap2,
3359 		.itrace_start	= process_timestamp_boundary,
3360 		.aux		= process_timestamp_boundary,
3361 		.ordered_events	= true,
3362 	},
3363 };
3364 
3365 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3366 	"\n\t\t\t\tDefault: fp";
3367 
3368 static bool dry_run;
3369 
3370 static struct parse_events_option_args parse_events_option_args = {
3371 	.evlistp = &record.evlist,
3372 };
3373 
3374 static struct parse_events_option_args switch_output_parse_events_option_args = {
3375 	.evlistp = &record.sb_evlist,
3376 };
3377 
3378 /*
3379  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3380  * with it and switch to using the library functions in perf_evlist that came
3381  * from builtin-record.c, i.e. use record_opts,
3382  * evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
3383  * using pipes, etc.
3384  */
3385 static struct option __record_options[] = {
3386 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3387 		     "event selector. use 'perf list' to list available events",
3388 		     parse_events_option),
3389 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3390 		     "event filter", parse_filter),
3391 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3392 			   NULL, "don't record events from perf itself",
3393 			   exclude_perf),
3394 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3395 		    "record events on existing process id"),
3396 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3397 		    "record events on existing thread id"),
3398 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3399 		    "collect data with this RT SCHED_FIFO priority"),
3400 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3401 		    "collect data without buffering"),
3402 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3403 		    "collect raw sample records from all opened counters"),
3404 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3405 			    "system-wide collection from all CPUs"),
3406 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3407 		    "list of cpus to monitor"),
3408 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3409 	OPT_STRING('o', "output", &record.data.path, "file",
3410 		    "output file name"),
3411 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3412 			&record.opts.no_inherit_set,
3413 			"child tasks do not inherit counters"),
3414 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3415 		    "synthesize non-sample events at the end of output"),
3416 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3417 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3418 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3419 		    "Fail if the specified frequency can't be used"),
3420 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3421 		     "profile at this frequency",
3422 		      record__parse_freq),
3423 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3424 		     "number of mmap data pages and AUX area tracing mmap pages",
3425 		     record__parse_mmap_pages),
3426 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3427 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3428 		     record__mmap_flush_parse),
3429 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3430 			   NULL, "enables call-graph recording" ,
3431 			   &record_callchain_opt),
3432 	OPT_CALLBACK(0, "call-graph", &record.opts,
3433 		     "record_mode[,record_size]", record_callchain_help,
3434 		     &record_parse_callchain_opt),
3435 	OPT_INCR('v', "verbose", &verbose,
3436 		    "be more verbose (show counter open errors, etc)"),
3437 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3438 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3439 		    "per thread counts"),
3440 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3441 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3442 		    "Record the sample physical addresses"),
3443 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3444 		    "Record the sampled data address data page size"),
3445 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3446 		    "Record the sampled code address (ip) page size"),
3447 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3448 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3449 		    "Record the sample identifier"),
3450 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3451 			&record.opts.sample_time_set,
3452 			"Record the sample timestamps"),
3453 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3454 			"Record the sample period"),
3455 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3456 		    "don't sample"),
3457 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3458 			&record.no_buildid_cache_set,
3459 			"do not update the buildid cache"),
3460 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3461 			&record.no_buildid_set,
3462 			"do not collect buildids in perf.data"),
3463 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3464 		     "monitor event in cgroup name only",
3465 		     parse_cgroups),
3466 	OPT_CALLBACK('D', "delay", &record, "ms",
3467 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3468 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3469 		     record__parse_event_enable_time),
3470 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3471 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3472 		   "user to profile"),
3473 
3474 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3475 		     "branch any", "sample any taken branches",
3476 		     parse_branch_stack),
3477 
3478 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3479 		     "branch filter mask", "branch stack filter modes",
3480 		     parse_branch_stack),
3481 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3482 		    "sample by weight (on special events only)"),
3483 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3484 		    "sample transaction flags (special events only)"),
3485 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3486 		    "use per-thread mmaps"),
3487 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3488 		    "sample selected machine registers on interrupt,"
3489 		    " use '-I?' to list register names", parse_intr_regs),
3490 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3491 		    "sample selected machine registers on interrupt,"
3492 		    " use '--user-regs=?' to list register names", parse_user_regs),
3493 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3494 		    "Record running/enabled time of read (:S) events"),
3495 	OPT_CALLBACK('k', "clockid", &record.opts,
3496 	"clockid", "clockid to use for events, see clock_gettime()",
3497 	parse_clockid),
3498 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3499 			  "opts", "AUX area tracing Snapshot Mode", ""),
3500 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3501 			  "opts", "sample AUX area", ""),
3502 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3503 			"per thread proc mmap processing timeout in ms"),
3504 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3505 		    "Record namespaces events"),
3506 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3507 		    "Record cgroup events"),
3508 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3509 			&record.opts.record_switch_events_set,
3510 			"Record context switch events"),
3511 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3512 			 "Configure all used events to run in kernel space.",
3513 			 PARSE_OPT_EXCLUSIVE),
3514 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3515 			 "Configure all used events to run in user space.",
3516 			 PARSE_OPT_EXCLUSIVE),
3517 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3518 		    "collect kernel callchains"),
3519 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3520 		    "collect user callchains"),
3521 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3522 		   "file", "vmlinux pathname"),
3523 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3524 		    "Record build-id of all DSOs regardless of hits"),
3525 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3526 		    "Record build-id in map events"),
3527 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3528 		    "append timestamp to output filename"),
3529 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3530 		    "Record timestamp boundary (time of first/last samples)"),
3531 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3532 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3533 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3534 			  "signal"),
3535 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3536 			 &record.switch_output_event_set, "switch output event",
3537 			 "switch output event selector. use 'perf list' to list available events",
3538 			 parse_events_option_new_evlist),
3539 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3540 		   "Limit number of switch output generated files"),
3541 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3542 		    "Parse options then exit"),
3543 #ifdef HAVE_AIO_SUPPORT
3544 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3545 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3546 		     record__aio_parse),
3547 #endif
3548 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3549 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3550 		     record__parse_affinity),
3551 #ifdef HAVE_ZSTD_SUPPORT
3552 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3553 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3554 			    record__parse_comp_level),
3555 #endif
3556 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3557 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3558 	OPT_UINTEGER(0, "num-thread-synthesize",
3559 		     &record.opts.nr_threads_synthesize,
3560 		     "number of threads to run for event synthesis"),
3561 #ifdef HAVE_LIBPFM
3562 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3563 		"libpfm4 event selector. use 'perf list' to list available events",
3564 		parse_libpfm_events_option),
3565 #endif
3566 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3567 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3568 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3569 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3570 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3571 		      parse_control_option),
3572 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3573 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3574 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3575 			  &record.debuginfod.set, "debuginfod urls",
3576 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3577 			  "system"),
3578 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3579 			    "write collected trace data into several data files using parallel threads",
3580 			    record__parse_threads),
3581 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3582 	OPT_END()
3583 };
3584 
3585 struct option *record_options = __record_options;
3586 
3587 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3588 {
3589 	struct perf_cpu cpu;
3590 	int idx;
3591 
3592 	if (cpu_map__is_dummy(cpus))
3593 		return 0;
3594 
3595 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3596 		if (cpu.cpu == -1)
3597 			continue;
3598 		/* Return -ENODEV if the input cpu is greater than max cpu */
3599 		if ((unsigned long)cpu.cpu > mask->nbits)
3600 			return -ENODEV;
3601 		__set_bit(cpu.cpu, mask->bits);
3602 	}
3603 
3604 	return 0;
3605 }
3606 
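/* Initialize @mask from a CPU list specification parsed by perf_cpu_map__new(). */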
3607 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3608 {
3609 	struct perf_cpu_map *cpus;
3610 
3611 	cpus = perf_cpu_map__new(mask_spec);
3612 	if (!cpus)
3613 		return -ENOMEM;
3614 
3615 	bitmap_zero(mask->bits, mask->nbits);
3616 	if (record__mmap_cpu_mask_init(mask, cpus)) {
		perf_cpu_map__put(cpus);
3617 		return -ENODEV;
	}
3618 
3619 	perf_cpu_map__put(cpus);
3620 
3621 	return 0;
3622 }
3623 
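/* Release the per-thread maps/affinity masks and the thread_masks array itself. */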
3624 static void record__free_thread_masks(struct record *rec, int nr_threads)
3625 {
3626 	int t;
3627 
3628 	if (rec->thread_masks)
3629 		for (t = 0; t < nr_threads; t++)
3630 			record__thread_mask_free(&rec->thread_masks[t]);
3631 
3632 	zfree(&rec->thread_masks);
3633 }
3634 
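/* Allocate @nr_threads thread masks, each wide enough to hold @nr_bits CPUs. */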
3635 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3636 {
3637 	int t, ret;
3638 
3639 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3640 	if (!rec->thread_masks) {
3641 		pr_err("Failed to allocate thread masks\n");
3642 		return -ENOMEM;
3643 	}
3644 
3645 	for (t = 0; t < nr_threads; t++) {
3646 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3647 		if (ret) {
3648 			pr_err("Failed to allocate thread masks[%d]\n", t);
3649 			goto out_free;
3650 		}
3651 	}
3652 
3653 	return 0;
3654 
3655 out_free:
3656 	record__free_thread_masks(rec, nr_threads);
3657 
3658 	return ret;
3659 }
3660 
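/*
 * THREAD_SPEC__CPU: one data streaming thread per CPU in @cpus, with the
 * maps and affinity masks of each thread covering just that CPU.
 */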
3661 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3662 {
3663 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3664 
3665 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3666 	if (ret)
3667 		return ret;
3668 
3669 	rec->nr_threads = nr_cpus;
3670 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3671 
3672 	for (t = 0; t < rec->nr_threads; t++) {
3673 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3674 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3675 		if (verbose > 0) {
3676 			pr_debug("thread_masks[%d]: ", t);
3677 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3678 			pr_debug("thread_masks[%d]: ", t);
3679 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3680 		}
3681 	}
3682 
3683 	return 0;
3684 }
3685 
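/*
 * Build per-thread masks from parallel arrays of maps/affinity CPU list
 * specs. Each spec is intersected with the recorded CPUs; empty masks and
 * masks overlapping a previously accepted spec are rejected.
 */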
3686 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3687 					  const char **maps_spec, const char **affinity_spec,
3688 					  u32 nr_spec)
3689 {
3690 	u32 s;
3691 	int ret = 0, t = 0;
3692 	struct mmap_cpu_mask cpus_mask;
3693 	struct thread_mask thread_mask, full_mask, *thread_masks;
3694 
3695 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3696 	if (ret) {
3697 		pr_err("Failed to allocate CPUs mask\n");
3698 		return ret;
3699 	}
3700 
3701 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3702 	if (ret) {
3703 		pr_err("Failed to init cpu mask\n");
3704 		goto out_free_cpu_mask;
3705 	}
3706 
3707 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3708 	if (ret) {
3709 		pr_err("Failed to allocate full mask\n");
3710 		goto out_free_cpu_mask;
3711 	}
3712 
3713 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3714 	if (ret) {
3715 		pr_err("Failed to allocate thread mask\n");
3716 		goto out_free_full_and_cpu_masks;
3717 	}
3718 
3719 	for (s = 0; s < nr_spec; s++) {
3720 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3721 		if (ret) {
3722 			pr_err("Failed to initialize maps thread mask\n");
3723 			goto out_free;
3724 		}
3725 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3726 		if (ret) {
3727 			pr_err("Failed to initialize affinity thread mask\n");
3728 			goto out_free;
3729 		}
3730 
3731 		/* ignore invalid CPUs but do not allow empty masks */
3732 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3733 				cpus_mask.bits, thread_mask.maps.nbits)) {
3734 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3735 			ret = -EINVAL;
3736 			goto out_free;
3737 		}
3738 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3739 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3740 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3741 			ret = -EINVAL;
3742 			goto out_free;
3743 		}
3744 
3745 		/* do not allow intersection with other masks (full_mask) */
3746 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3747 				      thread_mask.maps.nbits)) {
3748 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3749 			ret = -EINVAL;
3750 			goto out_free;
3751 		}
3752 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3753 				      thread_mask.affinity.nbits)) {
3754 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3755 			ret = -EINVAL;
3756 			goto out_free;
3757 		}
3758 
3759 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3760 			  thread_mask.maps.bits, full_mask.maps.nbits);
3761 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3762 			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3763 
3764 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3765 		if (!thread_masks) {
3766 			pr_err("Failed to reallocate thread masks\n");
3767 			ret = -ENOMEM;
3768 			goto out_free;
3769 		}
3770 		rec->thread_masks = thread_masks;
3771 		rec->thread_masks[t] = thread_mask;
3772 		if (verbose > 0) {
3773 			pr_debug("thread_masks[%d]: ", t);
3774 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3775 			pr_debug("thread_masks[%d]: ", t);
3776 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3777 		}
3778 		t++;
3779 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3780 		if (ret) {
3781 			pr_err("Failed to allocate thread mask\n");
3782 			goto out_free_full_and_cpu_masks;
3783 		}
3784 	}
3785 	rec->nr_threads = t;
3786 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3787 	if (!rec->nr_threads)
3788 		ret = -EINVAL;
3789 
3790 out_free:
3791 	record__thread_mask_free(&thread_mask);
3792 out_free_full_and_cpu_masks:
3793 	record__thread_mask_free(&full_mask);
3794 out_free_cpu_mask:
3795 	record__mmap_cpu_mask_free(&cpus_mask);
3796 
3797 	return ret;
3798 }
3799 
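/* THREAD_SPEC__CORE: one data streaming thread per core, from the CPU topology. */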
3800 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3801 {
3802 	int ret;
3803 	struct cpu_topology *topo;
3804 
3805 	topo = cpu_topology__new();
3806 	if (!topo) {
3807 		pr_err("Failed to allocate CPU topology\n");
3808 		return -ENOMEM;
3809 	}
3810 
3811 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3812 					     topo->core_cpus_list, topo->core_cpus_lists);
3813 	cpu_topology__delete(topo);
3814 
3815 	return ret;
3816 }
3817 
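/* THREAD_SPEC__PACKAGE: one data streaming thread per processor package. */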
3818 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3819 {
3820 	int ret;
3821 	struct cpu_topology *topo;
3822 
3823 	topo = cpu_topology__new();
3824 	if (!topo) {
3825 		pr_err("Failed to allocate CPU topology\n");
3826 		return -ENOMEM;
3827 	}
3828 
3829 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3830 					     topo->package_cpus_list, topo->package_cpus_lists);
3831 	cpu_topology__delete(topo);
3832 
3833 	return ret;
3834 }
3835 
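/* THREAD_SPEC__NUMA: one data streaming thread per NUMA node. */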
3836 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3837 {
3838 	u32 s;
3839 	int ret;
3840 	const char **spec;
3841 	struct numa_topology *topo;
3842 
3843 	topo = numa_topology__new();
3844 	if (!topo) {
3845 		pr_err("Failed to allocate NUMA topology\n");
3846 		return -ENOMEM;
3847 	}
3848 
3849 	spec = zalloc(topo->nr * sizeof(char *));
3850 	if (!spec) {
3851 		pr_err("Failed to allocate NUMA spec\n");
3852 		ret = -ENOMEM;
3853 		goto out_delete_topo;
3854 	}
3855 	for (s = 0; s < topo->nr; s++)
3856 		spec[s] = topo->nodes[s].cpus;
3857 
3858 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3859 
3860 	zfree(&spec);
3861 
3862 out_delete_topo:
3863 	numa_topology__delete(topo);
3864 
3865 	return ret;
3866 }
3867 
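/*
 * THREAD_SPEC__USER: parse "<maps cpus>/<affinity cpus>" specs separated by
 * ':' into per-thread maps and affinity CPU masks.
 */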
3868 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3869 {
3870 	int t, ret;
3871 	u32 s, nr_spec = 0;
3872 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3873 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3874 
3875 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3876 		spec = strtok_r(user_spec, ":", &spec_ptr);
3877 		if (spec == NULL)
3878 			break;
3879 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3880 		mask = strtok_r(spec, "/", &mask_ptr);
3881 		if (mask == NULL)
3882 			break;
3883 		pr_debug2("  maps mask: %s\n", mask);
3884 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3885 		if (!tmp_spec) {
3886 			pr_err("Failed to reallocate maps spec\n");
3887 			ret = -ENOMEM;
3888 			goto out_free;
3889 		}
3890 		maps_spec = tmp_spec;
3891 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3892 		if (!maps_spec[nr_spec]) {
3893 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3894 			ret = -ENOMEM;
3895 			goto out_free;
3896 		}
3897 		mask = strtok_r(NULL, "/", &mask_ptr);
3898 		if (mask == NULL) {
3899 			pr_err("Invalid thread maps or affinity specs\n");
3900 			ret = -EINVAL;
3901 			goto out_free;
3902 		}
3903 		pr_debug2("  affinity mask: %s\n", mask);
3904 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3905 		if (!tmp_spec) {
3906 			pr_err("Failed to reallocate affinity spec\n");
3907 			ret = -ENOMEM;
3908 			goto out_free;
3909 		}
3910 		affinity_spec = tmp_spec;
3911 		affinity_spec[nr_spec] = strdup(mask);
3912 		if (!affinity_spec[nr_spec]) {
3913 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3914 			ret = -ENOMEM;
3915 			goto out_free;
3916 		}
3917 		dup_mask = NULL;
3918 		nr_spec++;
3919 	}
3920 
3921 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3922 					     (const char **)affinity_spec, nr_spec);
3923 
3924 out_free:
3925 	free(dup_mask);
3926 	for (s = 0; s < nr_spec; s++) {
3927 		if (maps_spec)
3928 			free(maps_spec[s]);
3929 		if (affinity_spec)
3930 			free(affinity_spec[s]);
3931 	}
3932 	free(affinity_spec);
3933 	free(maps_spec);
3934 
3935 	return ret;
3936 }
3937 
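/* Default (non-parallel) mode: a single thread covering all recorded CPUs. */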
3938 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3939 {
3940 	int ret;
3941 
3942 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3943 	if (ret)
3944 		return ret;
3945 
3946 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3947 		return -ENODEV;
3948 
3949 	rec->nr_threads = 1;
3950 
3951 	return 0;
3952 }
3953 
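/*
 * Set up the data streaming thread masks according to the requested thread
 * spec. Parallel streaming is incompatible with --per-thread mode.
 */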
3954 static int record__init_thread_masks(struct record *rec)
3955 {
3956 	int ret = 0;
3957 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3958 
3959 	if (!record__threads_enabled(rec))
3960 		return record__init_thread_default_masks(rec, cpus);
3961 
3962 	if (evlist__per_thread(rec->evlist)) {
3963 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3964 		return -EINVAL;
3965 	}
3966 
3967 	switch (rec->opts.threads_spec) {
3968 	case THREAD_SPEC__CPU:
3969 		ret = record__init_thread_cpu_masks(rec, cpus);
3970 		break;
3971 	case THREAD_SPEC__CORE:
3972 		ret = record__init_thread_core_masks(rec, cpus);
3973 		break;
3974 	case THREAD_SPEC__PACKAGE:
3975 		ret = record__init_thread_package_masks(rec, cpus);
3976 		break;
3977 	case THREAD_SPEC__NUMA:
3978 		ret = record__init_thread_numa_masks(rec, cpus);
3979 		break;
3980 	case THREAD_SPEC__USER:
3981 		ret = record__init_thread_user_masks(rec, cpus);
3982 		break;
3983 	default:
3984 		break;
3985 	}
3986 
3987 	return ret;
3988 }
3989 
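/*
 * Entry point of 'perf record': parse and validate options, configure the
 * evlist, auxtrace and thread masks, then run the session via __cmd_record().
 */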
3990 int cmd_record(int argc, const char **argv)
3991 {
3992 	int err;
3993 	struct record *rec = &record;
3994 	char errbuf[BUFSIZ];
3995 
3996 	setlocale(LC_ALL, "");
3997 
3998 #ifndef HAVE_BPF_SKEL
3999 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4000 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4001 # undef set_nobuild
4002 #endif
4003 
4004 	/* Disable eager loading of kernel symbols, which adds overhead to perf record. */
4005 	symbol_conf.lazy_load_kernel_maps = true;
4006 	rec->opts.affinity = PERF_AFFINITY_SYS;
4007 
4008 	rec->evlist = evlist__new();
4009 	if (rec->evlist == NULL)
4010 		return -ENOMEM;
4011 
4012 	err = perf_config(perf_record_config, rec);
4013 	if (err)
4014 		return err;
4015 
4016 	argc = parse_options(argc, argv, record_options, record_usage,
4017 			    PARSE_OPT_STOP_AT_NON_OPTION);
4018 	if (quiet)
4019 		perf_quiet_option();
4020 
4021 	err = symbol__validate_sym_arguments();
4022 	if (err)
4023 		return err;
4024 
4025 	perf_debuginfod_setup(&record.debuginfod);
4026 
4027 	/* Make system wide (-a) the default target. */
4028 	if (!argc && target__none(&rec->opts.target))
4029 		rec->opts.target.system_wide = true;
4030 
4031 	if (nr_cgroups && !rec->opts.target.system_wide) {
4032 		usage_with_options_msg(record_usage, record_options,
4033 			"cgroup monitoring only available in system-wide mode");
4035 	}
4036 
4037 	if (rec->buildid_mmap) {
4038 		if (!perf_can_record_build_id()) {
4039 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4040 			err = -EINVAL;
4041 			goto out_opts;
4042 		}
4043 		pr_debug("Enabling build id in mmap2 events.\n");
4044 		/* Enable mmap build id synthesizing. */
4045 		symbol_conf.buildid_mmap2 = true;
4046 		/* Enable perf_event_attr::build_id bit. */
4047 		rec->opts.build_id = true;
4048 		/* Disable build id cache. */
4049 		rec->no_buildid = true;
4050 	}
4051 
4052 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4053 		pr_err("Kernel has no cgroup sampling support.\n");
4054 		err = -EINVAL;
4055 		goto out_opts;
4056 	}
4057 
4058 	if (rec->opts.kcore)
4059 		rec->opts.text_poke = true;
4060 
4061 	if (rec->opts.kcore || record__threads_enabled(rec))
4062 		rec->data.is_dir = true;
4063 
4064 	if (record__threads_enabled(rec)) {
4065 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4066 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4067 			goto out_opts;
4068 		}
4069 		if (record__aio_enabled(rec)) {
4070 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4071 			goto out_opts;
4072 		}
4073 	}
4074 
4075 	if (rec->opts.comp_level != 0) {
4076 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4077 		rec->no_buildid = true;
4078 	}
4079 
4080 	if (rec->opts.record_switch_events &&
4081 	    !perf_can_record_switch_events()) {
4082 		ui__error("kernel does not support recording context switch events\n");
4083 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4084 		err = -EINVAL;
4085 		goto out_opts;
4086 	}
4087 
4088 	if (switch_output_setup(rec)) {
4089 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4090 		err = -EINVAL;
4091 		goto out_opts;
4092 	}
4093 
4094 	if (rec->switch_output.time) {
4095 		signal(SIGALRM, alarm_sig_handler);
4096 		alarm(rec->switch_output.time);
4097 	}
4098 
4099 	if (rec->switch_output.num_files) {
4100 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4101 						      sizeof(char *));
4102 		if (!rec->switch_output.filenames) {
4103 			err = -ENOMEM;
4104 			goto out_opts;
4105 		}
4106 	}
4107 
4108 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4109 		rec->timestamp_filename = false;
4110 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4111 	}
4112 
4113 	/*
4114 	 * Allow aliases to facilitate the lookup of symbols for address
4115 	 * filters. Refer to auxtrace_parse_filters().
4116 	 */
4117 	symbol_conf.allow_aliases = true;
4118 
4119 	symbol__init(NULL);
4120 
4121 	err = record__auxtrace_init(rec);
4122 	if (err)
4123 		goto out;
4124 
4125 	if (dry_run)
4126 		goto out;
4127 
4128 	err = -ENOMEM;
4129 
4130 	if (rec->no_buildid_cache || rec->no_buildid) {
4131 		disable_buildid_cache();
4132 	} else if (rec->switch_output.enabled) {
4133 		/*
4134 		 * In 'perf record --switch-output', disable buildid
4135 		 * generation by default to reduce data file switching
4136 		 * overhead. Still generate buildids if they are explicitly
4137 		 * required using
4138 		 *
4139 		 *  perf record --switch-output --no-no-buildid \
4140 		 *              --no-no-buildid-cache
4141 		 *
4142 		 * The following code is equivalent to:
4143 		 *
4144 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4145 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4146 		 *         disable_buildid_cache();
4147 		 */
4148 		bool disable = true;
4149 
4150 		if (rec->no_buildid_set && !rec->no_buildid)
4151 			disable = false;
4152 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4153 			disable = false;
4154 		if (disable) {
4155 			rec->no_buildid = true;
4156 			rec->no_buildid_cache = true;
4157 			disable_buildid_cache();
4158 		}
4159 	}
4160 
4161 	if (record.opts.overwrite)
4162 		record.opts.tail_synthesize = true;
4163 
4164 	if (rec->evlist->core.nr_entries == 0) {
4165 		bool can_profile_kernel = perf_event_paranoid_check(1);
4166 
4167 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4168 		if (err)
4169 			goto out;
4170 	}
4171 
4172 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4173 		rec->opts.no_inherit = true;
4174 
4175 	err = target__validate(&rec->opts.target);
4176 	if (err) {
4177 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4178 		ui__warning("%s\n", errbuf);
4179 	}
4180 
4181 	err = target__parse_uid(&rec->opts.target);
4182 	if (err) {
4183 		int saved_errno = errno;
4184 
4185 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4186 		ui__error("%s", errbuf);
4187 
4188 		err = -saved_errno;
4189 		goto out;
4190 	}
4191 
4192 	/* Enable ignoring missing threads when -u/-p option is defined. */
4193 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4194 
4195 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4196 
4197 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4198 		arch__add_leaf_frame_record_opts(&rec->opts);
4199 
4200 	err = -ENOMEM;
4201 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4202 		if (rec->opts.target.pid != NULL) {
4203 			pr_err("Couldn't create thread/CPU maps: %s\n",
4204 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4205 			goto out;
4206 		}
4207 		else
4208 			usage_with_options(record_usage, record_options);
4209 	}
4210 
4211 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4212 	if (err)
4213 		goto out;
4214 
4215 	/*
4216 	 * We take all buildids when the file contains
4217 	 * AUX area tracing data, because we do not decode
4218 	 * the trace: decoding would take too long.
4219 	 */
4220 	if (rec->opts.full_auxtrace)
4221 		rec->buildid_all = true;
4222 
4223 	if (rec->opts.text_poke) {
4224 		err = record__config_text_poke(rec->evlist);
4225 		if (err) {
4226 			pr_err("record__config_text_poke failed, error %d\n", err);
4227 			goto out;
4228 		}
4229 	}
4230 
4231 	if (rec->off_cpu) {
4232 		err = record__config_off_cpu(rec);
4233 		if (err) {
4234 			pr_err("record__config_off_cpu failed, error %d\n", err);
4235 			goto out;
4236 		}
4237 	}
4238 
4239 	if (record_opts__config(&rec->opts)) {
4240 		err = -EINVAL;
4241 		goto out;
4242 	}
4243 
4244 	err = record__config_tracking_events(rec);
4245 	if (err) {
4246 		pr_err("record__config_tracking_events failed, error %d\n", err);
4247 		goto out;
4248 	}
4249 
4250 	err = record__init_thread_masks(rec);
4251 	if (err) {
4252 		pr_err("Failed to initialize parallel data streaming masks\n");
4253 		goto out;
4254 	}
4255 
4256 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4257 		rec->opts.nr_cblocks = nr_cblocks_max;
4258 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4259 
4260 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4261 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4262 
4263 	if (rec->opts.comp_level > comp_level_max)
4264 		rec->opts.comp_level = comp_level_max;
4265 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4266 
4267 	err = __cmd_record(&record, argc, argv);
4268 out:
4269 	evlist__delete(rec->evlist);
4270 	symbol__exit();
4271 	auxtrace_record__free(rec->itr);
4272 out_opts:
4273 	record__free_thread_masks(rec, rec->nr_threads);
4274 	rec->nr_threads = 0;
4275 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4276 	return err;
4277 }
4278 
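/*
 * Signal handler: trigger an AUX area tracing snapshot and, when
 * --switch-output=signal is used, an output file switch.
 */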
4279 static void snapshot_sig_handler(int sig __maybe_unused)
4280 {
4281 	struct record *rec = &record;
4282 
4283 	hit_auxtrace_snapshot_trigger(rec);
4284 
4285 	if (switch_output_signal(rec))
4286 		trigger_hit(&switch_output_trigger);
4287 }
4288 
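/* SIGALRM handler: trigger an output file switch when --switch-output=time is used. */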
4289 static void alarm_sig_handler(int sig __maybe_unused)
4290 {
4291 	struct record *rec = &record;
4292 
4293 	if (switch_output_time(rec))
4294 		trigger_hit(&switch_output_trigger);
4295 }
4296