xref: /linux/tools/perf/builtin-record.c (revision 0e4cac557531a4c93de108d9ff11329fcad482ff)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/*
 * State of the output switching feature, consulted by
 * switch_output_signal(), switch_output_size() and switch_output_time().
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* set: switch_output_signal() reports true once the trigger is ready */
	unsigned long	 size;		/* non-zero: threshold compared against record::bytes_written */
	unsigned long	 time;		/* non-zero: enables switch_output_time(); unit not visible here */
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
95 
/* Pair of CPU masks referenced by a record_thread through its 'mask' pointer. */
struct thread_mask {
	struct mmap_cpu_mask	maps;		/* bits tested in record__thread_data_init_maps() to select mmaps */
	struct mmap_cpu_mask	affinity;
};
100 
/*
 * Per-thread recording state; an array of these is kept in
 * record::thread_data. record__write() and record__pushfn() update the
 * counters through the thread-local 'thread' pointer.
 */
struct record_thread {
	pid_t			tid;
	struct thread_mask	*mask;		/* points at an entry of record::thread_masks */
	struct {
		int		msg[2];		/* pipe() descriptor pair, -1 when not open */
		int		ack[2];		/* pipe() descriptor pair, -1 when not open */
	} pipes;
	struct fdarray		pollfd;
	int			ctlfd_pos;
	int			nr_mmaps;	/* number of slots in maps[] and overwrite_maps[] */
	struct mmap		**maps;
	struct mmap		**overwrite_maps;
	struct record		*rec;
	unsigned long long	samples;	/* bumped once per record__pushfn() call */
	unsigned long		waking;
	u64			bytes_written;	/* bytes written by record__write() to per-map files */
	u64			bytes_transferred;
	u64			bytes_compressed;
};
120 
/* Thread-local pointer to the record_thread whose counters the write path updates. */
static __thread struct record_thread *thread;
122 
/* Message identifiers; THREAD_MSG__MAX is the number of values and sizes thread_msg_tags[]. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};
128 
/* Printable tag for each enum thread_msg value, indexed by the enum value. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	"UNDEFINED", "READY"
};
132 
/* Kinds of threads specification; THREAD_SPEC__MAX sizes thread_spec_tags[]. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};
142 
/* Lower-case tag for each enum thread_spec value, indexed by the enum value. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	"undefined", "cpu", "core", "package", "numa", "user"
};
146 
/*
 * Associates one entry of the evlist pollfd array with its duplicate in a
 * thread's pollfd array; see record__map_thread_evlist_pollfd_indexes().
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
151 
/* Top-level state of one 'perf record' invocation. */
struct record {
	struct perf_tool	tool;		/* embedded; callbacks recover 'struct record' via container_of() */
	struct record_opts	opts;
	u64			bytes_written;	/* bytes record__write() sent to the session file */
	u64			thread_bytes_written;	/* bytes record__write() sent to per-map files */
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	int			nr_threads;	/* number of entries in thread_data[] */
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;
	struct pollfd_index_map	*index_map;	/* grown by record__map_thread_evlist_pollfd_indexes() */
	size_t			index_map_sz;
	size_t			index_map_cnt;	/* number of index_map[] entries in use */
};
185 
/* Set to 1 by sig_handler() and by record__write() once the output size limit is hit. */
static volatile int done;

/* Non-zero tells record__auxtrace_snapshot_exit() that a snapshot was already started. */
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

/* Printable tags, one per affinity mode (PERF_AFFINITY_MAX entries). */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
195 
#ifndef HAVE_GETTID
/* Fallback used when HAVE_GETTID is not defined: issue the gettid system call directly. */
static inline pid_t gettid(void)
{
	return (pid_t)syscall(__NR_gettid);
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
/* Forward declarations: the aio code below uses these before they are defined. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off, retrying for as long as aio_write() fails
 * with EAGAIN.
 *
 * On any other failure the control block is marked free (aio_fildes = -1)
 * and an error is logged.
 *
 * Returns the result of the last aio_write() call (0 on success).
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * reminder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
/*
 * Wait for aio write requests started on @md.
 *
 * @md:       mmap whose control blocks are scanned
 * @sync_all: false - return the index of the first control block that is
 *            free (aio_fildes == -1) or whose request just completed;
 *            true  - keep waiting until no request is pending
 *
 * Each scan collects the still pending control blocks into md->aio.aiocb[]
 * and, if there are any, sleeps in aio_suspend() (1ms timeout) before
 * scanning again.
 *
 * Returns the index of a usable control block when @sync_all is false, or
 * -1 when a scan finds nothing pending and has not returned an index.
 */
static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		/* EAGAIN (timeout) and EINTR are expected; anything else is logged and retried */
		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
382 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer the pushed data is copied or compressed into */
	size_t		size;	/* bytes stored in data so far */
};
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the reminder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 				     mmap__mmap_len(map) - aio->size,
410 				     buf, size);
411 	} else {
412 		memcpy(aio->data + aio->size, buf, size);
413 	}
414 
415 	if (!aio->size) {
416 		/*
417 		 * Increment map->refcount to guard map->aio.data[] buffer
418 		 * from premature deallocation because map object can be
419 		 * released earlier than aio write request started on
420 		 * map->aio.data[] buffer is complete.
421 		 *
422 		 * perf_mmap__put() is done at record__aio_complete()
423 		 * after started aio request completion or at record__aio_push()
424 		 * if the request failed to start.
425 		 */
426 		perf_mmap__get(&map->core);
427 	}
428 
429 	aio->size += size;
430 
431 	return size;
432 }
433 
434 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
435 {
436 	int ret, idx;
437 	int trace_fd = rec->session->data->file.fd;
438 	struct record_aio aio = { .rec = rec, .size = 0 };
439 
440 	/*
441 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
442 	 * becomes available after previous aio write operation.
443 	 */
444 
445 	idx = record__aio_sync(map, false);
446 	aio.data = map->aio.data[idx];
447 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
448 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
449 		return ret;
450 
451 	rec->samples++;
452 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
453 	if (!ret) {
454 		*off += aio.size;
455 		rec->bytes_written += aio.size;
456 		if (switch_output_size(rec))
457 			trigger_hit(&switch_output_trigger);
458 	} else {
459 		/*
460 		 * Decrement map->refcount incremented in record__aio_pushfn()
461 		 * back if record__aio_write() operation failed to start, otherwise
462 		 * map->refcount is decremented in record__aio_complete() after
463 		 * aio write operation finishes successfully.
464 		 */
465 		perf_mmap__put(&map->core);
466 	}
467 
468 	return ret;
469 }
470 
/* Return the current file offset of @trace_fd, or -1 if lseek() fails. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
475 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
480 
481 static void record__aio_mmap_read_sync(struct record *rec)
482 {
483 	int i;
484 	struct evlist *evlist = rec->evlist;
485 	struct mmap *maps = evlist->mmap;
486 
487 	if (!record__aio_enabled(rec))
488 		return;
489 
490 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
491 		struct mmap *map = &maps[i];
492 
493 		if (map->core.base)
494 			record__aio_sync(map, true);
495 	}
496 }
497 
/* Number of aio control blocks used when the option value is missing or 0. */
static int nr_cblocks_default = 1;
/* Upper bound for the number of aio control blocks (enforced outside this chunk — TODO confirm). */
static int nr_cblocks_max = 4;
500 
501 static int record__aio_parse(const struct option *opt,
502 			     const char *str,
503 			     int unset)
504 {
505 	struct record_opts *opts = (struct record_opts *)opt->value;
506 
507 	if (unset) {
508 		opts->nr_cblocks = 0;
509 	} else {
510 		if (str)
511 			opts->nr_cblocks = strtol(str, NULL, 0);
512 		if (!opts->nr_cblocks)
513 			opts->nr_cblocks = nr_cblocks_default;
514 	}
515 
516 	return 0;
517 }
518 #else /* HAVE_AIO_SUPPORT */
/* Without HAVE_AIO_SUPPORT no aio control blocks are available. */
static int nr_cblocks_max = 0;
520 
/* Stub for builds without HAVE_AIO_SUPPORT: always fails with -1. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}
526 
/* Stub for builds without HAVE_AIO_SUPPORT: always returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}
531 
/* Stub for builds without HAVE_AIO_SUPPORT: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}
535 
/* Stub for builds without HAVE_AIO_SUPPORT: does nothing. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
539 #endif
540 
541 static int record__aio_enabled(struct record *rec)
542 {
543 	return rec->opts.nr_cblocks > 0;
544 }
545 
546 #define MMAP_FLUSH_DEFAULT 1
547 static int record__mmap_flush_parse(const struct option *opt,
548 				    const char *str,
549 				    int unset)
550 {
551 	int flush_max;
552 	struct record_opts *opts = (struct record_opts *)opt->value;
553 	static struct parse_tag tags[] = {
554 			{ .tag  = 'B', .mult = 1       },
555 			{ .tag  = 'K', .mult = 1 << 10 },
556 			{ .tag  = 'M', .mult = 1 << 20 },
557 			{ .tag  = 'G', .mult = 1 << 30 },
558 			{ .tag  = 0 },
559 	};
560 
561 	if (unset)
562 		return 0;
563 
564 	if (str) {
565 		opts->mmap_flush = parse_tag_value(str, tags);
566 		if (opts->mmap_flush == (int)-1)
567 			opts->mmap_flush = strtol(str, NULL, 0);
568 	}
569 
570 	if (!opts->mmap_flush)
571 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
572 
573 	flush_max = evlist__mmap_size(opts->mmap_pages);
574 	flush_max /= 4;
575 	if (opts->mmap_flush > flush_max)
576 		opts->mmap_flush = flush_max;
577 
578 	return 0;
579 }
580 
581 #ifdef HAVE_ZSTD_SUPPORT
/* Compression level used when the option value is missing or 0. */
static unsigned int comp_level_default = 1;
583 
584 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
585 {
586 	struct record_opts *opts = opt->value;
587 
588 	if (unset) {
589 		opts->comp_level = 0;
590 	} else {
591 		if (str)
592 			opts->comp_level = strtol(str, NULL, 0);
593 		if (!opts->comp_level)
594 			opts->comp_level = comp_level_default;
595 	}
596 
597 	return 0;
598 }
599 #endif
/* Highest compression level; presumably enforced where options are validated — TODO confirm. */
static unsigned int comp_level_max = 22;
601 
602 static int record__comp_enabled(struct record *rec)
603 {
604 	return rec->opts.comp_level > 0;
605 }
606 
607 static int process_synthesized_event(struct perf_tool *tool,
608 				     union perf_event *event,
609 				     struct perf_sample *sample __maybe_unused,
610 				     struct machine *machine __maybe_unused)
611 {
612 	struct record *rec = container_of(tool, struct record, tool);
613 	return record__write(rec, NULL, event, event->header.size);
614 }
615 
/* Held by process_locked_synthesized_event() around process_synthesized_event(). */
static struct mutex synth_lock;
617 
618 static int process_locked_synthesized_event(struct perf_tool *tool,
619 				     union perf_event *event,
620 				     struct perf_sample *sample __maybe_unused,
621 				     struct machine *machine __maybe_unused)
622 {
623 	int ret;
624 
625 	mutex_lock(&synth_lock);
626 	ret = process_synthesized_event(tool, event, sample, machine);
627 	mutex_unlock(&synth_lock);
628 	return ret;
629 }
630 
631 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
632 {
633 	struct record *rec = to;
634 
635 	if (record__comp_enabled(rec)) {
636 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
637 		bf   = map->data;
638 	}
639 
640 	thread->samples++;
641 	return record__write(rec, map, bf, size);
642 }
643 
/* Number of the last non-SIGCHLD signal seen by sig_handler(), -1 if none. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Descriptor sig_handler() writes to in order to wake up poll(); -1 when unset. */
static volatile sig_atomic_t done_fd = -1;
#endif
649 
/*
 * Signal handler: note SIGCHLD in child_finished or remember any other
 * signal number in signr, set 'done', and, when done_fd is valid, write to
 * it so that a concurrent poll() wakes up. errno is preserved across the
 * write.
 *
 * NOTE(review): pr_err() is called from signal context on a failed write;
 * confirm that is acceptable here.
 */
static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
#ifdef HAVE_EVENTFD_SUPPORT
	if (done_fd >= 0) {
		u64 tmp = 1;
		int orig_errno = errno;

		/*
		 * It is possible for this signal handler to run after done is
		 * checked in the main loop, but before the perf counter fds are
		 * polled. If this happens, the poll() will continue to wait
		 * even though done is set, and will only break out if either
		 * another signal is received, or the counters are ready for
		 * read. To ensure the poll() doesn't sleep when done is set,
		 * use an eventfd (done_fd) to wake up the poll().
		 */
		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
			pr_err("failed to signal wakeup fd, error: %m\n");

		errno = orig_errno;
	}
#endif // HAVE_EVENTFD_SUPPORT
}
679 
/* Handler that calls perf_hooks__recover() and then sighandler_dump_stack() for @sig. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
685 
686 static void record__sig_exit(void)
687 {
688 	if (signr == -1)
689 		return;
690 
691 	signal(signr, SIG_DFL);
692 	raise(signr);
693 }
694 
695 #ifdef HAVE_AUXTRACE_SUPPORT
696 
697 static int record__process_auxtrace(struct perf_tool *tool,
698 				    struct mmap *map,
699 				    union perf_event *event, void *data1,
700 				    size_t len1, void *data2, size_t len2)
701 {
702 	struct record *rec = container_of(tool, struct record, tool);
703 	struct perf_data *data = &rec->data;
704 	size_t padding;
705 	u8 pad[8] = {0};
706 
707 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
708 		off_t file_offset;
709 		int fd = perf_data__fd(data);
710 		int err;
711 
712 		file_offset = lseek(fd, 0, SEEK_CUR);
713 		if (file_offset == -1)
714 			return -1;
715 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
716 						     event, file_offset);
717 		if (err)
718 			return err;
719 	}
720 
721 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
722 	padding = (len1 + len2) & 7;
723 	if (padding)
724 		padding = 8 - padding;
725 
726 	record__write(rec, map, event, event->header.size);
727 	record__write(rec, map, data1, len1);
728 	if (len2)
729 		record__write(rec, map, data2, len2);
730 	record__write(rec, map, &pad, padding);
731 
732 	return 0;
733 }
734 
735 static int record__auxtrace_mmap_read(struct record *rec,
736 				      struct mmap *map)
737 {
738 	int ret;
739 
740 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
741 				  record__process_auxtrace);
742 	if (ret < 0)
743 		return ret;
744 
745 	if (ret)
746 		rec->samples++;
747 
748 	return 0;
749 }
750 
751 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
752 					       struct mmap *map)
753 {
754 	int ret;
755 
756 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
757 					   record__process_auxtrace,
758 					   rec->opts.auxtrace_snapshot_size);
759 	if (ret < 0)
760 		return ret;
761 
762 	if (ret)
763 		rec->samples++;
764 
765 	return 0;
766 }
767 
768 static int record__auxtrace_read_snapshot_all(struct record *rec)
769 {
770 	int i;
771 	int rc = 0;
772 
773 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
774 		struct mmap *map = &rec->evlist->mmap[i];
775 
776 		if (!map->auxtrace_mmap.base)
777 			continue;
778 
779 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
780 			rc = -1;
781 			goto out;
782 		}
783 	}
784 out:
785 	return rc;
786 }
787 
788 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
789 {
790 	pr_debug("Recording AUX area tracing snapshot\n");
791 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
792 		trigger_error(&auxtrace_snapshot_trigger);
793 	} else {
794 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
795 			trigger_error(&auxtrace_snapshot_trigger);
796 		else
797 			trigger_ready(&auxtrace_snapshot_trigger);
798 	}
799 }
800 
801 static int record__auxtrace_snapshot_exit(struct record *rec)
802 {
803 	if (trigger_is_error(&auxtrace_snapshot_trigger))
804 		return 0;
805 
806 	if (!auxtrace_record__snapshot_started &&
807 	    auxtrace_record__snapshot_start(rec->itr))
808 		return -1;
809 
810 	record__read_auxtrace_snapshot(rec, true);
811 	if (trigger_is_error(&auxtrace_snapshot_trigger))
812 		return -1;
813 
814 	return 0;
815 }
816 
817 static int record__auxtrace_init(struct record *rec)
818 {
819 	int err;
820 
821 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
822 	    && record__threads_enabled(rec)) {
823 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
824 		return -EINVAL;
825 	}
826 
827 	if (!rec->itr) {
828 		rec->itr = auxtrace_record__init(rec->evlist, &err);
829 		if (err)
830 			return err;
831 	}
832 
833 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
834 					      rec->opts.auxtrace_snapshot_opts);
835 	if (err)
836 		return err;
837 
838 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
839 					    rec->opts.auxtrace_sample_opts);
840 	if (err)
841 		return err;
842 
843 	auxtrace_regroup_aux_output(rec->evlist);
844 
845 	return auxtrace_parse_filters(rec->evlist);
846 }
847 
848 #else
849 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: nothing to read, returns 0. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}
856 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: does nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}
862 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: returns 0. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
868 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: returns 0. */
static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}
874 
/* Stub for builds without HAVE_AUXTRACE_SUPPORT: nothing to set up, returns 0. */
static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}
879 
880 #endif
881 
882 static int record__config_text_poke(struct evlist *evlist)
883 {
884 	struct evsel *evsel;
885 
886 	/* Nothing to do if text poke is already configured */
887 	evlist__for_each_entry(evlist, evsel) {
888 		if (evsel->core.attr.text_poke)
889 			return 0;
890 	}
891 
892 	evsel = evlist__add_dummy_on_all_cpus(evlist);
893 	if (!evsel)
894 		return -ENOMEM;
895 
896 	evsel->core.attr.text_poke = 1;
897 	evsel->core.attr.ksymbol = 1;
898 	evsel->immediate = true;
899 	evsel__set_sample_bit(evsel, TIME);
900 
901 	return 0;
902 }
903 
904 static int record__config_off_cpu(struct record *rec)
905 {
906 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
907 }
908 
909 static bool record__kcore_readable(struct machine *machine)
910 {
911 	char kcore[PATH_MAX];
912 	int fd;
913 
914 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
915 
916 	fd = open(kcore, O_RDONLY);
917 	if (fd < 0)
918 		return false;
919 
920 	close(fd);
921 
922 	return true;
923 }
924 
925 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
926 {
927 	char from_dir[PATH_MAX];
928 	char kcore_dir[PATH_MAX];
929 	int ret;
930 
931 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
932 
933 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
934 	if (ret)
935 		return ret;
936 
937 	return kcore_copy(from_dir, kcore_dir);
938 }
939 
940 static void record__thread_data_init_pipes(struct record_thread *thread_data)
941 {
942 	thread_data->pipes.msg[0] = -1;
943 	thread_data->pipes.msg[1] = -1;
944 	thread_data->pipes.ack[0] = -1;
945 	thread_data->pipes.ack[1] = -1;
946 }
947 
948 static int record__thread_data_open_pipes(struct record_thread *thread_data)
949 {
950 	if (pipe(thread_data->pipes.msg))
951 		return -EINVAL;
952 
953 	if (pipe(thread_data->pipes.ack)) {
954 		close(thread_data->pipes.msg[0]);
955 		thread_data->pipes.msg[0] = -1;
956 		close(thread_data->pipes.msg[1]);
957 		thread_data->pipes.msg[1] = -1;
958 		return -EINVAL;
959 	}
960 
961 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
962 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
963 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
964 
965 	return 0;
966 }
967 
968 static void record__thread_data_close_pipes(struct record_thread *thread_data)
969 {
970 	if (thread_data->pipes.msg[0] != -1) {
971 		close(thread_data->pipes.msg[0]);
972 		thread_data->pipes.msg[0] = -1;
973 	}
974 	if (thread_data->pipes.msg[1] != -1) {
975 		close(thread_data->pipes.msg[1]);
976 		thread_data->pipes.msg[1] = -1;
977 	}
978 	if (thread_data->pipes.ack[0] != -1) {
979 		close(thread_data->pipes.ack[0]);
980 		thread_data->pipes.ack[0] = -1;
981 	}
982 	if (thread_data->pipes.ack[1] != -1) {
983 		close(thread_data->pipes.ack[1]);
984 		thread_data->pipes.ack[1] = -1;
985 	}
986 }
987 
988 static bool evlist__per_thread(struct evlist *evlist)
989 {
990 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
991 }
992 
993 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
994 {
995 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
996 	struct mmap *mmap = evlist->mmap;
997 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
998 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
999 	bool per_thread = evlist__per_thread(evlist);
1000 
1001 	if (per_thread)
1002 		thread_data->nr_mmaps = nr_mmaps;
1003 	else
1004 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1005 						      thread_data->mask->maps.nbits);
1006 	if (mmap) {
1007 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1008 		if (!thread_data->maps)
1009 			return -ENOMEM;
1010 	}
1011 	if (overwrite_mmap) {
1012 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 		if (!thread_data->overwrite_maps) {
1014 			zfree(&thread_data->maps);
1015 			return -ENOMEM;
1016 		}
1017 	}
1018 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1019 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1020 
1021 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1022 		if (per_thread ||
1023 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1024 			if (thread_data->maps) {
1025 				thread_data->maps[tm] = &mmap[m];
1026 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1027 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1028 			}
1029 			if (thread_data->overwrite_maps) {
1030 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1031 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1032 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1033 			}
1034 			tm++;
1035 		}
1036 	}
1037 
1038 	return 0;
1039 }
1040 
1041 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1042 {
1043 	int f, tm, pos;
1044 	struct mmap *map, *overwrite_map;
1045 
1046 	fdarray__init(&thread_data->pollfd, 64);
1047 
1048 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1049 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1050 		overwrite_map = thread_data->overwrite_maps ?
1051 				thread_data->overwrite_maps[tm] : NULL;
1052 
1053 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1054 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1055 
1056 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1057 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1058 							      &evlist->core.pollfd);
1059 				if (pos < 0)
1060 					return pos;
1061 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1062 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1063 			}
1064 		}
1065 	}
1066 
1067 	return 0;
1068 }
1069 
1070 static void record__free_thread_data(struct record *rec)
1071 {
1072 	int t;
1073 	struct record_thread *thread_data = rec->thread_data;
1074 
1075 	if (thread_data == NULL)
1076 		return;
1077 
1078 	for (t = 0; t < rec->nr_threads; t++) {
1079 		record__thread_data_close_pipes(&thread_data[t]);
1080 		zfree(&thread_data[t].maps);
1081 		zfree(&thread_data[t].overwrite_maps);
1082 		fdarray__exit(&thread_data[t].pollfd);
1083 	}
1084 
1085 	zfree(&rec->thread_data);
1086 }
1087 
1088 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1089 						    int evlist_pollfd_index,
1090 						    int thread_pollfd_index)
1091 {
1092 	size_t x = rec->index_map_cnt;
1093 
1094 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1095 		return -ENOMEM;
1096 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1097 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1098 	rec->index_map_cnt += 1;
1099 	return 0;
1100 }
1101 
1102 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1103 						    struct evlist *evlist,
1104 						    struct record_thread *thread_data)
1105 {
1106 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1107 	struct pollfd *t_entries = thread_data->pollfd.entries;
1108 	int err = 0;
1109 	size_t i;
1110 
1111 	for (i = 0; i < rec->index_map_cnt; i++) {
1112 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1113 		int t_pos = rec->index_map[i].thread_pollfd_index;
1114 
1115 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1116 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1117 			pr_err("Thread and evlist pollfd index mismatch\n");
1118 			err = -EINVAL;
1119 			continue;
1120 		}
1121 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1122 	}
1123 	return err;
1124 }
1125 
1126 static int record__dup_non_perf_events(struct record *rec,
1127 				       struct evlist *evlist,
1128 				       struct record_thread *thread_data)
1129 {
1130 	struct fdarray *fda = &evlist->core.pollfd;
1131 	int i, ret;
1132 
1133 	for (i = 0; i < fda->nr; i++) {
1134 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1135 			continue;
1136 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1137 		if (ret < 0) {
1138 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1139 			return ret;
1140 		}
1141 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1142 			  thread_data, ret, fda->entries[i].fd);
1143 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1144 		if (ret < 0) {
1145 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1146 			return ret;
1147 		}
1148 	}
1149 	return 0;
1150 }
1151 
1152 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1153 {
1154 	int t, ret;
1155 	struct record_thread *thread_data;
1156 
1157 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1158 	if (!rec->thread_data) {
1159 		pr_err("Failed to allocate thread data\n");
1160 		return -ENOMEM;
1161 	}
1162 	thread_data = rec->thread_data;
1163 
1164 	for (t = 0; t < rec->nr_threads; t++)
1165 		record__thread_data_init_pipes(&thread_data[t]);
1166 
1167 	for (t = 0; t < rec->nr_threads; t++) {
1168 		thread_data[t].rec = rec;
1169 		thread_data[t].mask = &rec->thread_masks[t];
1170 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1171 		if (ret) {
1172 			pr_err("Failed to initialize thread[%d] maps\n", t);
1173 			goto out_free;
1174 		}
1175 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1176 		if (ret) {
1177 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1178 			goto out_free;
1179 		}
1180 		if (t) {
1181 			thread_data[t].tid = -1;
1182 			ret = record__thread_data_open_pipes(&thread_data[t]);
1183 			if (ret) {
1184 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1185 				goto out_free;
1186 			}
1187 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1188 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1189 			if (ret < 0) {
1190 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1191 				goto out_free;
1192 			}
1193 			thread_data[t].ctlfd_pos = ret;
1194 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1195 				 thread_data, thread_data[t].ctlfd_pos,
1196 				 thread_data[t].pipes.msg[0]);
1197 		} else {
1198 			thread_data[t].tid = gettid();
1199 
1200 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1201 			if (ret < 0)
1202 				goto out_free;
1203 
1204 			thread_data[t].ctlfd_pos = -1; /* Not used */
1205 		}
1206 	}
1207 
1208 	return 0;
1209 
1210 out_free:
1211 	record__free_thread_data(rec);
1212 
1213 	return ret;
1214 }
1215 
1216 static int record__mmap_evlist(struct record *rec,
1217 			       struct evlist *evlist)
1218 {
1219 	int i, ret;
1220 	struct record_opts *opts = &rec->opts;
1221 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1222 				  opts->auxtrace_sample_mode;
1223 	char msg[512];
1224 
1225 	if (opts->affinity != PERF_AFFINITY_SYS)
1226 		cpu__setup_cpunode_map();
1227 
1228 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1229 				 opts->auxtrace_mmap_pages,
1230 				 auxtrace_overwrite,
1231 				 opts->nr_cblocks, opts->affinity,
1232 				 opts->mmap_flush, opts->comp_level) < 0) {
1233 		if (errno == EPERM) {
1234 			pr_err("Permission error mapping pages.\n"
1235 			       "Consider increasing "
1236 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1237 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1238 			       "(current value: %u,%u)\n",
1239 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1240 			return -errno;
1241 		} else {
1242 			pr_err("failed to mmap with %d (%s)\n", errno,
1243 				str_error_r(errno, msg, sizeof(msg)));
1244 			if (errno)
1245 				return -errno;
1246 			else
1247 				return -EINVAL;
1248 		}
1249 	}
1250 
1251 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1252 		return -1;
1253 
1254 	ret = record__alloc_thread_data(rec, evlist);
1255 	if (ret)
1256 		return ret;
1257 
1258 	if (record__threads_enabled(rec)) {
1259 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1260 		if (ret) {
1261 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1262 			return ret;
1263 		}
1264 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1265 			if (evlist->mmap)
1266 				evlist->mmap[i].file = &rec->data.dir.files[i];
1267 			if (evlist->overwrite_mmap)
1268 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1269 		}
1270 	}
1271 
1272 	return 0;
1273 }
1274 
1275 static int record__mmap(struct record *rec)
1276 {
1277 	return record__mmap_evlist(rec, rec->evlist);
1278 }
1279 
/*
 * Open the events of the record session and get them ready for sampling.
 *
 * Steps, in order: optionally add/select a dummy tracking event, configure
 * the evlist, open every evsel (retrying after a fallback or a weak group
 * reset), warn when kernel address maps are restricted, apply the event
 * filters, mmap the buffers through record__mmap() and finally attach the
 * evlist to the session.
 *
 * Returns 0 on success and a non-zero value on failure (-ENOMEM, -errno from
 * a failed open, -1 for a filter failure, or record__mmap()'s error).
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay, system wide or a hybrid system, we need to add a
	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
	 * of waiting or event synthesis.
	 */
	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmus__num_core_pmus() > 1) {
		pos = evlist__get_tracking_event(evlist);
		if (!evsel__is_dummy_event(pos)) {
			/* Set up dummy event. */
			if (evlist__add_dummy(evlist))
				return -ENOMEM;
			pos = evlist__last(evlist);
			evlist__set_tracking_event(evlist, pos);
		}

		/*
		 * Enable the dummy event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->target.initial_delay && !pos->immediate &&
		    !target__has_cpu(&opts->target))
			pos->core.attr.enable_on_exec = 1;
		else
			pos->immediate = 1;
	}

	evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
		/*
		 * Jumped back to after evsel__fallback() accepted a fallback
		 * for 'pos', or after a weak group reset replaced 'pos'.
		 */
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/* A non-leader member of a weak group failed: reset the group and retry. */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure 'pos' is used below as the evsel whose filter was rejected. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter ?: "BPF", evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	/* Only reached when everything above succeeded. */
	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1369 
1370 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1371 {
1372 	if (rec->evlist->first_sample_time == 0)
1373 		rec->evlist->first_sample_time = sample_time;
1374 
1375 	if (sample_time)
1376 		rec->evlist->last_sample_time = sample_time;
1377 }
1378 
1379 static int process_sample_event(struct perf_tool *tool,
1380 				union perf_event *event,
1381 				struct perf_sample *sample,
1382 				struct evsel *evsel,
1383 				struct machine *machine)
1384 {
1385 	struct record *rec = container_of(tool, struct record, tool);
1386 
1387 	set_timestamp_boundary(rec, sample->time);
1388 
1389 	if (rec->buildid_all)
1390 		return 0;
1391 
1392 	rec->samples++;
1393 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1394 }
1395 
1396 static int process_buildids(struct record *rec)
1397 {
1398 	struct perf_session *session = rec->session;
1399 
1400 	if (perf_data__size(&rec->data) == 0)
1401 		return 0;
1402 
1403 	/*
1404 	 * During this process, it'll load kernel map and replace the
1405 	 * dso->long_name to a real pathname it found.  In this case
1406 	 * we prefer the vmlinux path like
1407 	 *   /lib/modules/3.16.4/build/vmlinux
1408 	 *
1409 	 * rather than build-id path (in debug directory).
1410 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1411 	 */
1412 	symbol_conf.ignore_vmlinux_buildid = true;
1413 
1414 	/*
1415 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1416 	 * so no need to process samples. But if timestamp_boundary is enabled,
1417 	 * it still needs to walk on all samples to get the timestamps of
1418 	 * first/last samples.
1419 	 */
1420 	if (rec->buildid_all && !rec->timestamp_boundary)
1421 		rec->tool.sample = NULL;
1422 
1423 	return perf_session__process_events(session);
1424 }
1425 
1426 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1427 {
1428 	int err;
1429 	struct perf_tool *tool = data;
1430 	/*
1431 	 *As for guest kernel when processing subcommand record&report,
1432 	 *we arrange module mmap prior to guest kernel mmap and trigger
1433 	 *a preload dso because default guest module symbols are loaded
1434 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1435 	 *method is used to avoid symbol missing when the first addr is
1436 	 *in module instead of in guest kernel.
1437 	 */
1438 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1439 					     machine);
1440 	if (err < 0)
1441 		pr_err("Couldn't record guest kernel [%d]'s reference"
1442 		       " relocation symbol.\n", machine->pid);
1443 
1444 	/*
1445 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1446 	 * have no _text sometimes.
1447 	 */
1448 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1449 						 machine);
1450 	if (err < 0)
1451 		pr_err("Couldn't record guest kernel [%d]'s reference"
1452 		       " relocation symbol.\n", machine->pid);
1453 }
1454 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event (size covers just the header).
 * record__mmap_read_evlist() writes it after a pass that wrote data, when
 * threaded recording is not enabled.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

/*
 * Header-only PERF_RECORD_FINISHED_INIT event (size covers just the header).
 * Written by write_finished_init().
 */
static struct perf_event_header finished_init_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_INIT,
};
1464 
1465 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1466 {
1467 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1468 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1469 			  thread->mask->affinity.nbits)) {
1470 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1471 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1472 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1473 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1474 					(cpu_set_t *)thread->mask->affinity.bits);
1475 		if (verbose == 2) {
1476 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1477 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1478 		}
1479 	}
1480 }
1481 
1482 static size_t process_comp_header(void *record, size_t increment)
1483 {
1484 	struct perf_record_compressed *event = record;
1485 	size_t size = sizeof(*event);
1486 
1487 	if (increment) {
1488 		event->header.size += increment;
1489 		return increment;
1490 	}
1491 
1492 	event->header.type = PERF_RECORD_COMPRESSED;
1493 	event->header.size = size;
1494 
1495 	return size;
1496 }
1497 
1498 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1499 			    void *dst, size_t dst_size, void *src, size_t src_size)
1500 {
1501 	size_t compressed;
1502 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1503 	struct zstd_data *zstd_data = &session->zstd_data;
1504 
1505 	if (map && map->file)
1506 		zstd_data = &map->zstd_data;
1507 
1508 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1509 						     max_record_size, process_comp_header);
1510 
1511 	if (map && map->file) {
1512 		thread->bytes_transferred += src_size;
1513 		thread->bytes_compressed  += compressed;
1514 	} else {
1515 		session->bytes_transferred += src_size;
1516 		session->bytes_compressed  += compressed;
1517 	}
1518 
1519 	return compressed;
1520 }
1521 
/*
 * Drain the ring buffers owned by the current recording thread.
 *
 * @overwrite selects the thread's overwrite maps instead of the regular
 * ones; they are only read while the evlist's backward mmap state is
 * BKW_MMAP_DATA_PENDING. With @synch set, each map's flush threshold is
 * temporarily forced to 1 for the duration of its push and then restored.
 *
 * Returns 0 on success (including "nothing to do"), -1 when pushing a map
 * or reading its auxtrace area fails, or record__write()'s result for the
 * FINISHED_ROUND event.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	/* Snapshot, used at the end to detect whether this pass wrote anything. */
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	int nr_mmaps;
	struct mmap **maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	nr_mmaps = thread->nr_mmaps;
	maps = overwrite ? thread->overwrite_maps : thread->maps;

	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	/* In AIO mode the output position is tracked in 'off' and stored back below. */
	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Save the flush threshold; it is restored on every exit path. */
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					/* Store the position reached so far before bailing out. */
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		/* AUX area data is read here only outside snapshot and sample modes. */
		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	/* Not reached on the error paths above, which jump straight to 'out'. */
	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1605 
1606 static int record__mmap_read_all(struct record *rec, bool synch)
1607 {
1608 	int err;
1609 
1610 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1611 	if (err)
1612 		return err;
1613 
1614 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1615 }
1616 
1617 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1618 					   void *arg __maybe_unused)
1619 {
1620 	struct perf_mmap *map = fda->priv[fd].ptr;
1621 
1622 	if (map)
1623 		perf_mmap__put(map);
1624 }
1625 
/*
 * Entry point of a recording thread.
 *
 * @arg is this thread's struct record_thread; it is stored in the 'thread'
 * variable declared outside this function (presumably per-thread storage -
 * confirm against its declaration). The thread announces itself on the ack
 * pipe, then repeatedly drains its maps and polls its descriptors until a
 * read fails, all filterable descriptors are gone, or a hang-up on the
 * control descriptor asks it to terminate. A final synchronous read is done
 * before the termination is acknowledged on the ack pipe.
 *
 * Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	/* Tell the other end of the ack pipe that this thread is up. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %s\n",
			   thread->tid, strerror(errno));

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		/* Sample count before the read, to detect whether the read made progress. */
		unsigned long long hits = thread->samples;

		/* 'terminate' is checked after the read, so one more read happens after hang-up. */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Nothing new was consumed: block until a descriptor becomes ready. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/*
			 * NOTE(review): a 0 result presumably means no
			 * filterable descriptor is left - confirm against
			 * fdarray__filter().
			 */
			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Hang-up on the control descriptor: close it and stop polling it. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final pass with synch set, so the maps are flushed regardless of their threshold. */
	record__mmap_read_all(thread->rec, true);

	/* The same message value (THREAD_MSG__READY) is reused as the termination ack. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %s\n",
			   thread->tid, strerror(errno));

	return NULL;
}
1687 
1688 static void record__init_features(struct record *rec)
1689 {
1690 	struct perf_session *session = rec->session;
1691 	int feat;
1692 
1693 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1694 		perf_header__set_feat(&session->header, feat);
1695 
1696 	if (rec->no_buildid)
1697 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1698 
1699 #ifdef HAVE_LIBTRACEEVENT
1700 	if (!have_tracepoints(&rec->evlist->core.entries))
1701 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1702 #endif
1703 
1704 	if (!rec->opts.branch_stack)
1705 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1706 
1707 	if (!rec->opts.full_auxtrace)
1708 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1709 
1710 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1711 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1712 
1713 	if (!rec->opts.use_clockid)
1714 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1715 
1716 	if (!record__threads_enabled(rec))
1717 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1718 
1719 	if (!record__comp_enabled(rec))
1720 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1721 
1722 	perf_header__clear_feat(&session->header, HEADER_STAT);
1723 }
1724 
1725 static void
1726 record__finish_output(struct record *rec)
1727 {
1728 	int i;
1729 	struct perf_data *data = &rec->data;
1730 	int fd = perf_data__fd(data);
1731 
1732 	if (data->is_pipe)
1733 		return;
1734 
1735 	rec->session->header.data_size += rec->bytes_written;
1736 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1737 	if (record__threads_enabled(rec)) {
1738 		for (i = 0; i < data->dir.nr; i++)
1739 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1740 	}
1741 
1742 	if (!rec->no_buildid) {
1743 		process_buildids(rec);
1744 
1745 		if (rec->buildid_all)
1746 			dsos__hit_all(rec->session);
1747 	}
1748 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1749 
1750 	return;
1751 }
1752 
1753 static int record__synthesize_workload(struct record *rec, bool tail)
1754 {
1755 	int err;
1756 	struct perf_thread_map *thread_map;
1757 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1758 
1759 	if (rec->opts.tail_synthesize != tail)
1760 		return 0;
1761 
1762 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1763 	if (thread_map == NULL)
1764 		return -1;
1765 
1766 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1767 						 process_synthesized_event,
1768 						 &rec->session->machines.host,
1769 						 needs_mmap,
1770 						 rec->opts.sample_address);
1771 	perf_thread_map__put(thread_map);
1772 	return err;
1773 }
1774 
1775 static int write_finished_init(struct record *rec, bool tail)
1776 {
1777 	if (rec->opts.tail_synthesize != tail)
1778 		return 0;
1779 
1780 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1781 }
1782 
1783 static int record__synthesize(struct record *rec, bool tail);
1784 
1785 static int
1786 record__switch_output(struct record *rec, bool at_exit)
1787 {
1788 	struct perf_data *data = &rec->data;
1789 	int fd, err;
1790 	char *new_filename;
1791 
1792 	/* Same Size:      "2015122520103046"*/
1793 	char timestamp[] = "InvalidTimestamp";
1794 
1795 	record__aio_mmap_read_sync(rec);
1796 
1797 	write_finished_init(rec, true);
1798 
1799 	record__synthesize(rec, true);
1800 	if (target__none(&rec->opts.target))
1801 		record__synthesize_workload(rec, true);
1802 
1803 	rec->samples = 0;
1804 	record__finish_output(rec);
1805 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1806 	if (err) {
1807 		pr_err("Failed to get current timestamp\n");
1808 		return -EINVAL;
1809 	}
1810 
1811 	fd = perf_data__switch(data, timestamp,
1812 				    rec->session->header.data_offset,
1813 				    at_exit, &new_filename);
1814 	if (fd >= 0 && !at_exit) {
1815 		rec->bytes_written = 0;
1816 		rec->session->header.data_size = 0;
1817 	}
1818 
1819 	if (!quiet)
1820 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1821 			data->path, timestamp);
1822 
1823 	if (rec->switch_output.num_files) {
1824 		int n = rec->switch_output.cur_file + 1;
1825 
1826 		if (n >= rec->switch_output.num_files)
1827 			n = 0;
1828 		rec->switch_output.cur_file = n;
1829 		if (rec->switch_output.filenames[n]) {
1830 			remove(rec->switch_output.filenames[n]);
1831 			zfree(&rec->switch_output.filenames[n]);
1832 		}
1833 		rec->switch_output.filenames[n] = new_filename;
1834 	} else {
1835 		free(new_filename);
1836 	}
1837 
1838 	/* Output tracking events */
1839 	if (!at_exit) {
1840 		record__synthesize(rec, false);
1841 
1842 		/*
1843 		 * In 'perf record --switch-output' without -a,
1844 		 * record__synthesize() in record__switch_output() won't
1845 		 * generate tracking events because there's no thread_map
1846 		 * in evlist. Which causes newly created perf.data doesn't
1847 		 * contain map and comm information.
1848 		 * Create a fake thread_map and directly call
1849 		 * perf_event__synthesize_thread_map() for those events.
1850 		 */
1851 		if (target__none(&rec->opts.target))
1852 			record__synthesize_workload(rec, false);
1853 		write_finished_init(rec, false);
1854 	}
1855 	return fd;
1856 }
1857 
1858 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1859 					struct perf_record_lost_samples *lost,
1860 					int cpu_idx, int thread_idx, u64 lost_count,
1861 					u16 misc_flag)
1862 {
1863 	struct perf_sample_id *sid;
1864 	struct perf_sample sample = {};
1865 	int id_hdr_size;
1866 
1867 	lost->lost = lost_count;
1868 	if (evsel->core.ids) {
1869 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1870 		sample.id = sid->id;
1871 	}
1872 
1873 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1874 						       evsel->core.attr.sample_type, &sample);
1875 	lost->header.size = sizeof(*lost) + id_hdr_size;
1876 	lost->header.misc = misc_flag;
1877 	record__write(rec, NULL, lost, lost->header.size);
1878 }
1879 
1880 static void record__read_lost_samples(struct record *rec)
1881 {
1882 	struct perf_session *session = rec->session;
1883 	struct perf_record_lost_samples *lost;
1884 	struct evsel *evsel;
1885 
1886 	/* there was an error during record__open */
1887 	if (session->evlist == NULL)
1888 		return;
1889 
1890 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1891 	if (lost == NULL) {
1892 		pr_debug("Memory allocation failed\n");
1893 		return;
1894 	}
1895 
1896 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1897 
1898 	evlist__for_each_entry(session->evlist, evsel) {
1899 		struct xyarray *xy = evsel->core.sample_id;
1900 		u64 lost_count;
1901 
1902 		if (xy == NULL || evsel->core.fd == NULL)
1903 			continue;
1904 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1905 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1906 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1907 			continue;
1908 		}
1909 
1910 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1911 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1912 				struct perf_counts_values count;
1913 
1914 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1915 					pr_debug("read LOST count failed\n");
1916 					goto out;
1917 				}
1918 
1919 				if (count.lost) {
1920 					__record__save_lost_samples(rec, evsel, lost,
1921 								    x, y, count.lost, 0);
1922 				}
1923 			}
1924 		}
1925 
1926 		lost_count = perf_bpf_filter__lost_count(evsel);
1927 		if (lost_count)
1928 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1929 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1930 	}
1931 out:
1932 	free(lost);
1933 }
1934 
/* Set by workload_exec_failed_signal() from the value carried by the signal. */
static volatile sig_atomic_t workload_exec_errno;
1936 
1937 /*
1938  * evlist__prepare_workload will send a SIGUSR1
1939  * if the fork fails, since we asked by setting its
1940  * want_signal to true.
1941  */
1942 static void workload_exec_failed_signal(int signo __maybe_unused,
1943 					siginfo_t *info,
1944 					void *ucontext __maybe_unused)
1945 {
1946 	workload_exec_errno = info->si_value.sival_int;
1947 	done = 1;
1948 	child_finished = 1;
1949 }
1950 
1951 static void snapshot_sig_handler(int sig);
1952 static void alarm_sig_handler(int sig);
1953 
1954 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1955 {
1956 	if (evlist) {
1957 		if (evlist->mmap && evlist->mmap[0].core.base)
1958 			return evlist->mmap[0].core.base;
1959 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1960 			return evlist->overwrite_mmap[0].core.base;
1961 	}
1962 	return NULL;
1963 }
1964 
1965 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1966 {
1967 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1968 	if (pc)
1969 		return pc;
1970 	return NULL;
1971 }
1972 
/*
 * Synthesize the non-sample events that describe the state of the system
 * for this record session.
 *
 * Does nothing (returns 0) when @tail differs from opts.tail_synthesize.
 * Otherwise a fixed sequence of synthesizers is run. Failures of the kernel
 * mmap, modules, BPF and cgroup synthesizers are only warned about; any
 * other failure ends the sequence and is returned.
 *
 * Returns 0 on success, otherwise the failing synthesizer's result; the
 * result of the final thread synthesis, if it ran, is returned as is.
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int err = 0;
	/* Callback used for thread synthesis; swapped for the locked variant below. */
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;

		/* A non-negative result is added to the written-bytes count. */
		rec->bytes_written += err;
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	err = perf_event__synthesize_id_index(tool,
					      process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	/* Kernel and module maps: failures only trigger a one-time warning. */
	if (!evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	/* BPF event synthesis is best effort: warn and carry on. */
	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0) {
		pr_warning("Couldn't synthesize bpf events.\n");
		err = 0;
	}

	/* Cgroup synthesis is best effort as well. */
	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
						     machine);
		if (err < 0) {
			pr_warning("Couldn't synthesize cgroup events.\n");
			err = 0;
		}
	}

	/* With several synthesis threads, use the callback that takes synth_lock. */
	if (rec->opts.nr_threads_synthesize > 1) {
		mutex_init(&synth_lock);
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	if (rec->opts.synth & PERF_SYNTH_TASK) {
		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

		err = __machine__synthesize_threads(machine, tool, &opts->target,
						    rec->evlist->core.threads,
						    f, needs_mmap, opts->sample_address,
						    rec->opts.nr_threads_synthesize);
	}

	/* Undo the multi-threaded setup done above. */
	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_singlethreaded();
		mutex_destroy(&synth_lock);
	}

out:
	return err;
}
2094 
2095 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2096 {
2097 	struct record *rec = data;
2098 	pthread_kill(rec->thread_id, SIGUSR2);
2099 	return 0;
2100 }
2101 
2102 static int record__setup_sb_evlist(struct record *rec)
2103 {
2104 	struct record_opts *opts = &rec->opts;
2105 
2106 	if (rec->sb_evlist != NULL) {
2107 		/*
2108 		 * We get here if --switch-output-event populated the
2109 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2110 		 * to the main thread.
2111 		 */
2112 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2113 		rec->thread_id = pthread_self();
2114 	}
2115 #ifdef HAVE_LIBBPF_SUPPORT
2116 	if (!opts->no_bpf_event) {
2117 		if (rec->sb_evlist == NULL) {
2118 			rec->sb_evlist = evlist__new();
2119 
2120 			if (rec->sb_evlist == NULL) {
2121 				pr_err("Couldn't create side band evlist.\n.");
2122 				return -1;
2123 			}
2124 		}
2125 
2126 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2127 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2128 			return -1;
2129 		}
2130 	}
2131 #endif
2132 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2133 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2134 		opts->no_bpf_event = true;
2135 	}
2136 
2137 	return 0;
2138 }
2139 
2140 static int record__init_clock(struct record *rec)
2141 {
2142 	struct perf_session *session = rec->session;
2143 	struct timespec ref_clockid;
2144 	struct timeval ref_tod;
2145 	u64 ref;
2146 
2147 	if (!rec->opts.use_clockid)
2148 		return 0;
2149 
2150 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2151 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2152 
2153 	session->header.env.clock.clockid = rec->opts.clockid;
2154 
2155 	if (gettimeofday(&ref_tod, NULL) != 0) {
2156 		pr_err("gettimeofday failed, cannot set reference time.\n");
2157 		return -1;
2158 	}
2159 
2160 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2161 		pr_err("clock_gettime failed, cannot set reference time.\n");
2162 		return -1;
2163 	}
2164 
2165 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2166 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2167 
2168 	session->header.env.clock.tod_ns = ref;
2169 
2170 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2171 	      (u64) ref_clockid.tv_nsec;
2172 
2173 	session->header.env.clock.clockid_ns = ref;
2174 	return 0;
2175 }
2176 
2177 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2178 {
2179 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2180 		trigger_hit(&auxtrace_snapshot_trigger);
2181 		auxtrace_record__snapshot_started = 1;
2182 		if (auxtrace_record__snapshot_start(rec->itr))
2183 			trigger_error(&auxtrace_snapshot_trigger);
2184 	}
2185 }
2186 
2187 static void record__uniquify_name(struct record *rec)
2188 {
2189 	struct evsel *pos;
2190 	struct evlist *evlist = rec->evlist;
2191 	char *new_name;
2192 	int ret;
2193 
2194 	if (perf_pmus__num_core_pmus() == 1)
2195 		return;
2196 
2197 	evlist__for_each_entry(evlist, pos) {
2198 		if (!evsel__is_hybrid(pos))
2199 			continue;
2200 
2201 		if (strchr(pos->name, '/'))
2202 			continue;
2203 
2204 		ret = asprintf(&new_name, "%s/%s/",
2205 			       pos->pmu_name, pos->name);
2206 		if (ret) {
2207 			free(pos->name);
2208 			pos->name = new_name;
2209 		}
2210 	}
2211 }
2212 
2213 static int record__terminate_thread(struct record_thread *thread_data)
2214 {
2215 	int err;
2216 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2217 	pid_t tid = thread_data->tid;
2218 
2219 	close(thread_data->pipes.msg[1]);
2220 	thread_data->pipes.msg[1] = -1;
2221 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2222 	if (err > 0)
2223 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2224 	else
2225 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2226 			   thread->tid, tid);
2227 
2228 	return 0;
2229 }
2230 
2231 static int record__start_threads(struct record *rec)
2232 {
2233 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2234 	struct record_thread *thread_data = rec->thread_data;
2235 	sigset_t full, mask;
2236 	pthread_t handle;
2237 	pthread_attr_t attrs;
2238 
2239 	thread = &thread_data[0];
2240 
2241 	if (!record__threads_enabled(rec))
2242 		return 0;
2243 
2244 	sigfillset(&full);
2245 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2246 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2247 		return -1;
2248 	}
2249 
2250 	pthread_attr_init(&attrs);
2251 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2252 
2253 	for (t = 1; t < nr_threads; t++) {
2254 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2255 
2256 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2257 		pthread_attr_setaffinity_np(&attrs,
2258 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2259 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2260 #endif
2261 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2262 			for (tt = 1; tt < t; tt++)
2263 				record__terminate_thread(&thread_data[t]);
2264 			pr_err("Failed to start threads: %s\n", strerror(errno));
2265 			ret = -1;
2266 			goto out_err;
2267 		}
2268 
2269 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2270 		if (err > 0)
2271 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2272 				  thread_msg_tags[msg]);
2273 		else
2274 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2275 				   thread->tid, rec->thread_data[t].tid);
2276 	}
2277 
2278 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2279 			(cpu_set_t *)thread->mask->affinity.bits);
2280 
2281 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2282 
2283 out_err:
2284 	pthread_attr_destroy(&attrs);
2285 
2286 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2287 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2288 		ret = -1;
2289 	}
2290 
2291 	return ret;
2292 }
2293 
2294 static int record__stop_threads(struct record *rec)
2295 {
2296 	int t;
2297 	struct record_thread *thread_data = rec->thread_data;
2298 
2299 	for (t = 1; t < rec->nr_threads; t++)
2300 		record__terminate_thread(&thread_data[t]);
2301 
2302 	for (t = 0; t < rec->nr_threads; t++) {
2303 		rec->samples += thread_data[t].samples;
2304 		if (!record__threads_enabled(rec))
2305 			continue;
2306 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2307 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2308 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2309 			 thread_data[t].samples, thread_data[t].waking);
2310 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2311 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2312 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2313 		else
2314 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2315 	}
2316 
2317 	return 0;
2318 }
2319 
2320 static unsigned long record__waking(struct record *rec)
2321 {
2322 	int t;
2323 	unsigned long waking = 0;
2324 	struct record_thread *thread_data = rec->thread_data;
2325 
2326 	for (t = 0; t < rec->nr_threads; t++)
2327 		waking += thread_data[t].waking;
2328 
2329 	return waking;
2330 }
2331 
2332 static int __cmd_record(struct record *rec, int argc, const char **argv)
2333 {
2334 	int err;
2335 	int status = 0;
2336 	const bool forks = argc > 0;
2337 	struct perf_tool *tool = &rec->tool;
2338 	struct record_opts *opts = &rec->opts;
2339 	struct perf_data *data = &rec->data;
2340 	struct perf_session *session;
2341 	bool disabled = false, draining = false;
2342 	int fd;
2343 	float ratio = 0;
2344 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2345 
2346 	atexit(record__sig_exit);
2347 	signal(SIGCHLD, sig_handler);
2348 	signal(SIGINT, sig_handler);
2349 	signal(SIGTERM, sig_handler);
2350 	signal(SIGSEGV, sigsegv_handler);
2351 
2352 	if (rec->opts.record_namespaces)
2353 		tool->namespace_events = true;
2354 
2355 	if (rec->opts.record_cgroup) {
2356 #ifdef HAVE_FILE_HANDLE
2357 		tool->cgroup_events = true;
2358 #else
2359 		pr_err("cgroup tracking is not supported\n");
2360 		return -1;
2361 #endif
2362 	}
2363 
2364 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2365 		signal(SIGUSR2, snapshot_sig_handler);
2366 		if (rec->opts.auxtrace_snapshot_mode)
2367 			trigger_on(&auxtrace_snapshot_trigger);
2368 		if (rec->switch_output.enabled)
2369 			trigger_on(&switch_output_trigger);
2370 	} else {
2371 		signal(SIGUSR2, SIG_IGN);
2372 	}
2373 
2374 	session = perf_session__new(data, tool);
2375 	if (IS_ERR(session)) {
2376 		pr_err("Perf session creation failed.\n");
2377 		return PTR_ERR(session);
2378 	}
2379 
2380 	if (record__threads_enabled(rec)) {
2381 		if (perf_data__is_pipe(&rec->data)) {
2382 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2383 			return -1;
2384 		}
2385 		if (rec->opts.full_auxtrace) {
2386 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2387 			return -1;
2388 		}
2389 	}
2390 
2391 	fd = perf_data__fd(data);
2392 	rec->session = session;
2393 
2394 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2395 		pr_err("Compression initialization failed.\n");
2396 		return -1;
2397 	}
2398 #ifdef HAVE_EVENTFD_SUPPORT
2399 	done_fd = eventfd(0, EFD_NONBLOCK);
2400 	if (done_fd < 0) {
2401 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2402 		status = -1;
2403 		goto out_delete_session;
2404 	}
2405 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2406 	if (err < 0) {
2407 		pr_err("Failed to add wakeup eventfd to poll list\n");
2408 		status = err;
2409 		goto out_delete_session;
2410 	}
2411 #endif // HAVE_EVENTFD_SUPPORT
2412 
2413 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2414 	session->header.env.comp_level = rec->opts.comp_level;
2415 
2416 	if (rec->opts.kcore &&
2417 	    !record__kcore_readable(&session->machines.host)) {
2418 		pr_err("ERROR: kcore is not readable.\n");
2419 		return -1;
2420 	}
2421 
2422 	if (record__init_clock(rec))
2423 		return -1;
2424 
2425 	record__init_features(rec);
2426 
2427 	if (forks) {
2428 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2429 					       workload_exec_failed_signal);
2430 		if (err < 0) {
2431 			pr_err("Couldn't run the workload!\n");
2432 			status = err;
2433 			goto out_delete_session;
2434 		}
2435 	}
2436 
2437 	/*
2438 	 * If we have just single event and are sending data
2439 	 * through pipe, we need to force the ids allocation,
2440 	 * because we synthesize event name through the pipe
2441 	 * and need the id for that.
2442 	 */
2443 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2444 		rec->opts.sample_id = true;
2445 
2446 	record__uniquify_name(rec);
2447 
2448 	/* Debug message used by test scripts */
2449 	pr_debug3("perf record opening and mmapping events\n");
2450 	if (record__open(rec) != 0) {
2451 		err = -1;
2452 		goto out_free_threads;
2453 	}
2454 	/* Debug message used by test scripts */
2455 	pr_debug3("perf record done opening and mmapping events\n");
2456 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2457 
2458 	if (rec->opts.kcore) {
2459 		err = record__kcore_copy(&session->machines.host, data);
2460 		if (err) {
2461 			pr_err("ERROR: Failed to copy kcore\n");
2462 			goto out_free_threads;
2463 		}
2464 	}
2465 
2466 	/*
2467 	 * Normally perf_session__new would do this, but it doesn't have the
2468 	 * evlist.
2469 	 */
2470 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2471 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2472 		rec->tool.ordered_events = false;
2473 	}
2474 
2475 	if (evlist__nr_groups(rec->evlist) == 0)
2476 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2477 
2478 	if (data->is_pipe) {
2479 		err = perf_header__write_pipe(fd);
2480 		if (err < 0)
2481 			goto out_free_threads;
2482 	} else {
2483 		err = perf_session__write_header(session, rec->evlist, fd, false);
2484 		if (err < 0)
2485 			goto out_free_threads;
2486 	}
2487 
2488 	err = -1;
2489 	if (!rec->no_buildid
2490 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2491 		pr_err("Couldn't generate buildids. "
2492 		       "Use --no-buildid to profile anyway.\n");
2493 		goto out_free_threads;
2494 	}
2495 
2496 	err = record__setup_sb_evlist(rec);
2497 	if (err)
2498 		goto out_free_threads;
2499 
2500 	err = record__synthesize(rec, false);
2501 	if (err < 0)
2502 		goto out_free_threads;
2503 
2504 	if (rec->realtime_prio) {
2505 		struct sched_param param;
2506 
2507 		param.sched_priority = rec->realtime_prio;
2508 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2509 			pr_err("Could not set realtime priority.\n");
2510 			err = -1;
2511 			goto out_free_threads;
2512 		}
2513 	}
2514 
2515 	if (record__start_threads(rec))
2516 		goto out_free_threads;
2517 
2518 	/*
2519 	 * When perf is starting the traced process, all the events
2520 	 * (apart from group members) have enable_on_exec=1 set,
2521 	 * so don't spoil it by prematurely enabling them.
2522 	 */
2523 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2524 		evlist__enable(rec->evlist);
2525 
2526 	/*
2527 	 * Let the child rip
2528 	 */
2529 	if (forks) {
2530 		struct machine *machine = &session->machines.host;
2531 		union perf_event *event;
2532 		pid_t tgid;
2533 
2534 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2535 		if (event == NULL) {
2536 			err = -ENOMEM;
2537 			goto out_child;
2538 		}
2539 
2540 		/*
2541 		 * Some H/W events are generated before COMM event
2542 		 * which is emitted during exec(), so perf script
2543 		 * cannot see a correct process name for those events.
2544 		 * Synthesize COMM event to prevent it.
2545 		 */
2546 		tgid = perf_event__synthesize_comm(tool, event,
2547 						   rec->evlist->workload.pid,
2548 						   process_synthesized_event,
2549 						   machine);
2550 		free(event);
2551 
2552 		if (tgid == -1)
2553 			goto out_child;
2554 
2555 		event = malloc(sizeof(event->namespaces) +
2556 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2557 			       machine->id_hdr_size);
2558 		if (event == NULL) {
2559 			err = -ENOMEM;
2560 			goto out_child;
2561 		}
2562 
2563 		/*
2564 		 * Synthesize NAMESPACES event for the command specified.
2565 		 */
2566 		perf_event__synthesize_namespaces(tool, event,
2567 						  rec->evlist->workload.pid,
2568 						  tgid, process_synthesized_event,
2569 						  machine);
2570 		free(event);
2571 
2572 		evlist__start_workload(rec->evlist);
2573 	}
2574 
2575 	if (opts->target.initial_delay) {
2576 		pr_info(EVLIST_DISABLED_MSG);
2577 		if (opts->target.initial_delay > 0) {
2578 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2579 			evlist__enable(rec->evlist);
2580 			pr_info(EVLIST_ENABLED_MSG);
2581 		}
2582 	}
2583 
2584 	err = event_enable_timer__start(rec->evlist->eet);
2585 	if (err)
2586 		goto out_child;
2587 
2588 	/* Debug message used by test scripts */
2589 	pr_debug3("perf record has started\n");
2590 	fflush(stderr);
2591 
2592 	trigger_ready(&auxtrace_snapshot_trigger);
2593 	trigger_ready(&switch_output_trigger);
2594 	perf_hooks__invoke_record_start();
2595 
2596 	/*
2597 	 * Must write FINISHED_INIT so it will be seen after all other
2598 	 * synthesized user events, but before any regular events.
2599 	 */
2600 	err = write_finished_init(rec, false);
2601 	if (err < 0)
2602 		goto out_child;
2603 
2604 	for (;;) {
2605 		unsigned long long hits = thread->samples;
2606 
2607 		/*
2608 		 * rec->evlist->bkw_mmap_state is possible to be
2609 		 * BKW_MMAP_EMPTY here: when done == true and
2610 		 * hits != rec->samples in previous round.
2611 		 *
2612 		 * evlist__toggle_bkw_mmap ensure we never
2613 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2614 		 */
2615 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2616 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2617 
2618 		if (record__mmap_read_all(rec, false) < 0) {
2619 			trigger_error(&auxtrace_snapshot_trigger);
2620 			trigger_error(&switch_output_trigger);
2621 			err = -1;
2622 			goto out_child;
2623 		}
2624 
2625 		if (auxtrace_record__snapshot_started) {
2626 			auxtrace_record__snapshot_started = 0;
2627 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2628 				record__read_auxtrace_snapshot(rec, false);
2629 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2630 				pr_err("AUX area tracing snapshot failed\n");
2631 				err = -1;
2632 				goto out_child;
2633 			}
2634 		}
2635 
2636 		if (trigger_is_hit(&switch_output_trigger)) {
2637 			/*
2638 			 * If switch_output_trigger is hit, the data in
2639 			 * overwritable ring buffer should have been collected,
2640 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2641 			 *
2642 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2643 			 * record__mmap_read_all() didn't collect data from
2644 			 * overwritable ring buffer. Read again.
2645 			 */
2646 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2647 				continue;
2648 			trigger_ready(&switch_output_trigger);
2649 
2650 			/*
2651 			 * Reenable events in overwrite ring buffer after
2652 			 * record__mmap_read_all(): we should have collected
2653 			 * data from it.
2654 			 */
2655 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2656 
2657 			if (!quiet)
2658 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2659 					record__waking(rec));
2660 			thread->waking = 0;
2661 			fd = record__switch_output(rec, false);
2662 			if (fd < 0) {
2663 				pr_err("Failed to switch to new file\n");
2664 				trigger_error(&switch_output_trigger);
2665 				err = fd;
2666 				goto out_child;
2667 			}
2668 
2669 			/* re-arm the alarm */
2670 			if (rec->switch_output.time)
2671 				alarm(rec->switch_output.time);
2672 		}
2673 
2674 		if (hits == thread->samples) {
2675 			if (done || draining)
2676 				break;
2677 			err = fdarray__poll(&thread->pollfd, -1);
2678 			/*
2679 			 * Propagate error, only if there's any. Ignore positive
2680 			 * number of returned events and interrupt error.
2681 			 */
2682 			if (err > 0 || (err < 0 && errno == EINTR))
2683 				err = 0;
2684 			thread->waking++;
2685 
2686 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2687 					    record__thread_munmap_filtered, NULL) == 0)
2688 				draining = true;
2689 
2690 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2691 			if (err)
2692 				goto out_child;
2693 		}
2694 
2695 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2696 			switch (cmd) {
2697 			case EVLIST_CTL_CMD_SNAPSHOT:
2698 				hit_auxtrace_snapshot_trigger(rec);
2699 				evlist__ctlfd_ack(rec->evlist);
2700 				break;
2701 			case EVLIST_CTL_CMD_STOP:
2702 				done = 1;
2703 				break;
2704 			case EVLIST_CTL_CMD_ACK:
2705 			case EVLIST_CTL_CMD_UNSUPPORTED:
2706 			case EVLIST_CTL_CMD_ENABLE:
2707 			case EVLIST_CTL_CMD_DISABLE:
2708 			case EVLIST_CTL_CMD_EVLIST:
2709 			case EVLIST_CTL_CMD_PING:
2710 			default:
2711 				break;
2712 			}
2713 		}
2714 
2715 		err = event_enable_timer__process(rec->evlist->eet);
2716 		if (err < 0)
2717 			goto out_child;
2718 		if (err) {
2719 			err = 0;
2720 			done = 1;
2721 		}
2722 
2723 		/*
2724 		 * When perf is starting the traced process, at the end events
2725 		 * die with the process and we wait for that. Thus no need to
2726 		 * disable events in this case.
2727 		 */
2728 		if (done && !disabled && !target__none(&opts->target)) {
2729 			trigger_off(&auxtrace_snapshot_trigger);
2730 			evlist__disable(rec->evlist);
2731 			disabled = true;
2732 		}
2733 	}
2734 
2735 	trigger_off(&auxtrace_snapshot_trigger);
2736 	trigger_off(&switch_output_trigger);
2737 
2738 	if (opts->auxtrace_snapshot_on_exit)
2739 		record__auxtrace_snapshot_exit(rec);
2740 
2741 	if (forks && workload_exec_errno) {
2742 		char msg[STRERR_BUFSIZE], strevsels[2048];
2743 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2744 
2745 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2746 
2747 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2748 			strevsels, argv[0], emsg);
2749 		err = -1;
2750 		goto out_child;
2751 	}
2752 
2753 	if (!quiet)
2754 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2755 			record__waking(rec));
2756 
2757 	write_finished_init(rec, true);
2758 
2759 	if (target__none(&rec->opts.target))
2760 		record__synthesize_workload(rec, true);
2761 
2762 out_child:
2763 	record__stop_threads(rec);
2764 	record__mmap_read_all(rec, true);
2765 out_free_threads:
2766 	record__free_thread_data(rec);
2767 	evlist__finalize_ctlfd(rec->evlist);
2768 	record__aio_mmap_read_sync(rec);
2769 
2770 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2771 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2772 		session->header.env.comp_ratio = ratio + 0.5;
2773 	}
2774 
2775 	if (forks) {
2776 		int exit_status;
2777 
2778 		if (!child_finished)
2779 			kill(rec->evlist->workload.pid, SIGTERM);
2780 
2781 		wait(&exit_status);
2782 
2783 		if (err < 0)
2784 			status = err;
2785 		else if (WIFEXITED(exit_status))
2786 			status = WEXITSTATUS(exit_status);
2787 		else if (WIFSIGNALED(exit_status))
2788 			signr = WTERMSIG(exit_status);
2789 	} else
2790 		status = err;
2791 
2792 	if (rec->off_cpu)
2793 		rec->bytes_written += off_cpu_write(rec->session);
2794 
2795 	record__read_lost_samples(rec);
2796 	record__synthesize(rec, true);
2797 	/* this will be recalculated during process_buildids() */
2798 	rec->samples = 0;
2799 
2800 	if (!err) {
2801 		if (!rec->timestamp_filename) {
2802 			record__finish_output(rec);
2803 		} else {
2804 			fd = record__switch_output(rec, true);
2805 			if (fd < 0) {
2806 				status = fd;
2807 				goto out_delete_session;
2808 			}
2809 		}
2810 	}
2811 
2812 	perf_hooks__invoke_record_end();
2813 
2814 	if (!err && !quiet) {
2815 		char samples[128];
2816 		const char *postfix = rec->timestamp_filename ?
2817 					".<timestamp>" : "";
2818 
2819 		if (rec->samples && !rec->opts.full_auxtrace)
2820 			scnprintf(samples, sizeof(samples),
2821 				  " (%" PRIu64 " samples)", rec->samples);
2822 		else
2823 			samples[0] = '\0';
2824 
2825 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2826 			perf_data__size(data) / 1024.0 / 1024.0,
2827 			data->path, postfix, samples);
2828 		if (ratio) {
2829 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2830 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2831 					ratio);
2832 		}
2833 		fprintf(stderr, " ]\n");
2834 	}
2835 
2836 out_delete_session:
2837 #ifdef HAVE_EVENTFD_SUPPORT
2838 	if (done_fd >= 0) {
2839 		fd = done_fd;
2840 		done_fd = -1;
2841 
2842 		close(fd);
2843 	}
2844 #endif
2845 	zstd_fini(&session->zstd_data);
2846 	perf_session__delete(session);
2847 
2848 	if (!opts->no_bpf_event)
2849 		evlist__stop_sb_thread(rec->sb_evlist);
2850 	return status;
2851 }
2852 
2853 static void callchain_debug(struct callchain_param *callchain)
2854 {
2855 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2856 
2857 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2858 
2859 	if (callchain->record_mode == CALLCHAIN_DWARF)
2860 		pr_debug("callchain: stack dump size %d\n",
2861 			 callchain->dump_size);
2862 }
2863 
2864 int record_opts__parse_callchain(struct record_opts *record,
2865 				 struct callchain_param *callchain,
2866 				 const char *arg, bool unset)
2867 {
2868 	int ret;
2869 	callchain->enabled = !unset;
2870 
2871 	/* --no-call-graph */
2872 	if (unset) {
2873 		callchain->record_mode = CALLCHAIN_NONE;
2874 		pr_debug("callchain: disabled\n");
2875 		return 0;
2876 	}
2877 
2878 	ret = parse_callchain_record_opt(arg, callchain);
2879 	if (!ret) {
2880 		/* Enable data address sampling for DWARF unwind. */
2881 		if (callchain->record_mode == CALLCHAIN_DWARF)
2882 			record->sample_address = true;
2883 		callchain_debug(callchain);
2884 	}
2885 
2886 	return ret;
2887 }
2888 
2889 int record_parse_callchain_opt(const struct option *opt,
2890 			       const char *arg,
2891 			       int unset)
2892 {
2893 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2894 }
2895 
2896 int record_callchain_opt(const struct option *opt,
2897 			 const char *arg __maybe_unused,
2898 			 int unset __maybe_unused)
2899 {
2900 	struct callchain_param *callchain = opt->value;
2901 
2902 	callchain->enabled = true;
2903 
2904 	if (callchain->record_mode == CALLCHAIN_NONE)
2905 		callchain->record_mode = CALLCHAIN_FP;
2906 
2907 	callchain_debug(callchain);
2908 	return 0;
2909 }
2910 
2911 static int perf_record_config(const char *var, const char *value, void *cb)
2912 {
2913 	struct record *rec = cb;
2914 
2915 	if (!strcmp(var, "record.build-id")) {
2916 		if (!strcmp(value, "cache"))
2917 			rec->no_buildid_cache = false;
2918 		else if (!strcmp(value, "no-cache"))
2919 			rec->no_buildid_cache = true;
2920 		else if (!strcmp(value, "skip"))
2921 			rec->no_buildid = true;
2922 		else if (!strcmp(value, "mmap"))
2923 			rec->buildid_mmap = true;
2924 		else
2925 			return -1;
2926 		return 0;
2927 	}
2928 	if (!strcmp(var, "record.call-graph")) {
2929 		var = "call-graph.record-mode";
2930 		return perf_default_config(var, value, cb);
2931 	}
2932 #ifdef HAVE_AIO_SUPPORT
2933 	if (!strcmp(var, "record.aio")) {
2934 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2935 		if (!rec->opts.nr_cblocks)
2936 			rec->opts.nr_cblocks = nr_cblocks_default;
2937 	}
2938 #endif
2939 	if (!strcmp(var, "record.debuginfod")) {
2940 		rec->debuginfod.urls = strdup(value);
2941 		if (!rec->debuginfod.urls)
2942 			return -ENOMEM;
2943 		rec->debuginfod.set = true;
2944 	}
2945 
2946 	return 0;
2947 }
2948 
2949 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2950 {
2951 	struct record *rec = (struct record *)opt->value;
2952 
2953 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2954 }
2955 
2956 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2957 {
2958 	struct record_opts *opts = (struct record_opts *)opt->value;
2959 
2960 	if (unset || !str)
2961 		return 0;
2962 
2963 	if (!strcasecmp(str, "node"))
2964 		opts->affinity = PERF_AFFINITY_NODE;
2965 	else if (!strcasecmp(str, "cpu"))
2966 		opts->affinity = PERF_AFFINITY_CPU;
2967 
2968 	return 0;
2969 }
2970 
2971 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2972 {
2973 	mask->nbits = nr_bits;
2974 	mask->bits = bitmap_zalloc(mask->nbits);
2975 	if (!mask->bits)
2976 		return -ENOMEM;
2977 
2978 	return 0;
2979 }
2980 
2981 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2982 {
2983 	bitmap_free(mask->bits);
2984 	mask->nbits = 0;
2985 }
2986 
2987 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2988 {
2989 	int ret;
2990 
2991 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2992 	if (ret) {
2993 		mask->affinity.bits = NULL;
2994 		return ret;
2995 	}
2996 
2997 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2998 	if (ret) {
2999 		record__mmap_cpu_mask_free(&mask->maps);
3000 		mask->maps.bits = NULL;
3001 	}
3002 
3003 	return ret;
3004 }
3005 
3006 static void record__thread_mask_free(struct thread_mask *mask)
3007 {
3008 	record__mmap_cpu_mask_free(&mask->maps);
3009 	record__mmap_cpu_mask_free(&mask->affinity);
3010 }
3011 
3012 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3013 {
3014 	int s;
3015 	struct record_opts *opts = opt->value;
3016 
3017 	if (unset || !str || !strlen(str)) {
3018 		opts->threads_spec = THREAD_SPEC__CPU;
3019 	} else {
3020 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3021 			if (s == THREAD_SPEC__USER) {
3022 				opts->threads_user_spec = strdup(str);
3023 				if (!opts->threads_user_spec)
3024 					return -ENOMEM;
3025 				opts->threads_spec = THREAD_SPEC__USER;
3026 				break;
3027 			}
3028 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3029 				opts->threads_spec = s;
3030 				break;
3031 			}
3032 		}
3033 	}
3034 
3035 	if (opts->threads_spec == THREAD_SPEC__USER)
3036 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3037 	else
3038 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3039 
3040 	return 0;
3041 }
3042 
3043 static int parse_output_max_size(const struct option *opt,
3044 				 const char *str, int unset)
3045 {
3046 	unsigned long *s = (unsigned long *)opt->value;
3047 	static struct parse_tag tags_size[] = {
3048 		{ .tag  = 'B', .mult = 1       },
3049 		{ .tag  = 'K', .mult = 1 << 10 },
3050 		{ .tag  = 'M', .mult = 1 << 20 },
3051 		{ .tag  = 'G', .mult = 1 << 30 },
3052 		{ .tag  = 0 },
3053 	};
3054 	unsigned long val;
3055 
3056 	if (unset) {
3057 		*s = 0;
3058 		return 0;
3059 	}
3060 
3061 	val = parse_tag_value(str, tags_size);
3062 	if (val != (unsigned long) -1) {
3063 		*s = val;
3064 		return 0;
3065 	}
3066 
3067 	return -1;
3068 }
3069 
3070 static int record__parse_mmap_pages(const struct option *opt,
3071 				    const char *str,
3072 				    int unset __maybe_unused)
3073 {
3074 	struct record_opts *opts = opt->value;
3075 	char *s, *p;
3076 	unsigned int mmap_pages;
3077 	int ret;
3078 
3079 	if (!str)
3080 		return -EINVAL;
3081 
3082 	s = strdup(str);
3083 	if (!s)
3084 		return -ENOMEM;
3085 
3086 	p = strchr(s, ',');
3087 	if (p)
3088 		*p = '\0';
3089 
3090 	if (*s) {
3091 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3092 		if (ret)
3093 			goto out_free;
3094 		opts->mmap_pages = mmap_pages;
3095 	}
3096 
3097 	if (!p) {
3098 		ret = 0;
3099 		goto out_free;
3100 	}
3101 
3102 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3103 	if (ret)
3104 		goto out_free;
3105 
3106 	opts->auxtrace_mmap_pages = mmap_pages;
3107 
3108 out_free:
3109 	free(s);
3110 	return ret;
3111 }
3112 
3113 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3114 {
3115 }
3116 
3117 static int parse_control_option(const struct option *opt,
3118 				const char *str,
3119 				int unset __maybe_unused)
3120 {
3121 	struct record_opts *opts = opt->value;
3122 
3123 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3124 }
3125 
3126 static void switch_output_size_warn(struct record *rec)
3127 {
3128 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3129 	struct switch_output *s = &rec->switch_output;
3130 
3131 	wakeup_size /= 2;
3132 
3133 	if (s->size < wakeup_size) {
3134 		char buf[100];
3135 
3136 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3137 		pr_warning("WARNING: switch-output data size lower than "
3138 			   "wakeup kernel buffer size (%s) "
3139 			   "expect bigger perf.data sizes\n", buf);
3140 	}
3141 }
3142 
3143 static int switch_output_setup(struct record *rec)
3144 {
3145 	struct switch_output *s = &rec->switch_output;
3146 	static struct parse_tag tags_size[] = {
3147 		{ .tag  = 'B', .mult = 1       },
3148 		{ .tag  = 'K', .mult = 1 << 10 },
3149 		{ .tag  = 'M', .mult = 1 << 20 },
3150 		{ .tag  = 'G', .mult = 1 << 30 },
3151 		{ .tag  = 0 },
3152 	};
3153 	static struct parse_tag tags_time[] = {
3154 		{ .tag  = 's', .mult = 1        },
3155 		{ .tag  = 'm', .mult = 60       },
3156 		{ .tag  = 'h', .mult = 60*60    },
3157 		{ .tag  = 'd', .mult = 60*60*24 },
3158 		{ .tag  = 0 },
3159 	};
3160 	unsigned long val;
3161 
3162 	/*
3163 	 * If we're using --switch-output-events, then we imply its
3164 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3165 	 *  thread to its parent.
3166 	 */
3167 	if (rec->switch_output_event_set) {
3168 		if (record__threads_enabled(rec)) {
3169 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3170 			return 0;
3171 		}
3172 		goto do_signal;
3173 	}
3174 
3175 	if (!s->set)
3176 		return 0;
3177 
3178 	if (record__threads_enabled(rec)) {
3179 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3180 		return 0;
3181 	}
3182 
3183 	if (!strcmp(s->str, "signal")) {
3184 do_signal:
3185 		s->signal = true;
3186 		pr_debug("switch-output with SIGUSR2 signal\n");
3187 		goto enabled;
3188 	}
3189 
3190 	val = parse_tag_value(s->str, tags_size);
3191 	if (val != (unsigned long) -1) {
3192 		s->size = val;
3193 		pr_debug("switch-output with %s size threshold\n", s->str);
3194 		goto enabled;
3195 	}
3196 
3197 	val = parse_tag_value(s->str, tags_time);
3198 	if (val != (unsigned long) -1) {
3199 		s->time = val;
3200 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3201 			 s->str, s->time);
3202 		goto enabled;
3203 	}
3204 
3205 	return -1;
3206 
3207 enabled:
3208 	rec->timestamp_filename = true;
3209 	s->enabled              = true;
3210 
3211 	if (s->size && !rec->opts.no_buffering)
3212 		switch_output_size_warn(rec);
3213 
3214 	return 0;
3215 }
3216 
/* Synopsis lines for 'perf record' usage output; the list is NULL-terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Exported view of the usage lines above. */
const char * const *record_usage = &__record_usage[0];
3223 
3224 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3225 				  struct perf_sample *sample, struct machine *machine)
3226 {
3227 	/*
3228 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3229 	 * no need to add them twice.
3230 	 */
3231 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3232 		return 0;
3233 	return perf_event__process_mmap(tool, event, sample, machine);
3234 }
3235 
3236 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3237 				   struct perf_sample *sample, struct machine *machine)
3238 {
3239 	/*
3240 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3241 	 * no need to add them twice.
3242 	 */
3243 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3244 		return 0;
3245 
3246 	return perf_event__process_mmap2(tool, event, sample, machine);
3247 }
3248 
3249 static int process_timestamp_boundary(struct perf_tool *tool,
3250 				      union perf_event *event __maybe_unused,
3251 				      struct perf_sample *sample,
3252 				      struct machine *machine __maybe_unused)
3253 {
3254 	struct record *rec = container_of(tool, struct record, tool);
3255 
3256 	set_timestamp_boundary(rec, sample->time);
3257 	return 0;
3258 }
3259 
3260 static int parse_record_synth_option(const struct option *opt,
3261 				     const char *str,
3262 				     int unset __maybe_unused)
3263 {
3264 	struct record_opts *opts = opt->value;
3265 	char *p = strdup(str);
3266 
3267 	if (p == NULL)
3268 		return -1;
3269 
3270 	opts->synth = parse_synth_opt(p);
3271 	free(p);
3272 
3273 	if (opts->synth < 0) {
3274 		pr_err("Invalid synth option: %s\n", str);
3275 		return -1;
3276 	}
3277 	return 0;
3278 }
3279 
3280 /*
3281  * XXX Ideally would be local to cmd_record() and passed to a record__new
3282  * because we need to have access to it in record__exit, that is called
3283  * after cmd_record() exits, but since record_options need to be accessible to
3284  * builtin-script, leave it here.
3285  *
3286  * At least we don't ouch it in all the other functions here directly.
3287  *
3288  * Just say no to tons of global variables, sigh.
3289  */
3290 static struct record record = {
3291 	.opts = {
3292 		.sample_time	     = true,
3293 		.mmap_pages	     = UINT_MAX,
3294 		.user_freq	     = UINT_MAX,
3295 		.user_interval	     = ULLONG_MAX,
3296 		.freq		     = 4000,
3297 		.target		     = {
3298 			.uses_mmap   = true,
3299 			.default_per_cpu = true,
3300 		},
3301 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3302 		.nr_threads_synthesize = 1,
3303 		.ctl_fd              = -1,
3304 		.ctl_fd_ack          = -1,
3305 		.synth               = PERF_SYNTH_ALL,
3306 	},
3307 	.tool = {
3308 		.sample		= process_sample_event,
3309 		.fork		= perf_event__process_fork,
3310 		.exit		= perf_event__process_exit,
3311 		.comm		= perf_event__process_comm,
3312 		.namespaces	= perf_event__process_namespaces,
3313 		.mmap		= build_id__process_mmap,
3314 		.mmap2		= build_id__process_mmap2,
3315 		.itrace_start	= process_timestamp_boundary,
3316 		.aux		= process_timestamp_boundary,
3317 		.ordered_events	= true,
3318 	},
3319 };
3320 
3321 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3322 	"\n\t\t\t\tDefault: fp";
3323 
3324 static bool dry_run;
3325 
3326 static struct parse_events_option_args parse_events_option_args = {
3327 	.evlistp = &record.evlist,
3328 };
3329 
3330 static struct parse_events_option_args switch_output_parse_events_option_args = {
3331 	.evlistp = &record.sb_evlist,
3332 };
3333 
3334 /*
3335  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3336  * with it and switch to use the library functions in perf_evlist that came
3337  * from builtin-record.c, i.e. use record_opts,
3338  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3339  * using pipes, etc.
3340  */
3341 static struct option __record_options[] = {
3342 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3343 		     "event selector. use 'perf list' to list available events",
3344 		     parse_events_option),
3345 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3346 		     "event filter", parse_filter),
3347 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3348 			   NULL, "don't record events from perf itself",
3349 			   exclude_perf),
3350 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3351 		    "record events on existing process id"),
3352 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3353 		    "record events on existing thread id"),
3354 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3355 		    "collect data with this RT SCHED_FIFO priority"),
3356 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3357 		    "collect data without buffering"),
3358 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3359 		    "collect raw sample records from all opened counters"),
3360 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3361 			    "system-wide collection from all CPUs"),
3362 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3363 		    "list of cpus to monitor"),
3364 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3365 	OPT_STRING('o', "output", &record.data.path, "file",
3366 		    "output file name"),
3367 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3368 			&record.opts.no_inherit_set,
3369 			"child tasks do not inherit counters"),
3370 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3371 		    "synthesize non-sample events at the end of output"),
3372 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3373 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3374 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3375 		    "Fail if the specified frequency can't be used"),
3376 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3377 		     "profile at this frequency",
3378 		      record__parse_freq),
3379 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3380 		     "number of mmap data pages and AUX area tracing mmap pages",
3381 		     record__parse_mmap_pages),
3382 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3383 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3384 		     record__mmap_flush_parse),
3385 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3386 			   NULL, "enables call-graph recording" ,
3387 			   &record_callchain_opt),
3388 	OPT_CALLBACK(0, "call-graph", &record.opts,
3389 		     "record_mode[,record_size]", record_callchain_help,
3390 		     &record_parse_callchain_opt),
3391 	OPT_INCR('v', "verbose", &verbose,
3392 		    "be more verbose (show counter open errors, etc)"),
3393 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3394 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3395 		    "per thread counts"),
3396 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3397 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3398 		    "Record the sample physical addresses"),
3399 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3400 		    "Record the sampled data address data page size"),
3401 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3402 		    "Record the sampled code address (ip) page size"),
3403 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3404 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3405 		    "Record the sample identifier"),
3406 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3407 			&record.opts.sample_time_set,
3408 			"Record the sample timestamps"),
3409 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3410 			"Record the sample period"),
3411 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3412 		    "don't sample"),
3413 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3414 			&record.no_buildid_cache_set,
3415 			"do not update the buildid cache"),
3416 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3417 			&record.no_buildid_set,
3418 			"do not collect buildids in perf.data"),
3419 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3420 		     "monitor event in cgroup name only",
3421 		     parse_cgroups),
3422 	OPT_CALLBACK('D', "delay", &record, "ms",
3423 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3424 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3425 		     record__parse_event_enable_time),
3426 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3427 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3428 		   "user to profile"),
3429 
3430 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3431 		     "branch any", "sample any taken branches",
3432 		     parse_branch_stack),
3433 
3434 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3435 		     "branch filter mask", "branch stack filter modes",
3436 		     parse_branch_stack),
3437 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3438 		    "sample by weight (on special events only)"),
3439 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3440 		    "sample transaction flags (special events only)"),
3441 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3442 		    "use per-thread mmaps"),
3443 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3444 		    "sample selected machine registers on interrupt,"
3445 		    " use '-I?' to list register names", parse_intr_regs),
3446 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3447 		    "sample selected machine registers on interrupt,"
3448 		    " use '--user-regs=?' to list register names", parse_user_regs),
3449 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3450 		    "Record running/enabled time of read (:S) events"),
3451 	OPT_CALLBACK('k', "clockid", &record.opts,
3452 	"clockid", "clockid to use for events, see clock_gettime()",
3453 	parse_clockid),
3454 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3455 			  "opts", "AUX area tracing Snapshot Mode", ""),
3456 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3457 			  "opts", "sample AUX area", ""),
3458 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3459 			"per thread proc mmap processing timeout in ms"),
3460 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3461 		    "Record namespaces events"),
3462 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3463 		    "Record cgroup events"),
3464 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3465 			&record.opts.record_switch_events_set,
3466 			"Record context switch events"),
3467 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3468 			 "Configure all used events to run in kernel space.",
3469 			 PARSE_OPT_EXCLUSIVE),
3470 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3471 			 "Configure all used events to run in user space.",
3472 			 PARSE_OPT_EXCLUSIVE),
3473 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3474 		    "collect kernel callchains"),
3475 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3476 		    "collect user callchains"),
3477 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3478 		   "file", "vmlinux pathname"),
3479 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3480 		    "Record build-id of all DSOs regardless of hits"),
3481 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3482 		    "Record build-id in map events"),
3483 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3484 		    "append timestamp to output filename"),
3485 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3486 		    "Record timestamp boundary (time of first/last samples)"),
3487 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3488 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3489 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3490 			  "signal"),
3491 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3492 			 &record.switch_output_event_set, "switch output event",
3493 			 "switch output event selector. use 'perf list' to list available events",
3494 			 parse_events_option_new_evlist),
3495 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3496 		   "Limit number of switch output generated files"),
3497 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3498 		    "Parse options then exit"),
3499 #ifdef HAVE_AIO_SUPPORT
3500 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3501 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3502 		     record__aio_parse),
3503 #endif
3504 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3505 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3506 		     record__parse_affinity),
3507 #ifdef HAVE_ZSTD_SUPPORT
3508 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3509 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3510 			    record__parse_comp_level),
3511 #endif
3512 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3513 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3514 	OPT_UINTEGER(0, "num-thread-synthesize",
3515 		     &record.opts.nr_threads_synthesize,
3516 		     "number of threads to run for event synthesis"),
3517 #ifdef HAVE_LIBPFM
3518 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3519 		"libpfm4 event selector. use 'perf list' to list available events",
3520 		parse_libpfm_events_option),
3521 #endif
3522 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3523 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3524 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3525 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3526 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3527 		      parse_control_option),
3528 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3529 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3530 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3531 			  &record.debuginfod.set, "debuginfod urls",
3532 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3533 			  "system"),
3534 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3535 			    "write collected trace data into several data files using parallel threads",
3536 			    record__parse_threads),
3537 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3538 	OPT_END()
3539 };
3540 
3541 struct option *record_options = __record_options;
3542 
3543 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3544 {
3545 	struct perf_cpu cpu;
3546 	int idx;
3547 
3548 	if (cpu_map__is_dummy(cpus))
3549 		return 0;
3550 
3551 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3552 		if (cpu.cpu == -1)
3553 			continue;
3554 		/* Return ENODEV is input cpu is greater than max cpu */
3555 		if ((unsigned long)cpu.cpu > mask->nbits)
3556 			return -ENODEV;
3557 		__set_bit(cpu.cpu, mask->bits);
3558 	}
3559 
3560 	return 0;
3561 }
3562 
3563 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3564 {
3565 	struct perf_cpu_map *cpus;
3566 
3567 	cpus = perf_cpu_map__new(mask_spec);
3568 	if (!cpus)
3569 		return -ENOMEM;
3570 
3571 	bitmap_zero(mask->bits, mask->nbits);
3572 	if (record__mmap_cpu_mask_init(mask, cpus))
3573 		return -ENODEV;
3574 
3575 	perf_cpu_map__put(cpus);
3576 
3577 	return 0;
3578 }
3579 
3580 static void record__free_thread_masks(struct record *rec, int nr_threads)
3581 {
3582 	int t;
3583 
3584 	if (rec->thread_masks)
3585 		for (t = 0; t < nr_threads; t++)
3586 			record__thread_mask_free(&rec->thread_masks[t]);
3587 
3588 	zfree(&rec->thread_masks);
3589 }
3590 
3591 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3592 {
3593 	int t, ret;
3594 
3595 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3596 	if (!rec->thread_masks) {
3597 		pr_err("Failed to allocate thread masks\n");
3598 		return -ENOMEM;
3599 	}
3600 
3601 	for (t = 0; t < nr_threads; t++) {
3602 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3603 		if (ret) {
3604 			pr_err("Failed to allocate thread masks[%d]\n", t);
3605 			goto out_free;
3606 		}
3607 	}
3608 
3609 	return 0;
3610 
3611 out_free:
3612 	record__free_thread_masks(rec, nr_threads);
3613 
3614 	return ret;
3615 }
3616 
3617 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3618 {
3619 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3620 
3621 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3622 	if (ret)
3623 		return ret;
3624 
3625 	rec->nr_threads = nr_cpus;
3626 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3627 
3628 	for (t = 0; t < rec->nr_threads; t++) {
3629 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3630 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3631 		if (verbose > 0) {
3632 			pr_debug("thread_masks[%d]: ", t);
3633 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3634 			pr_debug("thread_masks[%d]: ", t);
3635 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3636 		}
3637 	}
3638 
3639 	return 0;
3640 }
3641 
3642 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3643 					  const char **maps_spec, const char **affinity_spec,
3644 					  u32 nr_spec)
3645 {
3646 	u32 s;
3647 	int ret = 0, t = 0;
3648 	struct mmap_cpu_mask cpus_mask;
3649 	struct thread_mask thread_mask, full_mask, *thread_masks;
3650 
3651 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3652 	if (ret) {
3653 		pr_err("Failed to allocate CPUs mask\n");
3654 		return ret;
3655 	}
3656 
3657 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3658 	if (ret) {
3659 		pr_err("Failed to init cpu mask\n");
3660 		goto out_free_cpu_mask;
3661 	}
3662 
3663 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3664 	if (ret) {
3665 		pr_err("Failed to allocate full mask\n");
3666 		goto out_free_cpu_mask;
3667 	}
3668 
3669 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3670 	if (ret) {
3671 		pr_err("Failed to allocate thread mask\n");
3672 		goto out_free_full_and_cpu_masks;
3673 	}
3674 
3675 	for (s = 0; s < nr_spec; s++) {
3676 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3677 		if (ret) {
3678 			pr_err("Failed to initialize maps thread mask\n");
3679 			goto out_free;
3680 		}
3681 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3682 		if (ret) {
3683 			pr_err("Failed to initialize affinity thread mask\n");
3684 			goto out_free;
3685 		}
3686 
3687 		/* ignore invalid CPUs but do not allow empty masks */
3688 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3689 				cpus_mask.bits, thread_mask.maps.nbits)) {
3690 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3691 			ret = -EINVAL;
3692 			goto out_free;
3693 		}
3694 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3695 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3696 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3697 			ret = -EINVAL;
3698 			goto out_free;
3699 		}
3700 
3701 		/* do not allow intersection with other masks (full_mask) */
3702 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3703 				      thread_mask.maps.nbits)) {
3704 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3705 			ret = -EINVAL;
3706 			goto out_free;
3707 		}
3708 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3709 				      thread_mask.affinity.nbits)) {
3710 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3711 			ret = -EINVAL;
3712 			goto out_free;
3713 		}
3714 
3715 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3716 			  thread_mask.maps.bits, full_mask.maps.nbits);
3717 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3718 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3719 
3720 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3721 		if (!thread_masks) {
3722 			pr_err("Failed to reallocate thread masks\n");
3723 			ret = -ENOMEM;
3724 			goto out_free;
3725 		}
3726 		rec->thread_masks = thread_masks;
3727 		rec->thread_masks[t] = thread_mask;
3728 		if (verbose > 0) {
3729 			pr_debug("thread_masks[%d]: ", t);
3730 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3731 			pr_debug("thread_masks[%d]: ", t);
3732 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3733 		}
3734 		t++;
3735 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3736 		if (ret) {
3737 			pr_err("Failed to allocate thread mask\n");
3738 			goto out_free_full_and_cpu_masks;
3739 		}
3740 	}
3741 	rec->nr_threads = t;
3742 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3743 	if (!rec->nr_threads)
3744 		ret = -EINVAL;
3745 
3746 out_free:
3747 	record__thread_mask_free(&thread_mask);
3748 out_free_full_and_cpu_masks:
3749 	record__thread_mask_free(&full_mask);
3750 out_free_cpu_mask:
3751 	record__mmap_cpu_mask_free(&cpus_mask);
3752 
3753 	return ret;
3754 }
3755 
3756 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3757 {
3758 	int ret;
3759 	struct cpu_topology *topo;
3760 
3761 	topo = cpu_topology__new();
3762 	if (!topo) {
3763 		pr_err("Failed to allocate CPU topology\n");
3764 		return -ENOMEM;
3765 	}
3766 
3767 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3768 					     topo->core_cpus_list, topo->core_cpus_lists);
3769 	cpu_topology__delete(topo);
3770 
3771 	return ret;
3772 }
3773 
3774 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3775 {
3776 	int ret;
3777 	struct cpu_topology *topo;
3778 
3779 	topo = cpu_topology__new();
3780 	if (!topo) {
3781 		pr_err("Failed to allocate CPU topology\n");
3782 		return -ENOMEM;
3783 	}
3784 
3785 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3786 					     topo->package_cpus_list, topo->package_cpus_lists);
3787 	cpu_topology__delete(topo);
3788 
3789 	return ret;
3790 }
3791 
3792 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3793 {
3794 	u32 s;
3795 	int ret;
3796 	const char **spec;
3797 	struct numa_topology *topo;
3798 
3799 	topo = numa_topology__new();
3800 	if (!topo) {
3801 		pr_err("Failed to allocate NUMA topology\n");
3802 		return -ENOMEM;
3803 	}
3804 
3805 	spec = zalloc(topo->nr * sizeof(char *));
3806 	if (!spec) {
3807 		pr_err("Failed to allocate NUMA spec\n");
3808 		ret = -ENOMEM;
3809 		goto out_delete_topo;
3810 	}
3811 	for (s = 0; s < topo->nr; s++)
3812 		spec[s] = topo->nodes[s].cpus;
3813 
3814 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3815 
3816 	zfree(&spec);
3817 
3818 out_delete_topo:
3819 	numa_topology__delete(topo);
3820 
3821 	return ret;
3822 }
3823 
3824 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3825 {
3826 	int t, ret;
3827 	u32 s, nr_spec = 0;
3828 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3829 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3830 
3831 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3832 		spec = strtok_r(user_spec, ":", &spec_ptr);
3833 		if (spec == NULL)
3834 			break;
3835 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3836 		mask = strtok_r(spec, "/", &mask_ptr);
3837 		if (mask == NULL)
3838 			break;
3839 		pr_debug2("  maps mask: %s\n", mask);
3840 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3841 		if (!tmp_spec) {
3842 			pr_err("Failed to reallocate maps spec\n");
3843 			ret = -ENOMEM;
3844 			goto out_free;
3845 		}
3846 		maps_spec = tmp_spec;
3847 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3848 		if (!maps_spec[nr_spec]) {
3849 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3850 			ret = -ENOMEM;
3851 			goto out_free;
3852 		}
3853 		mask = strtok_r(NULL, "/", &mask_ptr);
3854 		if (mask == NULL) {
3855 			pr_err("Invalid thread maps or affinity specs\n");
3856 			ret = -EINVAL;
3857 			goto out_free;
3858 		}
3859 		pr_debug2("  affinity mask: %s\n", mask);
3860 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3861 		if (!tmp_spec) {
3862 			pr_err("Failed to reallocate affinity spec\n");
3863 			ret = -ENOMEM;
3864 			goto out_free;
3865 		}
3866 		affinity_spec = tmp_spec;
3867 		affinity_spec[nr_spec] = strdup(mask);
3868 		if (!affinity_spec[nr_spec]) {
3869 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3870 			ret = -ENOMEM;
3871 			goto out_free;
3872 		}
3873 		dup_mask = NULL;
3874 		nr_spec++;
3875 	}
3876 
3877 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3878 					     (const char **)affinity_spec, nr_spec);
3879 
3880 out_free:
3881 	free(dup_mask);
3882 	for (s = 0; s < nr_spec; s++) {
3883 		if (maps_spec)
3884 			free(maps_spec[s]);
3885 		if (affinity_spec)
3886 			free(affinity_spec[s]);
3887 	}
3888 	free(affinity_spec);
3889 	free(maps_spec);
3890 
3891 	return ret;
3892 }
3893 
3894 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3895 {
3896 	int ret;
3897 
3898 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3899 	if (ret)
3900 		return ret;
3901 
3902 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3903 		return -ENODEV;
3904 
3905 	rec->nr_threads = 1;
3906 
3907 	return 0;
3908 }
3909 
3910 static int record__init_thread_masks(struct record *rec)
3911 {
3912 	int ret = 0;
3913 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3914 
3915 	if (!record__threads_enabled(rec))
3916 		return record__init_thread_default_masks(rec, cpus);
3917 
3918 	if (evlist__per_thread(rec->evlist)) {
3919 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3920 		return -EINVAL;
3921 	}
3922 
3923 	switch (rec->opts.threads_spec) {
3924 	case THREAD_SPEC__CPU:
3925 		ret = record__init_thread_cpu_masks(rec, cpus);
3926 		break;
3927 	case THREAD_SPEC__CORE:
3928 		ret = record__init_thread_core_masks(rec, cpus);
3929 		break;
3930 	case THREAD_SPEC__PACKAGE:
3931 		ret = record__init_thread_package_masks(rec, cpus);
3932 		break;
3933 	case THREAD_SPEC__NUMA:
3934 		ret = record__init_thread_numa_masks(rec, cpus);
3935 		break;
3936 	case THREAD_SPEC__USER:
3937 		ret = record__init_thread_user_masks(rec, cpus);
3938 		break;
3939 	default:
3940 		break;
3941 	}
3942 
3943 	return ret;
3944 }
3945 
3946 int cmd_record(int argc, const char **argv)
3947 {
3948 	int err;
3949 	struct record *rec = &record;
3950 	char errbuf[BUFSIZ];
3951 
3952 	setlocale(LC_ALL, "");
3953 
3954 #ifndef HAVE_BPF_SKEL
3955 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3956 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3957 # undef set_nobuild
3958 #endif
3959 
3960 	rec->opts.affinity = PERF_AFFINITY_SYS;
3961 
3962 	rec->evlist = evlist__new();
3963 	if (rec->evlist == NULL)
3964 		return -ENOMEM;
3965 
3966 	err = perf_config(perf_record_config, rec);
3967 	if (err)
3968 		return err;
3969 
3970 	argc = parse_options(argc, argv, record_options, record_usage,
3971 			    PARSE_OPT_STOP_AT_NON_OPTION);
3972 	if (quiet)
3973 		perf_quiet_option();
3974 
3975 	err = symbol__validate_sym_arguments();
3976 	if (err)
3977 		return err;
3978 
3979 	perf_debuginfod_setup(&record.debuginfod);
3980 
3981 	/* Make system wide (-a) the default target. */
3982 	if (!argc && target__none(&rec->opts.target))
3983 		rec->opts.target.system_wide = true;
3984 
3985 	if (nr_cgroups && !rec->opts.target.system_wide) {
3986 		usage_with_options_msg(record_usage, record_options,
3987 			"cgroup monitoring only available in system-wide mode");
3988 
3989 	}
3990 
3991 	if (rec->buildid_mmap) {
3992 		if (!perf_can_record_build_id()) {
3993 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3994 			err = -EINVAL;
3995 			goto out_opts;
3996 		}
3997 		pr_debug("Enabling build id in mmap2 events.\n");
3998 		/* Enable mmap build id synthesizing. */
3999 		symbol_conf.buildid_mmap2 = true;
4000 		/* Enable perf_event_attr::build_id bit. */
4001 		rec->opts.build_id = true;
4002 		/* Disable build id cache. */
4003 		rec->no_buildid = true;
4004 	}
4005 
4006 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4007 		pr_err("Kernel has no cgroup sampling support.\n");
4008 		err = -EINVAL;
4009 		goto out_opts;
4010 	}
4011 
4012 	if (rec->opts.kcore)
4013 		rec->opts.text_poke = true;
4014 
4015 	if (rec->opts.kcore || record__threads_enabled(rec))
4016 		rec->data.is_dir = true;
4017 
4018 	if (record__threads_enabled(rec)) {
4019 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4020 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4021 			goto out_opts;
4022 		}
4023 		if (record__aio_enabled(rec)) {
4024 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4025 			goto out_opts;
4026 		}
4027 	}
4028 
4029 	if (rec->opts.comp_level != 0) {
4030 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4031 		rec->no_buildid = true;
4032 	}
4033 
4034 	if (rec->opts.record_switch_events &&
4035 	    !perf_can_record_switch_events()) {
4036 		ui__error("kernel does not support recording context switch events\n");
4037 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4038 		err = -EINVAL;
4039 		goto out_opts;
4040 	}
4041 
4042 	if (switch_output_setup(rec)) {
4043 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4044 		err = -EINVAL;
4045 		goto out_opts;
4046 	}
4047 
4048 	if (rec->switch_output.time) {
4049 		signal(SIGALRM, alarm_sig_handler);
4050 		alarm(rec->switch_output.time);
4051 	}
4052 
4053 	if (rec->switch_output.num_files) {
4054 		rec->switch_output.filenames = calloc(sizeof(char *),
4055 						      rec->switch_output.num_files);
4056 		if (!rec->switch_output.filenames) {
4057 			err = -EINVAL;
4058 			goto out_opts;
4059 		}
4060 	}
4061 
4062 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4063 		rec->timestamp_filename = false;
4064 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4065 	}
4066 
4067 	/*
4068 	 * Allow aliases to facilitate the lookup of symbols for address
4069 	 * filters. Refer to auxtrace_parse_filters().
4070 	 */
4071 	symbol_conf.allow_aliases = true;
4072 
4073 	symbol__init(NULL);
4074 
4075 	err = record__auxtrace_init(rec);
4076 	if (err)
4077 		goto out;
4078 
4079 	if (dry_run)
4080 		goto out;
4081 
4082 	err = -ENOMEM;
4083 
4084 	if (rec->no_buildid_cache || rec->no_buildid) {
4085 		disable_buildid_cache();
4086 	} else if (rec->switch_output.enabled) {
4087 		/*
4088 		 * In 'perf record --switch-output', disable buildid
4089 		 * generation by default to reduce data file switching
4090 		 * overhead. Still generate buildid if they are required
4091 		 * explicitly using
4092 		 *
4093 		 *  perf record --switch-output --no-no-buildid \
4094 		 *              --no-no-buildid-cache
4095 		 *
4096 		 * Following code equals to:
4097 		 *
4098 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4099 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4100 		 *         disable_buildid_cache();
4101 		 */
4102 		bool disable = true;
4103 
4104 		if (rec->no_buildid_set && !rec->no_buildid)
4105 			disable = false;
4106 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4107 			disable = false;
4108 		if (disable) {
4109 			rec->no_buildid = true;
4110 			rec->no_buildid_cache = true;
4111 			disable_buildid_cache();
4112 		}
4113 	}
4114 
4115 	if (record.opts.overwrite)
4116 		record.opts.tail_synthesize = true;
4117 
4118 	if (rec->evlist->core.nr_entries == 0) {
4119 		bool can_profile_kernel = perf_event_paranoid_check(1);
4120 
4121 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4122 		if (err)
4123 			goto out;
4124 	}
4125 
4126 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4127 		rec->opts.no_inherit = true;
4128 
4129 	err = target__validate(&rec->opts.target);
4130 	if (err) {
4131 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4132 		ui__warning("%s\n", errbuf);
4133 	}
4134 
4135 	err = target__parse_uid(&rec->opts.target);
4136 	if (err) {
4137 		int saved_errno = errno;
4138 
4139 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4140 		ui__error("%s", errbuf);
4141 
4142 		err = -saved_errno;
4143 		goto out;
4144 	}
4145 
4146 	/* Enable ignoring missing threads when -u/-p option is defined. */
4147 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4148 
4149 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4150 
4151 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4152 		arch__add_leaf_frame_record_opts(&rec->opts);
4153 
4154 	err = -ENOMEM;
4155 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4156 		if (rec->opts.target.pid != NULL) {
4157 			pr_err("Couldn't create thread/CPU maps: %s\n",
4158 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4159 			goto out;
4160 		}
4161 		else
4162 			usage_with_options(record_usage, record_options);
4163 	}
4164 
4165 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4166 	if (err)
4167 		goto out;
4168 
4169 	/*
4170 	 * We take all buildids when the file contains
4171 	 * AUX area tracing data because we do not decode the
4172 	 * trace because it would take too long.
4173 	 */
4174 	if (rec->opts.full_auxtrace)
4175 		rec->buildid_all = true;
4176 
4177 	if (rec->opts.text_poke) {
4178 		err = record__config_text_poke(rec->evlist);
4179 		if (err) {
4180 			pr_err("record__config_text_poke failed, error %d\n", err);
4181 			goto out;
4182 		}
4183 	}
4184 
4185 	if (rec->off_cpu) {
4186 		err = record__config_off_cpu(rec);
4187 		if (err) {
4188 			pr_err("record__config_off_cpu failed, error %d\n", err);
4189 			goto out;
4190 		}
4191 	}
4192 
4193 	if (record_opts__config(&rec->opts)) {
4194 		err = -EINVAL;
4195 		goto out;
4196 	}
4197 
4198 	err = record__init_thread_masks(rec);
4199 	if (err) {
4200 		pr_err("Failed to initialize parallel data streaming masks\n");
4201 		goto out;
4202 	}
4203 
4204 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4205 		rec->opts.nr_cblocks = nr_cblocks_max;
4206 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4207 
4208 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4209 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4210 
4211 	if (rec->opts.comp_level > comp_level_max)
4212 		rec->opts.comp_level = comp_level_max;
4213 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4214 
4215 	err = __cmd_record(&record, argc, argv);
4216 out:
4217 	evlist__delete(rec->evlist);
4218 	symbol__exit();
4219 	auxtrace_record__free(rec->itr);
4220 out_opts:
4221 	record__free_thread_masks(rec, rec->nr_threads);
4222 	rec->nr_threads = 0;
4223 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4224 	return err;
4225 }
4226 
4227 static void snapshot_sig_handler(int sig __maybe_unused)
4228 {
4229 	struct record *rec = &record;
4230 
4231 	hit_auxtrace_snapshot_trigger(rec);
4232 
4233 	if (switch_output_signal(rec))
4234 		trigger_hit(&switch_output_trigger);
4235 }
4236 
4237 static void alarm_sig_handler(int sig __maybe_unused)
4238 {
4239 	struct record *rec = &record;
4240 
4241 	if (switch_output_time(rec))
4242 		trigger_hit(&switch_output_trigger);
4243 }
4244