xref: /linux/tools/perf/builtin-record.c (revision 82aff6cc070417f26f9b02b26e63c17ff43b4044)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/mutex.h"
25 #include "util/target.h"
26 #include "util/session.h"
27 #include "util/tool.h"
28 #include "util/symbol.h"
29 #include "util/record.h"
30 #include "util/cpumap.h"
31 #include "util/thread_map.h"
32 #include "util/data.h"
33 #include "util/perf_regs.h"
34 #include "util/auxtrace.h"
35 #include "util/tsc.h"
36 #include "util/parse-branch-options.h"
37 #include "util/parse-regs-options.h"
38 #include "util/perf_api_probe.h"
39 #include "util/llvm-utils.h"
40 #include "util/bpf-loader.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/clockid.h"
51 #include "util/pmu-hybrid.h"
52 #include "util/evlist-hybrid.h"
53 #include "util/off_cpu.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/*
 * State for the --switch-output feature: rotate the output when a signal
 * arrives, when a byte-size threshold is crossed, or on a time interval
 * (see switch_output_signal()/size()/time() below).
 */
struct switch_output {
	bool		 enabled;	/* any switch-output mode requested */
	bool		 signal;	/* rotate on signal */
	unsigned long	 size;		/* rotate after this many bytes (0 = off) */
	unsigned long	 time;		/* rotate on a time interval (0 = off) */
	const char	*str;		/* raw option string from the command line */
	bool		 set;		/* option explicitly given by the user */
	char		 **filenames;	/* generated output file names — presumably a ring, confirm in cmd_record */
	int		 num_files;	/* capacity of filenames[] */
	int		 cur_file;	/* index of most recently used entry */
};
95 
/*
 * CPU masks for one recording thread: 'maps' selects which mmaps the
 * thread reads, 'affinity' the CPUs it is bound to (presumably via
 * sched affinity — confirm at the usage site).
 */
struct thread_mask {
	struct mmap_cpu_mask	maps;
	struct mmap_cpu_mask	affinity;
};
100 
/*
 * Per-thread state for parallel trace streaming: the set of mmaps the
 * thread services, its private pollfd array, the pipes used to talk to
 * the main thread, and per-thread byte/sample accounting.
 */
struct record_thread {
	pid_t			tid;	/* worker thread id (-1 before start) */
	struct thread_mask	*mask;	/* maps/affinity masks for this thread */
	struct {
		int		msg[2];	/* main <- worker message pipe */
		int		ack[2];	/* main -> worker ack pipe */
	} pipes;
	struct fdarray		pollfd;	/* fds this thread polls on */
	int			ctlfd_pos;
	int			nr_mmaps;
	struct mmap		**maps;		/* non-overwrite mmaps served here */
	struct mmap		**overwrite_maps;
	struct record		*rec;		/* back pointer to the session */
	unsigned long long	samples;
	unsigned long		waking;		/* times poll() woke this thread */
	u64			bytes_written;		/* raw bytes written to output */
	u64			bytes_transferred;	/* bytes before compression */
	u64			bytes_compressed;	/* bytes after compression */
};
120 
/* Current thread's record_thread; thread-local so record__pushfn() et al.
 * can account samples/bytes without passing it through every callback. */
static __thread struct record_thread *thread;

/* Messages a worker thread sends to the main thread over pipes.msg. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* String names for enum thread_msg values. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	"UNDEFINED", "READY"
};

/* Ways --threads can partition mmaps over worker threads. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Command-line spellings for enum thread_spec values. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	"undefined", "cpu", "core", "package", "numa", "user"
};

/*
 * Links an entry of the evlist's pollfd array to the duplicate of that
 * fd in a thread's private pollfd array, so revents can be copied back
 * (see record__update_evlist_pollfd_from_thread()).
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
151 
/* Everything 'perf record' needs for one recording session. */
struct record {
	struct perf_tool	tool;		/* event-processing callbacks */
	struct record_opts	opts;
	u64			bytes_written;	/* main-thread output byte count */
	struct perf_data	data;		/* output file/directory */
	struct auxtrace_record	*itr;		/* AUX area tracing ops, if any */
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;	/* side-band event list */
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;	/* --off-cpu profiling requested */
	struct switch_output	switch_output;	/* output rotation state */
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	/* Parallel streaming (--threads) state: */
	int			nr_threads;
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;
	struct pollfd_index_map	*index_map;	/* evlist<->thread pollfd mapping */
	size_t			index_map_sz;	/* allocated entries in index_map */
	size_t			index_map_cnt;	/* used entries in index_map */
};
184 
/* Set to 1 (by signal handlers or the size limit) to end the record loop. */
static volatile int done;

/* Non-zero once an AUX snapshot has been kicked off; checked in
 * record__auxtrace_snapshot_exit() (set elsewhere — not visible here). */
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

/* Names for the PERF_AFFINITY_* modes (record_opts.affinity). */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};

#ifndef HAVE_GETTID
/* Fallback for libcs that do not wrap the gettid(2) syscall. */
static inline pid_t gettid(void)
{
	return (pid_t)syscall(__NR_gettid);
}
#endif
201 
/*
 * Non-zero when parallel trace streaming (--threads) is in effect; the
 * value is the requested thread specification (enum thread_spec or count).
 */
static int record__threads_enabled(struct record *rec)
{
	return rec->opts.threads_spec;
}
206 
/* True when signal-driven output rotation is requested and the trigger armed. */
static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

/* True when size-driven rotation is armed and the threshold was crossed. */
static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

/* True when time-driven output rotation is requested and the trigger armed. */
static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}
225 
226 static u64 record__bytes_written(struct record *rec)
227 {
228 	int t;
229 	u64 bytes_written = rec->bytes_written;
230 	struct record_thread *thread_data = rec->thread_data;
231 
232 	for (t = 0; t < rec->nr_threads; t++)
233 		bytes_written += thread_data[t].bytes_written;
234 
235 	return bytes_written;
236 }
237 
/* True when --max-size is set and total output has reached the limit. */
static bool record__output_max_size_exceeded(struct record *rec)
{
	return rec->output_max_size &&
	       (record__bytes_written(rec) >= rec->output_max_size);
}
243 
/*
 * Write @size bytes at @bf to the output. In parallel streaming mode a
 * map carries its own output file (map->file) and the current worker's
 * per-thread byte counter is updated; otherwise data goes to the single
 * session file and is accounted on the main struct record. Also enforces
 * --max-size by setting 'done' and arms the switch-output trigger when
 * the size threshold is crossed. Returns 0 on success, -1 on I/O error.
 */
static int record__write(struct record *rec, struct mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	/* A per-map file means threaded mode: route the data there. */
	if (map && map->file)
		file = map->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	if (map && map->file)
		thread->bytes_written += size;	/* worker-thread accounting (TLS) */
	else
		rec->bytes_written += size;

	/* --max-size: report once and ask the main loop to stop. */
	if (record__output_max_size_exceeded(rec) && !done) {
		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
				" stopping session ]\n",
				record__bytes_written(rec) >> 10);
		done = 1;
	}

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}
274 
/* Forward declarations for helpers used by the write/push paths below. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
279 
280 #ifdef HAVE_AIO_SUPPORT
281 static int record__aio_write(struct aiocb *cblock, int trace_fd,
282 		void *buf, size_t size, off_t off)
283 {
284 	int rc;
285 
286 	cblock->aio_fildes = trace_fd;
287 	cblock->aio_buf    = buf;
288 	cblock->aio_nbytes = size;
289 	cblock->aio_offset = off;
290 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
291 
292 	do {
293 		rc = aio_write(cblock);
294 		if (rc == 0) {
295 			break;
296 		} else if (errno != EAGAIN) {
297 			cblock->aio_fildes = -1;
298 			pr_err("failed to queue perf data, error: %m\n");
299 			break;
300 		}
301 	} while (1);
302 
303 	return rc;
304 }
305 
/*
 * Check the completion state of one AIO control block.
 *
 * Returns 0 while the write is still in flight (including when a short
 * write had to be restarted for the remaining bytes), and 1 once the
 * write fully completed — in which case the cblock is marked free and
 * the mmap reference taken in record__aio_pushfn() is dropped.
 */
static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;	/* treat as nothing written; restart below */
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * aio write request may require restart with the
		 * reminder if the kernel didn't write whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}
351 
/*
 * Wait for AIO writes on @md. With @sync_all, block until every in-flight
 * cblock has completed, then return -1. Otherwise return the index of the
 * first free/completed cblock, sleeping in 1ms aio_suspend() slices while
 * all of them are busy.
 */
static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				/* Free slot: either hand it out or keep draining. */
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;	/* sync_all: everything drained */

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
386 
/* Context handed from record__aio_push() to record__aio_pushfn(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination AIO buffer (map->aio.data[idx]) */
	size_t		size;	/* bytes accumulated into data so far */
};

/*
 * perf_mmap__push() callback: copy (or compress) one chunk of ring-buffer
 * data into the thread's AIO buffer. Returns the number of bytes appended
 * (the compressed size when compression is on).
 */
static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
	 * to release space in the kernel buffer as fast as possible, calling
	 * perf_mmap__consume() from perf_mmap__push() function.
	 *
	 * That lets the kernel to proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Coping can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of data from map->start till the upper bound and then the reminder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
				     mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}
437 
/*
 * Drain all new data from @map's ring buffer into a free AIO buffer and
 * queue an asynchronous write at file offset *off, advancing *off past
 * the queued bytes on success. Returns 0 on success, > 0 when there was
 * no data, < 0 on error.
 */
static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till map->aio.data[] buffer
	 * becomes available after previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}

	return ret;
}
474 
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477 	return lseek(trace_fd, 0, SEEK_CUR);
478 }
479 
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482 	lseek(trace_fd, pos, SEEK_SET);
483 }
484 
/*
 * Wait for every outstanding AIO write on all of the evlist's mmaps.
 * No-op unless AIO (--aio) is enabled.
 */
static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		struct mmap *map = &maps[i];

		if (map->core.base)
			record__aio_sync(map, true);
	}
}
501 
/* Default and maximum number of AIO control blocks per mmap (--aio[=n]). */
static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

/*
 * Parse --aio[=n]: --no-aio sets the count to 0 (disabling AIO), a
 * missing or zero value selects nr_cblocks_default. The value is not
 * range-checked here (presumably clamped to nr_cblocks_max elsewhere —
 * confirm against cmd_record).
 */
static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

/* AIO compiled out: these stubs keep the call sites building. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

/* True when asynchronous output (--aio) is in effect. */
static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}
549 
#define MMAP_FLUSH_DEFAULT 1
/*
 * Parse --mmap-flush. Accepts a size with an optional B/K/M/G suffix or
 * a plain number; an absent/zero value falls back to MMAP_FLUSH_DEFAULT.
 * The result is capped at a quarter of the mmap buffer size.
 */
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag  = 'B', .mult = 1       },
			{ .tag  = 'K', .mult = 1 << 10 },
			{ .tag  = 'M', .mult = 1 << 20 },
			{ .tag  = 'G', .mult = 1 << 30 },
			{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		/* No recognized suffix: fall back to plain number parsing. */
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}
584 
#ifdef HAVE_ZSTD_SUPPORT
/* Compression level used when -z is given without an explicit value. */
static unsigned int comp_level_default = 1;

/* Parse -z/--compression-level: unset disables, missing/0 uses the default. */
static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
/* Highest accepted compression level (matches Zstd's maximum). */
static unsigned int comp_level_max = 22;

/* True when trace data is compressed before being written out. */
static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}
610 
/* perf_tool callback: write a synthesized event straight to the output. */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, NULL, event, event->header.size);
}

/* Serializes output writes when synthesis runs on multiple threads. */
static struct mutex synth_lock;

/* Thread-safe variant of process_synthesized_event(), under synth_lock. */
static int process_locked_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	int ret;

	mutex_lock(&synth_lock);
	ret = process_synthesized_event(tool, event, sample, machine);
	mutex_unlock(&synth_lock);
	return ret;
}
634 
/*
 * perf_mmap__push() callback on the synchronous path: optionally compress
 * the chunk into map->data, then write it out. Runs with the per-thread
 * 'thread' TLS pointer set, whose sample count is bumped here.
 */
static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
		bf   = map->data;
	}

	thread->samples++;
	return record__write(rec, map, bf, size);
}
647 
/* Signal that terminated the run (-1 if none); re-raised in record__sig_exit(). */
static volatile int signr = -1;
/* Set by the SIGCHLD case of sig_handler() when the forked workload exits. */
static volatile int child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* eventfd used to wake poll() from the signal handler; -1 until created. */
static int done_fd = -1;
#endif
653 
654 static void sig_handler(int sig)
655 {
656 	if (sig == SIGCHLD)
657 		child_finished = 1;
658 	else
659 		signr = sig;
660 
661 	done = 1;
662 #ifdef HAVE_EVENTFD_SUPPORT
663 {
664 	u64 tmp = 1;
665 	/*
666 	 * It is possible for this signal handler to run after done is checked
667 	 * in the main loop, but before the perf counter fds are polled. If this
668 	 * happens, the poll() will continue to wait even though done is set,
669 	 * and will only break out if either another signal is received, or the
670 	 * counters are ready for read. To ensure the poll() doesn't sleep when
671 	 * done is set, use an eventfd (done_fd) to wake up the poll().
672 	 */
673 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
674 		pr_err("failed to signal wakeup fd, error: %m\n");
675 }
676 #endif // HAVE_EVENTFD_SUPPORT
677 }
678 
/* SIGSEGV handler: undo perf-hook state, then dump a stack trace. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
684 
685 static void record__sig_exit(void)
686 {
687 	if (signr == -1)
688 		return;
689 
690 	signal(signr, SIG_DFL);
691 	raise(signr);
692 }
693 
#ifdef HAVE_AUXTRACE_SUPPORT

/*
 * Write one AUX area trace chunk: the PERF_RECORD_AUXTRACE header @event,
 * the trace data (split into two pieces when it wraps the ring buffer),
 * and zero padding up to an 8-byte boundary. For single-file, non-pipe
 * output the chunk's file offset is also added to the auxtrace index.
 */
static int record__process_auxtrace(struct perf_tool *tool,
				    struct mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}
733 
734 static int record__auxtrace_mmap_read(struct record *rec,
735 				      struct mmap *map)
736 {
737 	int ret;
738 
739 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
740 				  record__process_auxtrace);
741 	if (ret < 0)
742 		return ret;
743 
744 	if (ret)
745 		rec->samples++;
746 
747 	return 0;
748 }
749 
750 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
751 					       struct mmap *map)
752 {
753 	int ret;
754 
755 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
756 					   record__process_auxtrace,
757 					   rec->opts.auxtrace_snapshot_size);
758 	if (ret < 0)
759 		return ret;
760 
761 	if (ret)
762 		rec->samples++;
763 
764 	return 0;
765 }
766 
767 static int record__auxtrace_read_snapshot_all(struct record *rec)
768 {
769 	int i;
770 	int rc = 0;
771 
772 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
773 		struct mmap *map = &rec->evlist->mmap[i];
774 
775 		if (!map->auxtrace_mmap.base)
776 			continue;
777 
778 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
779 			rc = -1;
780 			goto out;
781 		}
782 	}
783 out:
784 	return rc;
785 }
786 
787 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
788 {
789 	pr_debug("Recording AUX area tracing snapshot\n");
790 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
791 		trigger_error(&auxtrace_snapshot_trigger);
792 	} else {
793 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
794 			trigger_error(&auxtrace_snapshot_trigger);
795 		else
796 			trigger_ready(&auxtrace_snapshot_trigger);
797 	}
798 }
799 
/*
 * Take a final AUX snapshot at exit, unless a previous snapshot attempt
 * already failed. Starts the snapshot first if nothing has (the
 * 'snapshot_started' flag is presumably set by a signal path elsewhere —
 * not visible here). Returns 0 on success, -1 on error.
 */
static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}
815 
/*
 * Set up AUX area tracing for the session: allocate the recording ops,
 * parse snapshot/sample options and filters, and regroup aux-output
 * events. AUX snapshot/sample options are rejected in parallel streaming
 * mode. Returns 0 or a negative error.
 */
static int record__auxtrace_init(struct record *rec)
{
	int err;

	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
	    && record__threads_enabled(rec)) {
		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
		return -EINVAL;
	}

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	auxtrace_regroup_aux_output(rec->evlist);

	return auxtrace_parse_filters(rec->evlist);
}
846 
#else

/* Stubs used when AUX area tracing support is compiled out: all no-ops. */

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif
880 
/*
 * Ensure a text_poke-capable event exists: if no evsel already has
 * attr.text_poke set, add an all-CPU dummy event with text_poke and
 * ksymbol reporting plus timestamps. Returns 0 or -ENOMEM.
 */
static int record__config_text_poke(struct evlist *evlist)
{
	struct evsel *evsel;

	/* Nothing to do if text poke is already configured */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.attr.text_poke)
			return 0;
	}

	evsel = evlist__add_dummy_on_all_cpus(evlist);
	if (!evsel)
		return -ENOMEM;

	evsel->core.attr.text_poke = 1;
	evsel->core.attr.ksymbol = 1;
	evsel->immediate = true;
	evsel__set_sample_bit(evsel, TIME);

	return 0;
}
902 
/* Prepare off-CPU profiling (--off-cpu) for the configured target. */
static int record__config_off_cpu(struct record *rec)
{
	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
}
907 
908 static bool record__kcore_readable(struct machine *machine)
909 {
910 	char kcore[PATH_MAX];
911 	int fd;
912 
913 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
914 
915 	fd = open(kcore, O_RDONLY);
916 	if (fd < 0)
917 		return false;
918 
919 	close(fd);
920 
921 	return true;
922 }
923 
/*
 * Copy the machine's /proc/kcore into a kcore_dir created alongside the
 * output data, so symbols can be resolved at report time. Returns 0 or
 * a negative error.
 */
static int record__kcore_copy(struct machine *machine, struct perf_data *data)
{
	char from_dir[PATH_MAX];
	char kcore_dir[PATH_MAX];
	int ret;

	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);

	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
	if (ret)
		return ret;

	return kcore_copy(from_dir, kcore_dir);
}
938 
939 static void record__thread_data_init_pipes(struct record_thread *thread_data)
940 {
941 	thread_data->pipes.msg[0] = -1;
942 	thread_data->pipes.msg[1] = -1;
943 	thread_data->pipes.ack[0] = -1;
944 	thread_data->pipes.ack[1] = -1;
945 }
946 
/*
 * Create the msg and ack pipe pairs used for main<->worker handshakes.
 * If the ack pipe cannot be created, the already-open msg pipe is closed
 * again. Returns 0 or -EINVAL.
 */
static int record__thread_data_open_pipes(struct record_thread *thread_data)
{
	if (pipe(thread_data->pipes.msg))
		return -EINVAL;

	if (pipe(thread_data->pipes.ack)) {
		close(thread_data->pipes.msg[0]);
		thread_data->pipes.msg[0] = -1;
		close(thread_data->pipes.msg[1]);
		thread_data->pipes.msg[1] = -1;
		return -EINVAL;
	}

	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);

	return 0;
}
966 
967 static void record__thread_data_close_pipes(struct record_thread *thread_data)
968 {
969 	if (thread_data->pipes.msg[0] != -1) {
970 		close(thread_data->pipes.msg[0]);
971 		thread_data->pipes.msg[0] = -1;
972 	}
973 	if (thread_data->pipes.msg[1] != -1) {
974 		close(thread_data->pipes.msg[1]);
975 		thread_data->pipes.msg[1] = -1;
976 	}
977 	if (thread_data->pipes.ack[0] != -1) {
978 		close(thread_data->pipes.ack[0]);
979 		thread_data->pipes.ack[0] = -1;
980 	}
981 	if (thread_data->pipes.ack[1] != -1) {
982 		close(thread_data->pipes.ack[1]);
983 		thread_data->pipes.ack[1] = -1;
984 	}
985 }
986 
/* True in per-thread mode, i.e. when no CPUs were requested (dummy CPU map). */
static bool evlist__per_thread(struct evlist *evlist)
{
	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
}
991 
/*
 * Populate thread_data->maps/overwrite_maps with pointers to the evlist
 * mmaps this thread services: all of them in per-thread mode, otherwise
 * only those whose CPU is set in the thread's 'maps' mask. Returns 0 or
 * -ENOMEM.
 */
static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
{
	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
	struct mmap *mmap = evlist->mmap;
	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
	struct perf_cpu_map *cpus = evlist->core.all_cpus;
	bool per_thread = evlist__per_thread(evlist);

	if (per_thread)
		thread_data->nr_mmaps = nr_mmaps;
	else
		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
						      thread_data->mask->maps.nbits);
	if (mmap) {
		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
		if (!thread_data->maps)
			return -ENOMEM;
	}
	if (overwrite_mmap) {
		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
		if (!thread_data->overwrite_maps) {
			zfree(&thread_data->maps);
			return -ENOMEM;
		}
	}
	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);

	/* mmap[m] is indexed by CPU; pick the ones in this thread's mask. */
	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
		if (per_thread ||
		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
			if (thread_data->maps) {
				thread_data->maps[tm] = &mmap[m];
				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
			}
			if (thread_data->overwrite_maps) {
				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
			}
			tm++;
		}
	}

	return 0;
}
1039 
/*
 * Build this thread's private pollfd array by duplicating, from the
 * evlist's pollfd, every entry whose private pointer refers to one of
 * the thread's mmaps. Returns 0 or a negative fdarray error.
 */
static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
{
	int f, tm, pos;
	struct mmap *map, *overwrite_map;

	fdarray__init(&thread_data->pollfd, 64);

	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
		map = thread_data->maps ? thread_data->maps[tm] : NULL;
		overwrite_map = thread_data->overwrite_maps ?
				thread_data->overwrite_maps[tm] : NULL;

		for (f = 0; f < evlist->core.pollfd.nr; f++) {
			void *ptr = evlist->core.pollfd.priv[f].ptr;

			/* Entry belongs to one of this thread's mmaps? */
			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
							      &evlist->core.pollfd);
				if (pos < 0)
					return pos;
				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
			}
		}
	}

	return 0;
}
1068 
1069 static void record__free_thread_data(struct record *rec)
1070 {
1071 	int t;
1072 	struct record_thread *thread_data = rec->thread_data;
1073 
1074 	if (thread_data == NULL)
1075 		return;
1076 
1077 	for (t = 0; t < rec->nr_threads; t++) {
1078 		record__thread_data_close_pipes(&thread_data[t]);
1079 		zfree(&thread_data[t].maps);
1080 		zfree(&thread_data[t].overwrite_maps);
1081 		fdarray__exit(&thread_data[t].pollfd);
1082 	}
1083 
1084 	zfree(&rec->thread_data);
1085 }
1086 
/*
 * Record that evlist pollfd entry @evlist_pollfd_index was duplicated as
 * entry @thread_pollfd_index in a thread's pollfd array, growing the
 * index map as needed. Returns 0 or -ENOMEM.
 */
static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
						    int evlist_pollfd_index,
						    int thread_pollfd_index)
{
	size_t x = rec->index_map_cnt;

	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
		return -ENOMEM;
	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
	rec->index_map_cnt += 1;
	return 0;
}
1100 
/*
 * Copy revents for all previously mapped descriptors back from
 * @thread_data's pollfd into the evlist's pollfd, verifying that fd and
 * requested events still match on both sides. Returns 0, or -EINVAL if
 * any pair went out of sync (remaining pairs are still copied).
 */
static int record__update_evlist_pollfd_from_thread(struct record *rec,
						    struct evlist *evlist,
						    struct record_thread *thread_data)
{
	struct pollfd *e_entries = evlist->core.pollfd.entries;
	struct pollfd *t_entries = thread_data->pollfd.entries;
	int err = 0;
	size_t i;

	for (i = 0; i < rec->index_map_cnt; i++) {
		int e_pos = rec->index_map[i].evlist_pollfd_index;
		int t_pos = rec->index_map[i].thread_pollfd_index;

		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
		    e_entries[e_pos].events != t_entries[t_pos].events) {
			pr_err("Thread and evlist pollfd index mismatch\n");
			err = -EINVAL;
			continue;
		}
		e_entries[e_pos].revents = t_entries[t_pos].revents;
	}
	return err;
}
1124 
/*
 * Duplicate every non-perf-event descriptor from the evlist's pollfd
 * into @thread_data's pollfd, recording the index mapping so revents can
 * be propagated back later. Returns 0 or a negative error.
 */
static int record__dup_non_perf_events(struct record *rec,
				       struct evlist *evlist,
				       struct record_thread *thread_data)
{
	struct fdarray *fda = &evlist->core.pollfd;
	int i, ret;

	for (i = 0; i < fda->nr; i++) {
		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
			continue;
		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
		if (ret < 0) {
			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
			return ret;
		}
		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
			  thread_data, ret, fda->entries[i].fd);
		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
		if (ret < 0) {
			pr_err("Failed to map thread and evlist pollfd indexes\n");
			return ret;
		}
	}
	return 0;
}
1150 
/*
 * Allocate and initialize rec->thread_data[0..nr_threads-1].
 *
 * Entry 0 is the main thread: it gets the current tid, duplicates the
 * evlist's non-perf-event descriptors into its pollfd and uses no
 * control pipes (ctlfd_pos = -1).  Entries 1..n are worker threads:
 * each gets msg/ack pipes and the msg read end added to its pollfd so
 * the main thread can signal termination.
 *
 * On any failure everything allocated so far is torn down via
 * record__free_thread_data().  Returns 0 on success, negative errno
 * style code otherwise.
 */
static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
{
	int t, ret;
	struct record_thread *thread_data;

	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
	if (!rec->thread_data) {
		pr_err("Failed to allocate thread data\n");
		return -ENOMEM;
	}
	thread_data = rec->thread_data;

	/* Mark all pipe fds invalid first so a partial-failure cleanup is safe. */
	for (t = 0; t < rec->nr_threads; t++)
		record__thread_data_init_pipes(&thread_data[t]);

	for (t = 0; t < rec->nr_threads; t++) {
		thread_data[t].rec = rec;
		thread_data[t].mask = &rec->thread_masks[t];
		ret = record__thread_data_init_maps(&thread_data[t], evlist);
		if (ret) {
			pr_err("Failed to initialize thread[%d] maps\n", t);
			goto out_free;
		}
		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
		if (ret) {
			pr_err("Failed to initialize thread[%d] pollfd\n", t);
			goto out_free;
		}
		if (t) {
			/* Worker thread: tid is filled in by the thread itself. */
			thread_data[t].tid = -1;
			ret = record__thread_data_open_pipes(&thread_data[t]);
			if (ret) {
				pr_err("Failed to open thread[%d] communication pipes\n", t);
				goto out_free;
			}
			/* Poll the msg pipe read end to notice termination requests. */
			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
			if (ret < 0) {
				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
				goto out_free;
			}
			thread_data[t].ctlfd_pos = ret;
			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
				 thread_data, thread_data[t].ctlfd_pos,
				 thread_data[t].pipes.msg[0]);
		} else {
			thread_data[t].tid = gettid();

			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
			if (ret < 0)
				goto out_free;

			thread_data[t].ctlfd_pos = -1; /* Not used */
		}
	}

	return 0;

out_free:
	record__free_thread_data(rec);

	return ret;
}
1214 
/*
 * mmap the evlist's ring buffers, set up the control fd, allocate the
 * per-thread data and, in threaded mode, create the perf.data directory
 * and attach one output file to each mmap.  Returns 0 on success, a
 * negative error code (or -1 for control-fd setup failure) otherwise.
 */
static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	int i, ret;
	struct record_opts *opts = &rec->opts;
	/* Snapshot/sample AUX-trace modes read the AUX area in overwrite mode. */
	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
				  opts->auxtrace_sample_mode;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 auxtrace_overwrite,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}

	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
		return -1;

	ret = record__alloc_thread_data(rec, evlist);
	if (ret)
		return ret;

	if (record__threads_enabled(rec)) {
		/* One output file per mmap; data lands in a perf.data directory. */
		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
		if (ret) {
			pr_err("Failed to create data directory: %s\n", strerror(-ret));
			return ret;
		}
		for (i = 0; i < evlist->core.nr_mmaps; i++) {
			if (evlist->mmap)
				evlist->mmap[i].file = &rec->data.dir.files[i];
			if (evlist->overwrite_mmap)
				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
		}
	}

	return 0;
}
1273 
/* Convenience wrapper: mmap the record session's own evlist. */
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
1278 
/*
 * Open all events on the evlist, applying fallbacks and weak-group
 * resets on failure, then apply filters and mmap the ring buffers.
 * Returns 0 on success, a negative error code otherwise.
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay, system wide or a hybrid system, we need to add a
	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
	 * of waiting or event synthesis.
	 */
	if (opts->initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmu__has_hybrid()) {
		pos = evlist__get_tracking_event(evlist);
		if (!evsel__is_dummy_event(pos)) {
			/* Set up dummy event. */
			if (evlist__add_dummy(evlist))
				return -ENOMEM;
			pos = evlist__last(evlist);
			evlist__set_tracking_event(evlist, pos);
		}

		/*
		 * Enable the dummy event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->initial_delay && !pos->immediate &&
		    !target__has_cpu(&opts->target))
			pos->core.attr.enable_on_exec = 1;
		else
			pos->immediate = 1;
	}

	evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/* First try a downgraded event config (e.g. drop a modifier). */
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/*
			 * A weak group member that failed to open: break up
			 * the group and retry the events standalone.
			 */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure, evlist__apply_filters() points pos at the offending evsel. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1368 
1369 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1370 {
1371 	if (rec->evlist->first_sample_time == 0)
1372 		rec->evlist->first_sample_time = sample_time;
1373 
1374 	if (sample_time)
1375 		rec->evlist->last_sample_time = sample_time;
1376 }
1377 
/*
 * Tool callback used while processing build-ids: update the sample time
 * boundaries and mark the DSO the sample hit, so only needed build-ids
 * are stored.  With --buildid-all every DSO is recorded regardless of
 * hits, so the per-sample DSO marking is skipped.
 */
static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	set_timestamp_boundary(rec, sample->time);

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}
1394 
/*
 * Re-read the freshly written data file to collect build-ids for the
 * header.  Returns 0 on success or when the file is empty.
 */
static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load kernel map and replace the
	 * dso->long_name to a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than build-id path (in debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSO regardless of hits,
	 * so no need to process samples. But if timestamp_boundary is enabled,
	 * it still needs to walk on all samples to get the timestamps of
	 * first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}
1424 
/*
 * machines__process_guests() callback: synthesize module and kernel
 * mmap events for one guest machine.  @data is the perf_tool.
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
1453 
/* Header-only event written after a round of mmap reads produced data
 * (see record__mmap_read_evlist()). */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
1458 
/* Header-only event written by write_finished_init() once initial
 * synthesis is done. */
static struct perf_event_header finished_init_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_INIT,
};
1463 
/*
 * With --affinity != sys, migrate the current thread to the CPUs backing
 * @map before reading it.  'thread' is the per-thread global set up in
 * record__thread()/record__start_threads(); its affinity mask is updated
 * and applied only when it differs from the map's mask.
 */
static void record__adjust_affinity(struct record *rec, struct mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
			  thread->mask->affinity.nbits)) {
		/* Replace the thread's affinity mask with the map's mask. */
		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
			  map->affinity_mask.bits, thread->mask->affinity.nbits);
		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
					(cpu_set_t *)thread->mask->affinity.bits);
		if (verbose == 2) {
			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
		}
	}
}
1480 
1481 static size_t process_comp_header(void *record, size_t increment)
1482 {
1483 	struct perf_record_compressed *event = record;
1484 	size_t size = sizeof(*event);
1485 
1486 	if (increment) {
1487 		event->header.size += increment;
1488 		return increment;
1489 	}
1490 
1491 	event->header.type = PERF_RECORD_COMPRESSED;
1492 	event->header.size = size;
1493 
1494 	return size;
1495 }
1496 
/*
 * Compress @src into @dst as one or more PERF_RECORD_COMPRESSED records.
 * In threaded/directory mode (map has its own file) the per-map zstd
 * stream is used and byte counters are accounted on the per-thread
 * global 'thread'; otherwise the session's stream and counters are used.
 * Returns the number of compressed bytes produced.
 */
static size_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size)
{
	size_t compressed;
	/* Leave room for the compressed-event header within one record. */
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
	struct zstd_data *zstd_data = &session->zstd_data;

	if (map && map->file)
		zstd_data = &map->zstd_data;

	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	if (map && map->file) {
		thread->bytes_transferred += src_size;
		thread->bytes_compressed  += compressed;
	} else {
		session->bytes_transferred += src_size;
		session->bytes_compressed  += compressed;
	}

	return compressed;
}
1520 
/*
 * Drain the current thread's mmap'ed ring buffers (regular or overwrite,
 * per @overwrite) into the output.  With @synch, each map's flush
 * threshold is temporarily forced to 1 so everything pending is pushed
 * out.  Also reads AUX-trace data unless in snapshot/sample mode, and
 * writes a FINISHED_ROUND event if anything was written (not needed in
 * threaded/directory mode).  Returns 0 on success, -1 on push failure.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	int nr_mmaps;
	struct mmap **maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	/* Maps are read via the per-thread global set up at thread start. */
	nr_mmaps = thread->nr_mmaps;
	maps = overwrite ? thread->overwrite_maps : thread->maps;

	if (!maps)
		return 0;

	/* Overwrite (backward) maps are only read while data is pending. */
	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Force everything out; restore flush afterwards. */
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1604 
1605 static int record__mmap_read_all(struct record *rec, bool synch)
1606 {
1607 	int err;
1608 
1609 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1610 	if (err)
1611 		return err;
1612 
1613 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1614 }
1615 
1616 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1617 					   void *arg __maybe_unused)
1618 {
1619 	struct perf_mmap *map = fda->priv[fd].ptr;
1620 
1621 	if (map)
1622 		perf_mmap__put(map);
1623 }
1624 
/*
 * Worker thread body.  Sets the per-thread global 'thread', acks start
 * over the ack pipe, then loops draining its mmaps and polling until the
 * main thread closes the msg pipe (POLLHUP on ctlfd_pos) or a read
 * fails.  Performs a final synchronous drain and acks termination.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	/* Tell record__start_threads() we are up. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %s\n",
			   thread->tid, strerror(errno));

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		unsigned long long hits = thread->samples;

		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Nothing new arrived: block in poll until there is work. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Main thread closed the msg pipe: drain once more, then exit. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final synchronous drain so no buffered data is lost. */
	record__mmap_read_all(thread->rec, true);

	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %s\n",
			   thread->tid, strerror(errno));

	return NULL;
}
1686 
/*
 * Start with every header feature enabled, then clear the ones that do
 * not apply to this session's options.
 */
static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	if (!rec->opts.use_clockid)
		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);

	if (!record__threads_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);

	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	/* perf record never produces stat data. */
	perf_header__clear_feat(&session->header, HEADER_STAT);
}
1721 
1722 static void
1723 record__finish_output(struct record *rec)
1724 {
1725 	int i;
1726 	struct perf_data *data = &rec->data;
1727 	int fd = perf_data__fd(data);
1728 
1729 	if (data->is_pipe)
1730 		return;
1731 
1732 	rec->session->header.data_size += rec->bytes_written;
1733 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1734 	if (record__threads_enabled(rec)) {
1735 		for (i = 0; i < data->dir.nr; i++)
1736 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1737 	}
1738 
1739 	if (!rec->no_buildid) {
1740 		process_buildids(rec);
1741 
1742 		if (rec->buildid_all)
1743 			dsos__hit_all(rec->session);
1744 	}
1745 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1746 
1747 	return;
1748 }
1749 
/*
 * Synthesize comm/mmap events for the forked workload process by
 * building a single-tid thread map for it.  Runs only in the phase
 * (head/tail) selected by --tail-synthesize.  Returns 0 on success or
 * when skipped, negative on error.
 */
static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;
	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						 process_synthesized_event,
						 &rec->session->machines.host,
						 needs_mmap,
						 rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}
1771 
1772 static int write_finished_init(struct record *rec, bool tail)
1773 {
1774 	if (rec->opts.tail_synthesize != tail)
1775 		return 0;
1776 
1777 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1778 }
1779 
1780 static int record__synthesize(struct record *rec, bool tail);
1781 
/*
 * Rotate the output file (--switch-output): flush and finalize the
 * current perf.data, rename it with a timestamp, and open a fresh one
 * unless @at_exit.  Maintains the bounded filename ring when
 * --switch-output has a file count.  Returns the new output fd (from
 * perf_data__switch()) or a negative error.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	write_finished_init(rec, true);

	/* Emit tail-phase synthesized events into the file being closed. */
	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
				    rec->session->header.data_offset,
				    at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		/* Fresh file: reset the write accounting. */
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		/* Bounded ring of output files: drop the one being replaced. */
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
1854 
1855 static volatile int workload_exec_errno;
1856 
1857 /*
1858  * evlist__prepare_workload will send a SIGUSR1
1859  * if the fork fails, since we asked by setting its
1860  * want_signal to true.
1861  */
/*
 * SIGUSR1 handler: record the errno delivered in the signal payload and
 * flag the main loop to stop.  Only writes plain ints, so it is
 * async-signal-safe.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
1870 
1871 static void snapshot_sig_handler(int sig);
1872 static void alarm_sig_handler(int sig);
1873 
1874 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1875 {
1876 	if (evlist) {
1877 		if (evlist->mmap && evlist->mmap[0].core.base)
1878 			return evlist->mmap[0].core.base;
1879 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1880 			return evlist->overwrite_mmap[0].core.base;
1881 	}
1882 	return NULL;
1883 }
1884 
1885 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1886 {
1887 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1888 	if (pc)
1889 		return pc;
1890 	return NULL;
1891 }
1892 
/*
 * Synthesize all non-sample metadata events for the session: pipe
 * header data, time conversion, id index, auxtrace info, kernel/module
 * mmaps, guest events, extra attrs, thread/cpu maps, BPF and cgroup
 * events, and finally per-task thread synthesis (optionally
 * multithreaded).  Runs only in the phase (head/tail) selected by
 * --tail-synthesize.  Returns 0 on success, negative on hard errors;
 * BPF/cgroup synthesis failures only warn.
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int err = 0;
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;

		rec->bytes_written += err;
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	err = perf_event__synthesize_id_index(tool,
					      process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	/* NOTE(review): direct returns below are fine — no cleanup is pending yet. */
	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0) {
		pr_warning("Couldn't synthesize bpf events.\n");
		err = 0;
	}

	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
						     machine);
		if (err < 0) {
			pr_warning("Couldn't synthesize cgroup events.\n");
			err = 0;
		}
	}

	if (rec->opts.nr_threads_synthesize > 1) {
		/* Serialize writes from concurrent synthesis threads. */
		mutex_init(&synth_lock);
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	if (rec->opts.synth & PERF_SYNTH_TASK) {
		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

		err = __machine__synthesize_threads(machine, tool, &opts->target,
						    rec->evlist->core.threads,
						    f, needs_mmap, opts->sample_address,
						    rec->opts.nr_threads_synthesize);
	}

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_singlethreaded();
		mutex_destroy(&synth_lock);
	}

out:
	return err;
}
2014 
/*
 * Side-band evlist callback (--switch-output-event): signal the main
 * thread with SIGUSR2 to trigger an output switch.
 */
static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
{
	struct record *rec = data;
	pthread_kill(rec->thread_id, SIGUSR2);
	return 0;
}
2021 
2022 static int record__setup_sb_evlist(struct record *rec)
2023 {
2024 	struct record_opts *opts = &rec->opts;
2025 
2026 	if (rec->sb_evlist != NULL) {
2027 		/*
2028 		 * We get here if --switch-output-event populated the
2029 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2030 		 * to the main thread.
2031 		 */
2032 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2033 		rec->thread_id = pthread_self();
2034 	}
2035 #ifdef HAVE_LIBBPF_SUPPORT
2036 	if (!opts->no_bpf_event) {
2037 		if (rec->sb_evlist == NULL) {
2038 			rec->sb_evlist = evlist__new();
2039 
2040 			if (rec->sb_evlist == NULL) {
2041 				pr_err("Couldn't create side band evlist.\n.");
2042 				return -1;
2043 			}
2044 		}
2045 
2046 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2047 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2048 			return -1;
2049 		}
2050 	}
2051 #endif
2052 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2053 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2054 		opts->no_bpf_event = true;
2055 	}
2056 
2057 	return 0;
2058 }
2059 
2060 static int record__init_clock(struct record *rec)
2061 {
2062 	struct perf_session *session = rec->session;
2063 	struct timespec ref_clockid;
2064 	struct timeval ref_tod;
2065 	u64 ref;
2066 
2067 	if (!rec->opts.use_clockid)
2068 		return 0;
2069 
2070 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2071 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2072 
2073 	session->header.env.clock.clockid = rec->opts.clockid;
2074 
2075 	if (gettimeofday(&ref_tod, NULL) != 0) {
2076 		pr_err("gettimeofday failed, cannot set reference time.\n");
2077 		return -1;
2078 	}
2079 
2080 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2081 		pr_err("clock_gettime failed, cannot set reference time.\n");
2082 		return -1;
2083 	}
2084 
2085 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2086 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2087 
2088 	session->header.env.clock.tod_ns = ref;
2089 
2090 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2091 	      (u64) ref_clockid.tv_nsec;
2092 
2093 	session->header.env.clock.clockid_ns = ref;
2094 	return 0;
2095 }
2096 
/*
 * Fire the AUX-trace snapshot trigger (if armed) and start a snapshot;
 * on snapshot-start failure put the trigger into the error state.
 */
static void hit_auxtrace_snapshot_trigger(struct record *rec)
{
	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(rec->itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}
}
2106 
2107 static void record__uniquify_name(struct record *rec)
2108 {
2109 	struct evsel *pos;
2110 	struct evlist *evlist = rec->evlist;
2111 	char *new_name;
2112 	int ret;
2113 
2114 	if (!perf_pmu__has_hybrid())
2115 		return;
2116 
2117 	evlist__for_each_entry(evlist, pos) {
2118 		if (!evsel__is_hybrid(pos))
2119 			continue;
2120 
2121 		if (strchr(pos->name, '/'))
2122 			continue;
2123 
2124 		ret = asprintf(&new_name, "%s/%s/",
2125 			       pos->pmu_name, pos->name);
2126 		if (ret) {
2127 			free(pos->name);
2128 			pos->name = new_name;
2129 		}
2130 	}
2131 }
2132 
/*
 * Ask a worker thread to stop by closing the write end of its msg pipe
 * (the worker sees POLLHUP), then wait for its termination ack on the
 * ack pipe.  Always returns 0; a missing ack only logs a warning.
 */
static int record__terminate_thread(struct record_thread *thread_data)
{
	int err;
	enum thread_msg ack = THREAD_MSG__UNDEFINED;
	pid_t tid = thread_data->tid;

	close(thread_data->pipes.msg[1]);
	thread_data->pipes.msg[1] = -1;
	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
	if (err > 0)
		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
	else
		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
			   thread->tid, tid);

	return 0;
}
2150 
2151 static int record__start_threads(struct record *rec)
2152 {
2153 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2154 	struct record_thread *thread_data = rec->thread_data;
2155 	sigset_t full, mask;
2156 	pthread_t handle;
2157 	pthread_attr_t attrs;
2158 
2159 	thread = &thread_data[0];
2160 
2161 	if (!record__threads_enabled(rec))
2162 		return 0;
2163 
2164 	sigfillset(&full);
2165 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2166 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2167 		return -1;
2168 	}
2169 
2170 	pthread_attr_init(&attrs);
2171 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2172 
2173 	for (t = 1; t < nr_threads; t++) {
2174 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2175 
2176 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2177 		pthread_attr_setaffinity_np(&attrs,
2178 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2179 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2180 #endif
2181 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2182 			for (tt = 1; tt < t; tt++)
2183 				record__terminate_thread(&thread_data[t]);
2184 			pr_err("Failed to start threads: %s\n", strerror(errno));
2185 			ret = -1;
2186 			goto out_err;
2187 		}
2188 
2189 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2190 		if (err > 0)
2191 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2192 				  thread_msg_tags[msg]);
2193 		else
2194 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2195 				   thread->tid, rec->thread_data[t].tid);
2196 	}
2197 
2198 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2199 			(cpu_set_t *)thread->mask->affinity.bits);
2200 
2201 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2202 
2203 out_err:
2204 	pthread_attr_destroy(&attrs);
2205 
2206 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2207 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2208 		ret = -1;
2209 	}
2210 
2211 	return ret;
2212 }
2213 
2214 static int record__stop_threads(struct record *rec)
2215 {
2216 	int t;
2217 	struct record_thread *thread_data = rec->thread_data;
2218 
2219 	for (t = 1; t < rec->nr_threads; t++)
2220 		record__terminate_thread(&thread_data[t]);
2221 
2222 	for (t = 0; t < rec->nr_threads; t++) {
2223 		rec->samples += thread_data[t].samples;
2224 		if (!record__threads_enabled(rec))
2225 			continue;
2226 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2227 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2228 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2229 			 thread_data[t].samples, thread_data[t].waking);
2230 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2231 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2232 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2233 		else
2234 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2235 	}
2236 
2237 	return 0;
2238 }
2239 
2240 static unsigned long record__waking(struct record *rec)
2241 {
2242 	int t;
2243 	unsigned long waking = 0;
2244 	struct record_thread *thread_data = rec->thread_data;
2245 
2246 	for (t = 0; t < rec->nr_threads; t++)
2247 		waking += thread_data[t].waking;
2248 
2249 	return waking;
2250 }
2251 
/*
 * Main body of 'perf record': set up the session and output, optionally
 * fork the workload, open and mmap the events, then loop draining the
 * ring buffers until done/draining, and finally tear everything down
 * and finish the perf.data output.
 *
 * Returns the workload's exit status (when forking) or a negative
 * error code.
 *
 * NOTE(review): several early failure paths after perf_session__new()
 * (pipe/AUX parallel-mode checks, zstd_init, kcore readable,
 * record__init_clock) "return -1" directly instead of going through
 * out_delete_session, leaking the session — left as-is here since the
 * teardown labels below are not safe to reach before zstd/eventfd
 * setup; verify before changing.
 */
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;
	float ratio = 0;
	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;

	/* Install exit and signal handlers before anything can fail. */
	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.record_cgroup) {
#ifdef HAVE_FILE_HANDLE
		tool->cgroup_events = true;
#else
		pr_err("cgroup tracking is not supported\n");
		return -1;
#endif
	}

	/* SIGUSR2 drives AUX snapshots and output switching, if enabled. */
	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	/* Parallel streaming is incompatible with pipes and AUX tracing. */
	if (record__threads_enabled(rec)) {
		if (perf_data__is_pipe(&rec->data)) {
			pr_err("Parallel trace streaming is not available in pipe mode.\n");
			return -1;
		}
		if (rec->opts.full_auxtrace) {
			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
			return -1;
		}
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}
#ifdef HAVE_EVENTFD_SUPPORT
	done_fd = eventfd(0, EFD_NONBLOCK);
	if (done_fd < 0) {
		pr_err("Failed to create wakeup eventfd, error: %m\n");
		status = -1;
		goto out_delete_session;
	}
	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
	if (err < 0) {
		pr_err("Failed to add wakeup eventfd to poll list\n");
		status = err;
		goto out_delete_session;
	}
#endif // HAVE_EVENTFD_SUPPORT

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	if (record__init_clock(rec))
		return -1;

	record__init_features(rec);

	/* Fork the workload now; it is only exec'ed at start_workload below. */
	if (forks) {
		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
					       workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just single event and are sending data
	 * through pipe, we need to force the ids allocation,
	 * because we synthesize event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	record__uniquify_name(rec);

	if (record__open(rec) != 0) {
		err = -1;
		goto out_free_threads;
	}
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_free_threads;
		}
	}

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_free_threads;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->core.nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	/* Write the (pipe or file) header before any event data. */
	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_free_threads;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_free_threads;
	}

	err = -1;
	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		goto out_free_threads;
	}

	err = record__setup_sb_evlist(rec);
	if (err)
		goto out_free_threads;

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_free_threads;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_free_threads;
		}
	}

	if (record__start_threads(rec))
		goto out_free_threads;

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		pr_info(EVLIST_DISABLED_MSG);
		if (opts->initial_delay > 0) {
			usleep(opts->initial_delay * USEC_PER_MSEC);
			evlist__enable(rec->evlist);
			pr_info(EVLIST_ENABLED_MSG);
		}
	}

	err = event_enable_timer__start(rec->evlist->eet);
	if (err)
		goto out_child;

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();

	/*
	 * Must write FINISHED_INIT so it will be seen after all other
	 * synthesized user events, but before any regular events.
	 */
	err = write_finished_init(rec, false);
	if (err < 0)
		goto out_child;

	/*
	 * Main event loop: drain the mmaps, handle AUX snapshots, output
	 * switching and control-fd commands, and poll when idle.  Exits
	 * when done/draining and no new samples arrived this round.
	 */
	for (;;) {
		unsigned long long hits = thread->samples;

		/*
		 * rec->evlist->bkw_mmap_state is possible to be
		 * BKW_MMAP_EMPTY here: when done == true and
		 * hits != rec->samples in previous round.
		 *
		 * evlist__toggle_bkw_mmap ensure we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 raise after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					record__waking(rec));
			thread->waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		/* No new samples this round: exit if finished, else poll. */
		if (hits == thread->samples) {
			if (done || draining)
				break;
			err = fdarray__poll(&thread->pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				draining = true;

			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
			if (err)
				goto out_child;
		}

		/* Handle commands arriving on the --control fd, if any. */
		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
			switch (cmd) {
			case EVLIST_CTL_CMD_SNAPSHOT:
				hit_auxtrace_snapshot_trigger(rec);
				evlist__ctlfd_ack(rec->evlist);
				break;
			case EVLIST_CTL_CMD_STOP:
				done = 1;
				break;
			case EVLIST_CTL_CMD_ACK:
			case EVLIST_CTL_CMD_UNSUPPORTED:
			case EVLIST_CTL_CMD_ENABLE:
			case EVLIST_CTL_CMD_DISABLE:
			case EVLIST_CTL_CMD_EVLIST:
			case EVLIST_CTL_CMD_PING:
			default:
				break;
			}
		}

		err = event_enable_timer__process(rec->evlist->eet);
		if (err < 0)
			goto out_child;
		if (err) {
			err = 0;
			done = 1;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE], strevsels[2048];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);

		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
			strevsels, argv[0], emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
			record__waking(rec));

	write_finished_init(rec, true);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	record__stop_threads(rec);
	record__mmap_read_all(rec, true);
out_free_threads:
	record__free_thread_data(rec);
	evlist__finalize_ctlfd(rec->evlist);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	/* Reap the forked workload and derive the command's exit status. */
	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	if (rec->off_cpu)
		rec->bytes_written += off_cpu_write(rec->session);

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	/* Print the "[ perf record: Captured and wrote ... ]" summary. */
	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
	}

out_delete_session:
#ifdef HAVE_EVENTFD_SUPPORT
	if (done_fd >= 0)
		close(done_fd);
#endif
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		evlist__stop_sb_thread(rec->sb_evlist);
	return status;
}
2769 
2770 static void callchain_debug(struct callchain_param *callchain)
2771 {
2772 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2773 
2774 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2775 
2776 	if (callchain->record_mode == CALLCHAIN_DWARF)
2777 		pr_debug("callchain: stack dump size %d\n",
2778 			 callchain->dump_size);
2779 }
2780 
2781 int record_opts__parse_callchain(struct record_opts *record,
2782 				 struct callchain_param *callchain,
2783 				 const char *arg, bool unset)
2784 {
2785 	int ret;
2786 	callchain->enabled = !unset;
2787 
2788 	/* --no-call-graph */
2789 	if (unset) {
2790 		callchain->record_mode = CALLCHAIN_NONE;
2791 		pr_debug("callchain: disabled\n");
2792 		return 0;
2793 	}
2794 
2795 	ret = parse_callchain_record_opt(arg, callchain);
2796 	if (!ret) {
2797 		/* Enable data address sampling for DWARF unwind. */
2798 		if (callchain->record_mode == CALLCHAIN_DWARF)
2799 			record->sample_address = true;
2800 		callchain_debug(callchain);
2801 	}
2802 
2803 	return ret;
2804 }
2805 
2806 int record_parse_callchain_opt(const struct option *opt,
2807 			       const char *arg,
2808 			       int unset)
2809 {
2810 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2811 }
2812 
2813 int record_callchain_opt(const struct option *opt,
2814 			 const char *arg __maybe_unused,
2815 			 int unset __maybe_unused)
2816 {
2817 	struct callchain_param *callchain = opt->value;
2818 
2819 	callchain->enabled = true;
2820 
2821 	if (callchain->record_mode == CALLCHAIN_NONE)
2822 		callchain->record_mode = CALLCHAIN_FP;
2823 
2824 	callchain_debug(callchain);
2825 	return 0;
2826 }
2827 
2828 static int perf_record_config(const char *var, const char *value, void *cb)
2829 {
2830 	struct record *rec = cb;
2831 
2832 	if (!strcmp(var, "record.build-id")) {
2833 		if (!strcmp(value, "cache"))
2834 			rec->no_buildid_cache = false;
2835 		else if (!strcmp(value, "no-cache"))
2836 			rec->no_buildid_cache = true;
2837 		else if (!strcmp(value, "skip"))
2838 			rec->no_buildid = true;
2839 		else if (!strcmp(value, "mmap"))
2840 			rec->buildid_mmap = true;
2841 		else
2842 			return -1;
2843 		return 0;
2844 	}
2845 	if (!strcmp(var, "record.call-graph")) {
2846 		var = "call-graph.record-mode";
2847 		return perf_default_config(var, value, cb);
2848 	}
2849 #ifdef HAVE_AIO_SUPPORT
2850 	if (!strcmp(var, "record.aio")) {
2851 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2852 		if (!rec->opts.nr_cblocks)
2853 			rec->opts.nr_cblocks = nr_cblocks_default;
2854 	}
2855 #endif
2856 	if (!strcmp(var, "record.debuginfod")) {
2857 		rec->debuginfod.urls = strdup(value);
2858 		if (!rec->debuginfod.urls)
2859 			return -ENOMEM;
2860 		rec->debuginfod.set = true;
2861 	}
2862 
2863 	return 0;
2864 }
2865 
2866 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2867 {
2868 	struct record *rec = (struct record *)opt->value;
2869 
2870 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2871 }
2872 
2873 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2874 {
2875 	struct record_opts *opts = (struct record_opts *)opt->value;
2876 
2877 	if (unset || !str)
2878 		return 0;
2879 
2880 	if (!strcasecmp(str, "node"))
2881 		opts->affinity = PERF_AFFINITY_NODE;
2882 	else if (!strcasecmp(str, "cpu"))
2883 		opts->affinity = PERF_AFFINITY_CPU;
2884 
2885 	return 0;
2886 }
2887 
2888 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2889 {
2890 	mask->nbits = nr_bits;
2891 	mask->bits = bitmap_zalloc(mask->nbits);
2892 	if (!mask->bits)
2893 		return -ENOMEM;
2894 
2895 	return 0;
2896 }
2897 
2898 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2899 {
2900 	bitmap_free(mask->bits);
2901 	mask->nbits = 0;
2902 }
2903 
2904 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2905 {
2906 	int ret;
2907 
2908 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2909 	if (ret) {
2910 		mask->affinity.bits = NULL;
2911 		return ret;
2912 	}
2913 
2914 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2915 	if (ret) {
2916 		record__mmap_cpu_mask_free(&mask->maps);
2917 		mask->maps.bits = NULL;
2918 	}
2919 
2920 	return ret;
2921 }
2922 
2923 static void record__thread_mask_free(struct thread_mask *mask)
2924 {
2925 	record__mmap_cpu_mask_free(&mask->maps);
2926 	record__mmap_cpu_mask_free(&mask->affinity);
2927 }
2928 
/*
 * Option callback for --threads: parse the argument into
 * opts->threads_spec.
 *
 * An absent/empty argument selects THREAD_SPEC__CPU.  Otherwise the
 * argument is matched (case-insensitively, by prefix) against the
 * named specs in thread_spec_tags[]; an argument that reaches the
 * THREAD_SPEC__USER slot without matching any named tag is kept
 * verbatim as a user-provided spec string.
 */
static int record__parse_threads(const struct option *opt, const char *str, int unset)
{
	int s;
	struct record_opts *opts = opt->value;

	if (unset || !str || !strlen(str)) {
		opts->threads_spec = THREAD_SPEC__CPU;
	} else {
		for (s = 1; s < THREAD_SPEC__MAX; s++) {
			/* Fallback: keep the raw string as a user spec. */
			if (s == THREAD_SPEC__USER) {
				opts->threads_user_spec = strdup(str);
				if (!opts->threads_user_spec)
					return -ENOMEM;
				opts->threads_spec = THREAD_SPEC__USER;
				break;
			}
			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
				opts->threads_spec = s;
				break;
			}
		}
	}

	if (opts->threads_spec == THREAD_SPEC__USER)
		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
	else
		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);

	return 0;
}
2959 
2960 static int parse_output_max_size(const struct option *opt,
2961 				 const char *str, int unset)
2962 {
2963 	unsigned long *s = (unsigned long *)opt->value;
2964 	static struct parse_tag tags_size[] = {
2965 		{ .tag  = 'B', .mult = 1       },
2966 		{ .tag  = 'K', .mult = 1 << 10 },
2967 		{ .tag  = 'M', .mult = 1 << 20 },
2968 		{ .tag  = 'G', .mult = 1 << 30 },
2969 		{ .tag  = 0 },
2970 	};
2971 	unsigned long val;
2972 
2973 	if (unset) {
2974 		*s = 0;
2975 		return 0;
2976 	}
2977 
2978 	val = parse_tag_value(str, tags_size);
2979 	if (val != (unsigned long) -1) {
2980 		*s = val;
2981 		return 0;
2982 	}
2983 
2984 	return -1;
2985 }
2986 
2987 static int record__parse_mmap_pages(const struct option *opt,
2988 				    const char *str,
2989 				    int unset __maybe_unused)
2990 {
2991 	struct record_opts *opts = opt->value;
2992 	char *s, *p;
2993 	unsigned int mmap_pages;
2994 	int ret;
2995 
2996 	if (!str)
2997 		return -EINVAL;
2998 
2999 	s = strdup(str);
3000 	if (!s)
3001 		return -ENOMEM;
3002 
3003 	p = strchr(s, ',');
3004 	if (p)
3005 		*p = '\0';
3006 
3007 	if (*s) {
3008 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3009 		if (ret)
3010 			goto out_free;
3011 		opts->mmap_pages = mmap_pages;
3012 	}
3013 
3014 	if (!p) {
3015 		ret = 0;
3016 		goto out_free;
3017 	}
3018 
3019 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3020 	if (ret)
3021 		goto out_free;
3022 
3023 	opts->auxtrace_mmap_pages = mmap_pages;
3024 
3025 out_free:
3026 	free(s);
3027 	return ret;
3028 }
3029 
/*
 * Weak no-op default; an architecture may provide a strong version to
 * add leaf-frame related record options (see arch-specific overrides).
 */
void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
{
}
3033 
3034 static int parse_control_option(const struct option *opt,
3035 				const char *str,
3036 				int unset __maybe_unused)
3037 {
3038 	struct record_opts *opts = opt->value;
3039 
3040 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3041 }
3042 
3043 static void switch_output_size_warn(struct record *rec)
3044 {
3045 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3046 	struct switch_output *s = &rec->switch_output;
3047 
3048 	wakeup_size /= 2;
3049 
3050 	if (s->size < wakeup_size) {
3051 		char buf[100];
3052 
3053 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3054 		pr_warning("WARNING: switch-output data size lower than "
3055 			   "wakeup kernel buffer size (%s) "
3056 			   "expect bigger perf.data sizes\n", buf);
3057 	}
3058 }
3059 
/*
 * Configure the --switch-output mode from rec->switch_output.str:
 * "signal" (rotate on SIGUSR2), a size threshold (B/K/M/G suffix) or a
 * time threshold (s/m/h/d suffix).  --switch-output-events implies the
 * signal mode.  Not supported in parallel streaming mode.  Enabling
 * any mode also turns on timestamped output file names.
 *
 * Returns 0 on success (including "not enabled"), -1 on an
 * unparseable argument.
 */
static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },
		{ .tag  = 0 },
	};
	unsigned long val;

	/*
	 * If we're using --switch-output-events, then we imply its
	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
	 *  thread to its parent.
	 */
	if (rec->switch_output_event_set) {
		if (record__threads_enabled(rec)) {
			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
			return 0;
		}
		goto do_signal;
	}

	if (!s->set)
		return 0;

	if (record__threads_enabled(rec)) {
		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
		return 0;
	}

	if (!strcmp(s->str, "signal")) {
do_signal:
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	/* Try the argument as a size threshold first, then as a time. */
	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled              = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}
3133 
/* Usage strings shown by 'perf record -h' and on option parse errors. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias: exported for use outside this file. */
const char * const *record_usage = __record_usage;
3140 
3141 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3142 				  struct perf_sample *sample, struct machine *machine)
3143 {
3144 	/*
3145 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3146 	 * no need to add them twice.
3147 	 */
3148 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3149 		return 0;
3150 	return perf_event__process_mmap(tool, event, sample, machine);
3151 }
3152 
3153 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3154 				   struct perf_sample *sample, struct machine *machine)
3155 {
3156 	/*
3157 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3158 	 * no need to add them twice.
3159 	 */
3160 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3161 		return 0;
3162 
3163 	return perf_event__process_mmap2(tool, event, sample, machine);
3164 }
3165 
3166 static int process_timestamp_boundary(struct perf_tool *tool,
3167 				      union perf_event *event __maybe_unused,
3168 				      struct perf_sample *sample,
3169 				      struct machine *machine __maybe_unused)
3170 {
3171 	struct record *rec = container_of(tool, struct record, tool);
3172 
3173 	set_timestamp_boundary(rec, sample->time);
3174 	return 0;
3175 }
3176 
3177 static int parse_record_synth_option(const struct option *opt,
3178 				     const char *str,
3179 				     int unset __maybe_unused)
3180 {
3181 	struct record_opts *opts = opt->value;
3182 	char *p = strdup(str);
3183 
3184 	if (p == NULL)
3185 		return -1;
3186 
3187 	opts->synth = parse_synth_opt(p);
3188 	free(p);
3189 
3190 	if (opts->synth < 0) {
3191 		pr_err("Invalid synth option: %s\n", str);
3192 		return -1;
3193 	}
3194 	return 0;
3195 }
3196 
3197 /*
3198  * XXX Ideally would be local to cmd_record() and passed to a record__new
3199  * because we need to have access to it in record__exit, that is called
3200  * after cmd_record() exits, but since record_options need to be accessible to
3201  * builtin-script, leave it here.
3202  *
3203  * At least we don't ouch it in all the other functions here directly.
3204  *
3205  * Just say no to tons of global variables, sigh.
3206  */
static struct record record = {
	/* Recording knobs; UINT_MAX/ULLONG_MAX act as "not set by the user". */
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000, /* default sampling frequency, see -F */
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		.ctl_fd              = -1, /* no --control fd by default */
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
	},
	/* Event delivery callbacks used when processing the recorded stream. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		/* mmap/mmap2 are filtered so kernel maps are not added twice. */
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		/* Both feed the --timestamp-boundary first/last tracking. */
		.itrace_start	= process_timestamp_boundary,
		.aux		= process_timestamp_boundary,
		.ordered_events	= true,
	},
};
3237 
/* Help text for --call-graph: common callchain help plus our default. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";
3240 
/* Set by --dry-run: parse options, then exit without recording. */
static bool dry_run;
3242 
3243 /*
3244  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3245  * with it and switch to use the library functions in perf_evlist that came
3246  * from builtin-record.c, i.e. use record_opts,
3247  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3248  * using pipes, etc.
3249  */
static struct option __record_options[] = {
	/* Event selection and filtering: */
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	/* Target selection (what to profile): */
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	/* Sampling period/frequency and output: */
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		      record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	/* Callchain recording: */
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording" ,
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	/* Per-sample payload selection (what each sample records): */
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
		    "Record the sampled data address data page size"),
	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
		    "Record the sampled code address (ip) page size"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
		    "Record the sample identifier"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	/* Build-id handling: */
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_CALLBACK('D', "delay", &record, "ms",
		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
		     record__parse_event_enable_time),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	/* Branch stack sampling: */
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
	"clockid", "clockid to use for events, see clock_gettime()",
	parse_clockid),
	/* AUX area tracing: */
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
		    "Record cgroup events"),
	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
			&record.opts.record_switch_events_set,
			"Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	/* BPF scriptlets: */
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
		    "Record build-id in map events"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	/* Output file rotation (--switch-output family): */
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			  "signal"),
	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
			 "switch output event selector. use 'perf list' to list available events",
			 parse_events_option_new_evlist),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		   "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
		     record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_UINTEGER(0, "num-thread-synthesize",
		     &record.opts.nr_threads_synthesize,
		     "number of threads to run for event synthesis"),
#ifdef HAVE_LIBPFM
	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
		"libpfm4 event selector. use 'perf list' to list available events",
		parse_libpfm_events_option),
#endif
	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
		      parse_control_option),
	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
			  &record.debuginfod.set, "debuginfod urls",
			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
			  "system"),
	/* Parallel trace streaming (--threads): */
	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
			    "write collected trace data into several data files using parallel threads",
			    record__parse_threads),
	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
	OPT_END()
};
3454 
/* Exported so builtin-script.c can reuse the record option table. */
struct option *record_options = __record_options;
3456 
3457 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3458 {
3459 	struct perf_cpu cpu;
3460 	int idx;
3461 
3462 	if (cpu_map__is_dummy(cpus))
3463 		return 0;
3464 
3465 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3466 		if (cpu.cpu == -1)
3467 			continue;
3468 		/* Return ENODEV is input cpu is greater than max cpu */
3469 		if ((unsigned long)cpu.cpu > mask->nbits)
3470 			return -ENODEV;
3471 		set_bit(cpu.cpu, mask->bits);
3472 	}
3473 
3474 	return 0;
3475 }
3476 
3477 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3478 {
3479 	struct perf_cpu_map *cpus;
3480 
3481 	cpus = perf_cpu_map__new(mask_spec);
3482 	if (!cpus)
3483 		return -ENOMEM;
3484 
3485 	bitmap_zero(mask->bits, mask->nbits);
3486 	if (record__mmap_cpu_mask_init(mask, cpus))
3487 		return -ENODEV;
3488 
3489 	perf_cpu_map__put(cpus);
3490 
3491 	return 0;
3492 }
3493 
3494 static void record__free_thread_masks(struct record *rec, int nr_threads)
3495 {
3496 	int t;
3497 
3498 	if (rec->thread_masks)
3499 		for (t = 0; t < nr_threads; t++)
3500 			record__thread_mask_free(&rec->thread_masks[t]);
3501 
3502 	zfree(&rec->thread_masks);
3503 }
3504 
3505 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3506 {
3507 	int t, ret;
3508 
3509 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3510 	if (!rec->thread_masks) {
3511 		pr_err("Failed to allocate thread masks\n");
3512 		return -ENOMEM;
3513 	}
3514 
3515 	for (t = 0; t < nr_threads; t++) {
3516 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3517 		if (ret) {
3518 			pr_err("Failed to allocate thread masks[%d]\n", t);
3519 			goto out_free;
3520 		}
3521 	}
3522 
3523 	return 0;
3524 
3525 out_free:
3526 	record__free_thread_masks(rec, nr_threads);
3527 
3528 	return ret;
3529 }
3530 
3531 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3532 {
3533 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3534 
3535 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3536 	if (ret)
3537 		return ret;
3538 
3539 	rec->nr_threads = nr_cpus;
3540 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3541 
3542 	for (t = 0; t < rec->nr_threads; t++) {
3543 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3544 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3545 		if (verbose) {
3546 			pr_debug("thread_masks[%d]: ", t);
3547 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3548 			pr_debug("thread_masks[%d]: ", t);
3549 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3550 		}
3551 	}
3552 
3553 	return 0;
3554 }
3555 
3556 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3557 					  const char **maps_spec, const char **affinity_spec,
3558 					  u32 nr_spec)
3559 {
3560 	u32 s;
3561 	int ret = 0, t = 0;
3562 	struct mmap_cpu_mask cpus_mask;
3563 	struct thread_mask thread_mask, full_mask, *thread_masks;
3564 
3565 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3566 	if (ret) {
3567 		pr_err("Failed to allocate CPUs mask\n");
3568 		return ret;
3569 	}
3570 
3571 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3572 	if (ret) {
3573 		pr_err("Failed to init cpu mask\n");
3574 		goto out_free_cpu_mask;
3575 	}
3576 
3577 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3578 	if (ret) {
3579 		pr_err("Failed to allocate full mask\n");
3580 		goto out_free_cpu_mask;
3581 	}
3582 
3583 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3584 	if (ret) {
3585 		pr_err("Failed to allocate thread mask\n");
3586 		goto out_free_full_and_cpu_masks;
3587 	}
3588 
3589 	for (s = 0; s < nr_spec; s++) {
3590 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3591 		if (ret) {
3592 			pr_err("Failed to initialize maps thread mask\n");
3593 			goto out_free;
3594 		}
3595 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3596 		if (ret) {
3597 			pr_err("Failed to initialize affinity thread mask\n");
3598 			goto out_free;
3599 		}
3600 
3601 		/* ignore invalid CPUs but do not allow empty masks */
3602 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3603 				cpus_mask.bits, thread_mask.maps.nbits)) {
3604 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3605 			ret = -EINVAL;
3606 			goto out_free;
3607 		}
3608 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3609 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3610 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3611 			ret = -EINVAL;
3612 			goto out_free;
3613 		}
3614 
3615 		/* do not allow intersection with other masks (full_mask) */
3616 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3617 				      thread_mask.maps.nbits)) {
3618 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3619 			ret = -EINVAL;
3620 			goto out_free;
3621 		}
3622 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3623 				      thread_mask.affinity.nbits)) {
3624 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3625 			ret = -EINVAL;
3626 			goto out_free;
3627 		}
3628 
3629 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3630 			  thread_mask.maps.bits, full_mask.maps.nbits);
3631 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3632 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3633 
3634 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3635 		if (!thread_masks) {
3636 			pr_err("Failed to reallocate thread masks\n");
3637 			ret = -ENOMEM;
3638 			goto out_free;
3639 		}
3640 		rec->thread_masks = thread_masks;
3641 		rec->thread_masks[t] = thread_mask;
3642 		if (verbose) {
3643 			pr_debug("thread_masks[%d]: ", t);
3644 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3645 			pr_debug("thread_masks[%d]: ", t);
3646 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3647 		}
3648 		t++;
3649 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3650 		if (ret) {
3651 			pr_err("Failed to allocate thread mask\n");
3652 			goto out_free_full_and_cpu_masks;
3653 		}
3654 	}
3655 	rec->nr_threads = t;
3656 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3657 	if (!rec->nr_threads)
3658 		ret = -EINVAL;
3659 
3660 out_free:
3661 	record__thread_mask_free(&thread_mask);
3662 out_free_full_and_cpu_masks:
3663 	record__thread_mask_free(&full_mask);
3664 out_free_cpu_mask:
3665 	record__mmap_cpu_mask_free(&cpus_mask);
3666 
3667 	return ret;
3668 }
3669 
3670 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3671 {
3672 	int ret;
3673 	struct cpu_topology *topo;
3674 
3675 	topo = cpu_topology__new();
3676 	if (!topo) {
3677 		pr_err("Failed to allocate CPU topology\n");
3678 		return -ENOMEM;
3679 	}
3680 
3681 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3682 					     topo->core_cpus_list, topo->core_cpus_lists);
3683 	cpu_topology__delete(topo);
3684 
3685 	return ret;
3686 }
3687 
3688 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3689 {
3690 	int ret;
3691 	struct cpu_topology *topo;
3692 
3693 	topo = cpu_topology__new();
3694 	if (!topo) {
3695 		pr_err("Failed to allocate CPU topology\n");
3696 		return -ENOMEM;
3697 	}
3698 
3699 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3700 					     topo->package_cpus_list, topo->package_cpus_lists);
3701 	cpu_topology__delete(topo);
3702 
3703 	return ret;
3704 }
3705 
3706 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3707 {
3708 	u32 s;
3709 	int ret;
3710 	const char **spec;
3711 	struct numa_topology *topo;
3712 
3713 	topo = numa_topology__new();
3714 	if (!topo) {
3715 		pr_err("Failed to allocate NUMA topology\n");
3716 		return -ENOMEM;
3717 	}
3718 
3719 	spec = zalloc(topo->nr * sizeof(char *));
3720 	if (!spec) {
3721 		pr_err("Failed to allocate NUMA spec\n");
3722 		ret = -ENOMEM;
3723 		goto out_delete_topo;
3724 	}
3725 	for (s = 0; s < topo->nr; s++)
3726 		spec[s] = topo->nodes[s].cpus;
3727 
3728 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3729 
3730 	zfree(&spec);
3731 
3732 out_delete_topo:
3733 	numa_topology__delete(topo);
3734 
3735 	return ret;
3736 }
3737 
/*
 * --threads=<spec>: parse the user-provided thread spec string of the form
 * "maps_cpus/affinity_cpus:maps_cpus/affinity_cpus:..." into two parallel
 * string arrays and hand them to record__init_thread_masks_spec().
 *
 * Ownership note: dup_mask temporarily tracks the strdup()ed maps string
 * between the two reallocs of a single entry, so it can be freed if the
 * affinity half of the entry fails to parse/allocate.
 */
static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int t, ret;
	u32 s, nr_spec = 0;
	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;

	/* strtok_r() continuation: pass the string only on the first call. */
	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
		spec = strtok_r(user_spec, ":", &spec_ptr);
		if (spec == NULL)
			break;
		pr_debug2("threads_spec[%d]: %s\n", t, spec);
		/* First token before '/' is the maps CPU list. */
		mask = strtok_r(spec, "/", &mask_ptr);
		if (mask == NULL)
			break;
		pr_debug2("  maps mask: %s\n", mask);
		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
		if (!tmp_spec) {
			pr_err("Failed to reallocate maps spec\n");
			ret = -ENOMEM;
			goto out_free;
		}
		maps_spec = tmp_spec;
		/* dup_mask keeps the copy reachable until the entry is complete. */
		maps_spec[nr_spec] = dup_mask = strdup(mask);
		if (!maps_spec[nr_spec]) {
			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
			ret = -ENOMEM;
			goto out_free;
		}
		/* Second token after '/' is the affinity CPU list; mandatory. */
		mask = strtok_r(NULL, "/", &mask_ptr);
		if (mask == NULL) {
			pr_err("Invalid thread maps or affinity specs\n");
			ret = -EINVAL;
			goto out_free;
		}
		pr_debug2("  affinity mask: %s\n", mask);
		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
		if (!tmp_spec) {
			pr_err("Failed to reallocate affinity spec\n");
			ret = -ENOMEM;
			goto out_free;
		}
		affinity_spec = tmp_spec;
		affinity_spec[nr_spec] = strdup(mask);
		if (!affinity_spec[nr_spec]) {
			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
			ret = -ENOMEM;
			goto out_free;
		}
		/* Entry complete: the array owns the string now. */
		dup_mask = NULL;
		nr_spec++;
	}

	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
					     (const char **)affinity_spec, nr_spec);

out_free:
	free(dup_mask);
	for (s = 0; s < nr_spec; s++) {
		if (maps_spec)
			free(maps_spec[s]);
		if (affinity_spec)
			free(affinity_spec[s]);
	}
	free(affinity_spec);
	free(maps_spec);

	return ret;
}
3807 
3808 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3809 {
3810 	int ret;
3811 
3812 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3813 	if (ret)
3814 		return ret;
3815 
3816 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3817 		return -ENODEV;
3818 
3819 	rec->nr_threads = 1;
3820 
3821 	return 0;
3822 }
3823 
3824 static int record__init_thread_masks(struct record *rec)
3825 {
3826 	int ret = 0;
3827 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3828 
3829 	if (!record__threads_enabled(rec))
3830 		return record__init_thread_default_masks(rec, cpus);
3831 
3832 	if (evlist__per_thread(rec->evlist)) {
3833 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3834 		return -EINVAL;
3835 	}
3836 
3837 	switch (rec->opts.threads_spec) {
3838 	case THREAD_SPEC__CPU:
3839 		ret = record__init_thread_cpu_masks(rec, cpus);
3840 		break;
3841 	case THREAD_SPEC__CORE:
3842 		ret = record__init_thread_core_masks(rec, cpus);
3843 		break;
3844 	case THREAD_SPEC__PACKAGE:
3845 		ret = record__init_thread_package_masks(rec, cpus);
3846 		break;
3847 	case THREAD_SPEC__NUMA:
3848 		ret = record__init_thread_numa_masks(rec, cpus);
3849 		break;
3850 	case THREAD_SPEC__USER:
3851 		ret = record__init_thread_user_masks(rec, cpus);
3852 		break;
3853 	default:
3854 		break;
3855 	}
3856 
3857 	return ret;
3858 }
3859 
3860 int cmd_record(int argc, const char **argv)
3861 {
3862 	int err;
3863 	struct record *rec = &record;
3864 	char errbuf[BUFSIZ];
3865 
3866 	setlocale(LC_ALL, "");
3867 
3868 #ifndef HAVE_LIBBPF_SUPPORT
3869 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3870 	set_nobuild('\0', "clang-path", true);
3871 	set_nobuild('\0', "clang-opt", true);
3872 # undef set_nobuild
3873 #endif
3874 
3875 #ifndef HAVE_BPF_PROLOGUE
3876 # if !defined (HAVE_DWARF_SUPPORT)
3877 #  define REASON  "NO_DWARF=1"
3878 # elif !defined (HAVE_LIBBPF_SUPPORT)
3879 #  define REASON  "NO_LIBBPF=1"
3880 # else
3881 #  define REASON  "this architecture doesn't support BPF prologue"
3882 # endif
3883 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3884 	set_nobuild('\0', "vmlinux", true);
3885 # undef set_nobuild
3886 # undef REASON
3887 #endif
3888 
3889 #ifndef HAVE_BPF_SKEL
3890 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3891 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3892 # undef set_nobuild
3893 #endif
3894 
3895 	rec->opts.affinity = PERF_AFFINITY_SYS;
3896 
3897 	rec->evlist = evlist__new();
3898 	if (rec->evlist == NULL)
3899 		return -ENOMEM;
3900 
3901 	err = perf_config(perf_record_config, rec);
3902 	if (err)
3903 		return err;
3904 
3905 	argc = parse_options(argc, argv, record_options, record_usage,
3906 			    PARSE_OPT_STOP_AT_NON_OPTION);
3907 	if (quiet)
3908 		perf_quiet_option();
3909 
3910 	err = symbol__validate_sym_arguments();
3911 	if (err)
3912 		return err;
3913 
3914 	perf_debuginfod_setup(&record.debuginfod);
3915 
3916 	/* Make system wide (-a) the default target. */
3917 	if (!argc && target__none(&rec->opts.target))
3918 		rec->opts.target.system_wide = true;
3919 
3920 	if (nr_cgroups && !rec->opts.target.system_wide) {
3921 		usage_with_options_msg(record_usage, record_options,
3922 			"cgroup monitoring only available in system-wide mode");
3923 
3924 	}
3925 
3926 	if (rec->buildid_mmap) {
3927 		if (!perf_can_record_build_id()) {
3928 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3929 			err = -EINVAL;
3930 			goto out_opts;
3931 		}
3932 		pr_debug("Enabling build id in mmap2 events.\n");
3933 		/* Enable mmap build id synthesizing. */
3934 		symbol_conf.buildid_mmap2 = true;
3935 		/* Enable perf_event_attr::build_id bit. */
3936 		rec->opts.build_id = true;
3937 		/* Disable build id cache. */
3938 		rec->no_buildid = true;
3939 	}
3940 
3941 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
3942 		pr_err("Kernel has no cgroup sampling support.\n");
3943 		err = -EINVAL;
3944 		goto out_opts;
3945 	}
3946 
3947 	if (rec->opts.kcore)
3948 		rec->opts.text_poke = true;
3949 
3950 	if (rec->opts.kcore || record__threads_enabled(rec))
3951 		rec->data.is_dir = true;
3952 
3953 	if (record__threads_enabled(rec)) {
3954 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
3955 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
3956 			goto out_opts;
3957 		}
3958 		if (record__aio_enabled(rec)) {
3959 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
3960 			goto out_opts;
3961 		}
3962 	}
3963 
3964 	if (rec->opts.comp_level != 0) {
3965 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
3966 		rec->no_buildid = true;
3967 	}
3968 
3969 	if (rec->opts.record_switch_events &&
3970 	    !perf_can_record_switch_events()) {
3971 		ui__error("kernel does not support recording context switch events\n");
3972 		parse_options_usage(record_usage, record_options, "switch-events", 0);
3973 		err = -EINVAL;
3974 		goto out_opts;
3975 	}
3976 
3977 	if (switch_output_setup(rec)) {
3978 		parse_options_usage(record_usage, record_options, "switch-output", 0);
3979 		err = -EINVAL;
3980 		goto out_opts;
3981 	}
3982 
3983 	if (rec->switch_output.time) {
3984 		signal(SIGALRM, alarm_sig_handler);
3985 		alarm(rec->switch_output.time);
3986 	}
3987 
3988 	if (rec->switch_output.num_files) {
3989 		rec->switch_output.filenames = calloc(sizeof(char *),
3990 						      rec->switch_output.num_files);
3991 		if (!rec->switch_output.filenames) {
3992 			err = -EINVAL;
3993 			goto out_opts;
3994 		}
3995 	}
3996 
3997 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
3998 		rec->timestamp_filename = false;
3999 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4000 	}
4001 
4002 	/*
4003 	 * Allow aliases to facilitate the lookup of symbols for address
4004 	 * filters. Refer to auxtrace_parse_filters().
4005 	 */
4006 	symbol_conf.allow_aliases = true;
4007 
4008 	symbol__init(NULL);
4009 
4010 	err = record__auxtrace_init(rec);
4011 	if (err)
4012 		goto out;
4013 
4014 	if (dry_run)
4015 		goto out;
4016 
4017 	err = bpf__setup_stdout(rec->evlist);
4018 	if (err) {
4019 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4020 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4021 			 errbuf);
4022 		goto out;
4023 	}
4024 
4025 	err = -ENOMEM;
4026 
4027 	if (rec->no_buildid_cache || rec->no_buildid) {
4028 		disable_buildid_cache();
4029 	} else if (rec->switch_output.enabled) {
4030 		/*
4031 		 * In 'perf record --switch-output', disable buildid
4032 		 * generation by default to reduce data file switching
4033 		 * overhead. Still generate buildid if they are required
4034 		 * explicitly using
4035 		 *
4036 		 *  perf record --switch-output --no-no-buildid \
4037 		 *              --no-no-buildid-cache
4038 		 *
4039 		 * Following code equals to:
4040 		 *
4041 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4042 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4043 		 *         disable_buildid_cache();
4044 		 */
4045 		bool disable = true;
4046 
4047 		if (rec->no_buildid_set && !rec->no_buildid)
4048 			disable = false;
4049 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4050 			disable = false;
4051 		if (disable) {
4052 			rec->no_buildid = true;
4053 			rec->no_buildid_cache = true;
4054 			disable_buildid_cache();
4055 		}
4056 	}
4057 
4058 	if (record.opts.overwrite)
4059 		record.opts.tail_synthesize = true;
4060 
4061 	if (rec->evlist->core.nr_entries == 0) {
4062 		if (perf_pmu__has_hybrid()) {
4063 			err = evlist__add_default_hybrid(rec->evlist,
4064 							 !record.opts.no_samples);
4065 		} else {
4066 			err = __evlist__add_default(rec->evlist,
4067 						    !record.opts.no_samples);
4068 		}
4069 
4070 		if (err < 0) {
4071 			pr_err("Not enough memory for event selector list\n");
4072 			goto out;
4073 		}
4074 	}
4075 
4076 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4077 		rec->opts.no_inherit = true;
4078 
4079 	err = target__validate(&rec->opts.target);
4080 	if (err) {
4081 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4082 		ui__warning("%s\n", errbuf);
4083 	}
4084 
4085 	err = target__parse_uid(&rec->opts.target);
4086 	if (err) {
4087 		int saved_errno = errno;
4088 
4089 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4090 		ui__error("%s", errbuf);
4091 
4092 		err = -saved_errno;
4093 		goto out;
4094 	}
4095 
4096 	/* Enable ignoring missing threads when -u/-p option is defined. */
4097 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4098 
4099 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4100 		pr_err("failed to use cpu list %s\n",
4101 		       rec->opts.target.cpu_list);
4102 		goto out;
4103 	}
4104 
4105 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4106 
4107 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4108 		arch__add_leaf_frame_record_opts(&rec->opts);
4109 
4110 	err = -ENOMEM;
4111 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4112 		if (rec->opts.target.pid != NULL) {
4113 			pr_err("Couldn't create thread/CPU maps: %s\n",
4114 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4115 			goto out;
4116 		}
4117 		else
4118 			usage_with_options(record_usage, record_options);
4119 	}
4120 
4121 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4122 	if (err)
4123 		goto out;
4124 
4125 	/*
4126 	 * We take all buildids when the file contains
4127 	 * AUX area tracing data because we do not decode the
4128 	 * trace because it would take too long.
4129 	 */
4130 	if (rec->opts.full_auxtrace)
4131 		rec->buildid_all = true;
4132 
4133 	if (rec->opts.text_poke) {
4134 		err = record__config_text_poke(rec->evlist);
4135 		if (err) {
4136 			pr_err("record__config_text_poke failed, error %d\n", err);
4137 			goto out;
4138 		}
4139 	}
4140 
4141 	if (rec->off_cpu) {
4142 		err = record__config_off_cpu(rec);
4143 		if (err) {
4144 			pr_err("record__config_off_cpu failed, error %d\n", err);
4145 			goto out;
4146 		}
4147 	}
4148 
4149 	if (record_opts__config(&rec->opts)) {
4150 		err = -EINVAL;
4151 		goto out;
4152 	}
4153 
4154 	err = record__init_thread_masks(rec);
4155 	if (err) {
4156 		pr_err("Failed to initialize parallel data streaming masks\n");
4157 		goto out;
4158 	}
4159 
4160 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4161 		rec->opts.nr_cblocks = nr_cblocks_max;
4162 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4163 
4164 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4165 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4166 
4167 	if (rec->opts.comp_level > comp_level_max)
4168 		rec->opts.comp_level = comp_level_max;
4169 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4170 
4171 	err = __cmd_record(&record, argc, argv);
4172 out:
4173 	evlist__delete(rec->evlist);
4174 	symbol__exit();
4175 	auxtrace_record__free(rec->itr);
4176 out_opts:
4177 	record__free_thread_masks(rec, rec->nr_threads);
4178 	rec->nr_threads = 0;
4179 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4180 	return err;
4181 }
4182 
4183 static void snapshot_sig_handler(int sig __maybe_unused)
4184 {
4185 	struct record *rec = &record;
4186 
4187 	hit_auxtrace_snapshot_trigger(rec);
4188 
4189 	if (switch_output_signal(rec))
4190 		trigger_hit(&switch_output_trigger);
4191 }
4192 
4193 static void alarm_sig_handler(int sig __maybe_unused)
4194 {
4195 	struct record *rec = &record;
4196 
4197 	if (switch_output_time(rec))
4198 		trigger_hit(&switch_output_trigger);
4199 }
4200