xref: /linux/tools/perf/builtin-record.c (revision 9f2c9170934eace462499ba0bfe042cc72900173)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84 
/*
 * Output-switching settings and bookkeeping.
 *
 * @signal, @size and @time are the three arming conditions; each one is
 * evaluated together with switch_output_trigger by the matching
 * switch_output_*() helper in this file.
 */
struct switch_output {
	bool enabled;
	bool signal;		/* tested by switch_output_signal() */
	unsigned long size;	/* compared with rec->bytes_written; 0 disables the check */
	unsigned long time;	/* tested by switch_output_time(); 0 disables the check */
	const char *str;
	bool set;
	char **filenames;
	int num_files;
	int cur_file;
};
96 
97 struct thread_mask {
98 	struct mmap_cpu_mask	maps;
99 	struct mmap_cpu_mask	affinity;
100 };
101 
102 struct record_thread {
103 	pid_t			tid;
104 	struct thread_mask	*mask;
105 	struct {
106 		int		msg[2];
107 		int		ack[2];
108 	} pipes;
109 	struct fdarray		pollfd;
110 	int			ctlfd_pos;
111 	int			nr_mmaps;
112 	struct mmap		**maps;
113 	struct mmap		**overwrite_maps;
114 	struct record		*rec;
115 	unsigned long long	samples;
116 	unsigned long		waking;
117 	u64			bytes_written;
118 	u64			bytes_transferred;
119 	u64			bytes_compressed;
120 };
121 
122 static __thread struct record_thread *thread;
123 
/* Message identifiers; THREAD_MSG__MAX is the number of valid values. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name of every thread_msg value, indexed by the enumerator. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
133 
/* Thread layout specifications; THREAD_SPEC__MAX is the number of valid values. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Textual tag of every thread_spec value, indexed by the enumerator. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
147 
/*
 * Associates a slot of the evlist pollfd array with the slot holding its
 * duplicate in a thread's pollfd array.
 */
struct pollfd_index_map {
	int evlist_pollfd_index;	/* index into evlist->core.pollfd */
	int thread_pollfd_index;	/* index into record_thread::pollfd */
};
152 
153 struct record {
154 	struct perf_tool	tool;
155 	struct record_opts	opts;
156 	u64			bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
#ifndef HAVE_GETTID
/* Fallback used when HAVE_GETTID is not defined: issue the raw system call. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	int t;
230 	u64 bytes_written = rec->bytes_written;
231 	struct record_thread *thread_data = rec->thread_data;
232 
233 	for (t = 0; t < rec->nr_threads; t++)
234 		bytes_written += thread_data[t].bytes_written;
235 
236 	return bytes_written;
237 }
238 
239 static bool record__output_max_size_exceeded(struct record *rec)
240 {
241 	return rec->output_max_size &&
242 	       (record__bytes_written(rec) >= rec->output_max_size);
243 }
244 
245 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
246 			 void *bf, size_t size)
247 {
248 	struct perf_data_file *file = &rec->session->data->file;
249 
250 	if (map && map->file)
251 		file = map->file;
252 
253 	if (perf_data_file__write(file, bf, size) < 0) {
254 		pr_err("failed to write perf data, error: %m\n");
255 		return -1;
256 	}
257 
258 	if (map && map->file)
259 		thread->bytes_written += size;
260 	else
261 		rec->bytes_written += size;
262 
263 	if (record__output_max_size_exceeded(rec) && !done) {
264 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
265 				" stopping session ]\n",
266 				record__bytes_written(rec) >> 10);
267 		done = 1;
268 	}
269 
270 	if (switch_output_size(rec))
271 		trigger_hit(&switch_output_trigger);
272 
273 	return 0;
274 }
275 
/* Defined further down in this file; declared here for the AIO code below. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session,
			    struct mmap *map,
			    void *dst, size_t dst_size,
			    void *src, size_t src_size);
280 
281 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue it with aio_write(), retrying for as long as
 * the call fails with EAGAIN.
 *
 * Returns the result of the last aio_write() call: 0 when the request was
 * queued.  On any other failure the control block is marked unused
 * (aio_fildes = -1) and an error is logged.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
306 
307 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
308 {
309 	void *rem_buf;
310 	off_t rem_off;
311 	size_t rem_size;
312 	int rc, aio_errno;
313 	ssize_t aio_ret, written;
314 
315 	aio_errno = aio_error(cblock);
316 	if (aio_errno == EINPROGRESS)
317 		return 0;
318 
319 	written = aio_ret = aio_return(cblock);
320 	if (aio_ret < 0) {
321 		if (aio_errno != EINTR)
322 			pr_err("failed to write perf data, error: %m\n");
323 		written = 0;
324 	}
325 
326 	rem_size = cblock->aio_nbytes - written;
327 
328 	if (rem_size == 0) {
329 		cblock->aio_fildes = -1;
330 		/*
331 		 * md->refcount is incremented in record__aio_pushfn() for
332 		 * every aio write request started in record__aio_push() so
333 		 * decrement it because the request is now complete.
334 		 */
335 		perf_mmap__put(&md->core);
336 		rc = 1;
337 	} else {
338 		/*
339 		 * aio write request may require restart with the
340 		 * reminder if the kernel didn't write whole
341 		 * chunk at once.
342 		 */
343 		rem_off = cblock->aio_offset + written;
344 		rem_buf = (void *)(cblock->aio_buf + written);
345 		record__aio_write(cblock, cblock->aio_fildes,
346 				rem_buf, rem_size, rem_off);
347 		rc = 0;
348 	}
349 
350 	return rc;
351 }
352 
353 static int record__aio_sync(struct mmap *md, bool sync_all)
354 {
355 	struct aiocb **aiocb = md->aio.aiocb;
356 	struct aiocb *cblocks = md->aio.cblocks;
357 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
358 	int i, do_suspend;
359 
360 	do {
361 		do_suspend = 0;
362 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
363 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
364 				if (sync_all)
365 					aiocb[i] = NULL;
366 				else
367 					return i;
368 			} else {
369 				/*
370 				 * Started aio write is not complete yet
371 				 * so it has to be waited before the
372 				 * next allocation.
373 				 */
374 				aiocb[i] = &cblocks[i];
375 				do_suspend = 1;
376 			}
377 		}
378 		if (!do_suspend)
379 			return -1;
380 
381 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
382 			if (!(errno == EAGAIN || errno == EINTR))
383 				pr_err("failed to sync perf data, error: %m\n");
384 		}
385 	} while (1);
386 }
387 
/* Cookie handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record *rec;	/* owning session */
	void *data;		/* destination buffer, one of map->aio.data[] */
	size_t size;		/* bytes gathered in @data so far */
};
393 
394 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
395 {
396 	struct record_aio *aio = to;
397 
398 	/*
399 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
400 	 * to release space in the kernel buffer as fast as possible, calling
401 	 * perf_mmap__consume() from perf_mmap__push() function.
402 	 *
403 	 * That lets the kernel to proceed with storing more profiling data into
404 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
405 	 *
406 	 * Coping can be done in two steps in case the chunk of profiling data
407 	 * crosses the upper bound of the kernel buffer. In this case we first move
408 	 * part of data from map->start till the upper bound and then the reminder
409 	 * from the beginning of the kernel buffer till the end of the data chunk.
410 	 */
411 
412 	if (record__comp_enabled(aio->rec)) {
413 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
414 				     mmap__mmap_len(map) - aio->size,
415 				     buf, size);
416 	} else {
417 		memcpy(aio->data + aio->size, buf, size);
418 	}
419 
420 	if (!aio->size) {
421 		/*
422 		 * Increment map->refcount to guard map->aio.data[] buffer
423 		 * from premature deallocation because map object can be
424 		 * released earlier than aio write request started on
425 		 * map->aio.data[] buffer is complete.
426 		 *
427 		 * perf_mmap__put() is done at record__aio_complete()
428 		 * after started aio request completion or at record__aio_push()
429 		 * if the request failed to start.
430 		 */
431 		perf_mmap__get(&map->core);
432 	}
433 
434 	aio->size += size;
435 
436 	return size;
437 }
438 
439 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
440 {
441 	int ret, idx;
442 	int trace_fd = rec->session->data->file.fd;
443 	struct record_aio aio = { .rec = rec, .size = 0 };
444 
445 	/*
446 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
447 	 * becomes available after previous aio write operation.
448 	 */
449 
450 	idx = record__aio_sync(map, false);
451 	aio.data = map->aio.data[idx];
452 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
453 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
454 		return ret;
455 
456 	rec->samples++;
457 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
458 	if (!ret) {
459 		*off += aio.size;
460 		rec->bytes_written += aio.size;
461 		if (switch_output_size(rec))
462 			trigger_hit(&switch_output_trigger);
463 	} else {
464 		/*
465 		 * Decrement map->refcount incremented in record__aio_pushfn()
466 		 * back if record__aio_write() operation failed to start, otherwise
467 		 * map->refcount is decremented in record__aio_complete() after
468 		 * aio write operation finishes successfully.
469 		 */
470 		perf_mmap__put(&map->core);
471 	}
472 
473 	return ret;
474 }
475 
/* Current file offset of @trace_fd as reported by lseek(), -1 on failure. */
static off_t record__aio_get_pos(int trace_fd)
{
	const off_t cur = lseek(trace_fd, 0, SEEK_CUR);

	return cur;
}
480 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
485 
486 static void record__aio_mmap_read_sync(struct record *rec)
487 {
488 	int i;
489 	struct evlist *evlist = rec->evlist;
490 	struct mmap *maps = evlist->mmap;
491 
492 	if (!record__aio_enabled(rec))
493 		return;
494 
495 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
496 		struct mmap *map = &maps[i];
497 
498 		if (map->core.base)
499 			record__aio_sync(map, true);
500 	}
501 }
502 
503 static int nr_cblocks_default = 1;
504 static int nr_cblocks_max = 4;
505 
506 static int record__aio_parse(const struct option *opt,
507 			     const char *str,
508 			     int unset)
509 {
510 	struct record_opts *opts = (struct record_opts *)opt->value;
511 
512 	if (unset) {
513 		opts->nr_cblocks = 0;
514 	} else {
515 		if (str)
516 			opts->nr_cblocks = strtol(str, NULL, 0);
517 		if (!opts->nr_cblocks)
518 			opts->nr_cblocks = nr_cblocks_default;
519 	}
520 
521 	return 0;
522 }
523 #else /* HAVE_AIO_SUPPORT */
524 static int nr_cblocks_max = 0;
525 
526 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
527 			    off_t *off __maybe_unused)
528 {
529 	return -1;
530 }
531 
532 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
533 {
534 	return -1;
535 }
536 
537 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
538 {
539 }
540 
541 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
542 {
543 }
544 #endif
545 
546 static int record__aio_enabled(struct record *rec)
547 {
548 	return rec->opts.nr_cblocks > 0;
549 }
550 
551 #define MMAP_FLUSH_DEFAULT 1
552 static int record__mmap_flush_parse(const struct option *opt,
553 				    const char *str,
554 				    int unset)
555 {
556 	int flush_max;
557 	struct record_opts *opts = (struct record_opts *)opt->value;
558 	static struct parse_tag tags[] = {
559 			{ .tag  = 'B', .mult = 1       },
560 			{ .tag  = 'K', .mult = 1 << 10 },
561 			{ .tag  = 'M', .mult = 1 << 20 },
562 			{ .tag  = 'G', .mult = 1 << 30 },
563 			{ .tag  = 0 },
564 	};
565 
566 	if (unset)
567 		return 0;
568 
569 	if (str) {
570 		opts->mmap_flush = parse_tag_value(str, tags);
571 		if (opts->mmap_flush == (int)-1)
572 			opts->mmap_flush = strtol(str, NULL, 0);
573 	}
574 
575 	if (!opts->mmap_flush)
576 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
577 
578 	flush_max = evlist__mmap_size(opts->mmap_pages);
579 	flush_max /= 4;
580 	if (opts->mmap_flush > flush_max)
581 		opts->mmap_flush = flush_max;
582 
583 	return 0;
584 }
585 
586 #ifdef HAVE_ZSTD_SUPPORT
587 static unsigned int comp_level_default = 1;
588 
589 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
590 {
591 	struct record_opts *opts = opt->value;
592 
593 	if (unset) {
594 		opts->comp_level = 0;
595 	} else {
596 		if (str)
597 			opts->comp_level = strtol(str, NULL, 0);
598 		if (!opts->comp_level)
599 			opts->comp_level = comp_level_default;
600 	}
601 
602 	return 0;
603 }
604 #endif
605 static unsigned int comp_level_max = 22;
606 
607 static int record__comp_enabled(struct record *rec)
608 {
609 	return rec->opts.comp_level > 0;
610 }
611 
612 static int process_synthesized_event(struct perf_tool *tool,
613 				     union perf_event *event,
614 				     struct perf_sample *sample __maybe_unused,
615 				     struct machine *machine __maybe_unused)
616 {
617 	struct record *rec = container_of(tool, struct record, tool);
618 	return record__write(rec, NULL, event, event->header.size);
619 }
620 
621 static struct mutex synth_lock;
622 
623 static int process_locked_synthesized_event(struct perf_tool *tool,
624 				     union perf_event *event,
625 				     struct perf_sample *sample __maybe_unused,
626 				     struct machine *machine __maybe_unused)
627 {
628 	int ret;
629 
630 	mutex_lock(&synth_lock);
631 	ret = process_synthesized_event(tool, event, sample, machine);
632 	mutex_unlock(&synth_lock);
633 	return ret;
634 }
635 
636 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
637 {
638 	struct record *rec = to;
639 
640 	if (record__comp_enabled(rec)) {
641 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
642 		bf   = map->data;
643 	}
644 
645 	thread->samples++;
646 	return record__write(rec, map, bf, size);
647 }
648 
649 static volatile sig_atomic_t signr = -1;
650 static volatile sig_atomic_t child_finished;
651 #ifdef HAVE_EVENTFD_SUPPORT
652 static volatile sig_atomic_t done_fd = -1;
653 #endif
654 
655 static void sig_handler(int sig)
656 {
657 	if (sig == SIGCHLD)
658 		child_finished = 1;
659 	else
660 		signr = sig;
661 
662 	done = 1;
663 #ifdef HAVE_EVENTFD_SUPPORT
664 	if (done_fd >= 0) {
665 		u64 tmp = 1;
666 		int orig_errno = errno;
667 
668 		/*
669 		 * It is possible for this signal handler to run after done is
670 		 * checked in the main loop, but before the perf counter fds are
671 		 * polled. If this happens, the poll() will continue to wait
672 		 * even though done is set, and will only break out if either
673 		 * another signal is received, or the counters are ready for
674 		 * read. To ensure the poll() doesn't sleep when done is set,
675 		 * use an eventfd (done_fd) to wake up the poll().
676 		 */
677 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
678 			pr_err("failed to signal wakeup fd, error: %m\n");
679 
680 		errno = orig_errno;
681 	}
682 #endif // HAVE_EVENTFD_SUPPORT
683 }
684 
/* Fault handler: run perf_hooks__recover(), then dump the stack for @sig. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
690 
691 static void record__sig_exit(void)
692 {
693 	if (signr == -1)
694 		return;
695 
696 	signal(signr, SIG_DFL);
697 	raise(signr);
698 }
699 
700 #ifdef HAVE_AUXTRACE_SUPPORT
701 
702 static int record__process_auxtrace(struct perf_tool *tool,
703 				    struct mmap *map,
704 				    union perf_event *event, void *data1,
705 				    size_t len1, void *data2, size_t len2)
706 {
707 	struct record *rec = container_of(tool, struct record, tool);
708 	struct perf_data *data = &rec->data;
709 	size_t padding;
710 	u8 pad[8] = {0};
711 
712 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
713 		off_t file_offset;
714 		int fd = perf_data__fd(data);
715 		int err;
716 
717 		file_offset = lseek(fd, 0, SEEK_CUR);
718 		if (file_offset == -1)
719 			return -1;
720 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
721 						     event, file_offset);
722 		if (err)
723 			return err;
724 	}
725 
726 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
727 	padding = (len1 + len2) & 7;
728 	if (padding)
729 		padding = 8 - padding;
730 
731 	record__write(rec, map, event, event->header.size);
732 	record__write(rec, map, data1, len1);
733 	if (len2)
734 		record__write(rec, map, data2, len2);
735 	record__write(rec, map, &pad, padding);
736 
737 	return 0;
738 }
739 
740 static int record__auxtrace_mmap_read(struct record *rec,
741 				      struct mmap *map)
742 {
743 	int ret;
744 
745 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
746 				  record__process_auxtrace);
747 	if (ret < 0)
748 		return ret;
749 
750 	if (ret)
751 		rec->samples++;
752 
753 	return 0;
754 }
755 
756 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
757 					       struct mmap *map)
758 {
759 	int ret;
760 
761 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
762 					   record__process_auxtrace,
763 					   rec->opts.auxtrace_snapshot_size);
764 	if (ret < 0)
765 		return ret;
766 
767 	if (ret)
768 		rec->samples++;
769 
770 	return 0;
771 }
772 
773 static int record__auxtrace_read_snapshot_all(struct record *rec)
774 {
775 	int i;
776 	int rc = 0;
777 
778 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
779 		struct mmap *map = &rec->evlist->mmap[i];
780 
781 		if (!map->auxtrace_mmap.base)
782 			continue;
783 
784 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
785 			rc = -1;
786 			goto out;
787 		}
788 	}
789 out:
790 	return rc;
791 }
792 
793 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
794 {
795 	pr_debug("Recording AUX area tracing snapshot\n");
796 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
797 		trigger_error(&auxtrace_snapshot_trigger);
798 	} else {
799 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
800 			trigger_error(&auxtrace_snapshot_trigger);
801 		else
802 			trigger_ready(&auxtrace_snapshot_trigger);
803 	}
804 }
805 
806 static int record__auxtrace_snapshot_exit(struct record *rec)
807 {
808 	if (trigger_is_error(&auxtrace_snapshot_trigger))
809 		return 0;
810 
811 	if (!auxtrace_record__snapshot_started &&
812 	    auxtrace_record__snapshot_start(rec->itr))
813 		return -1;
814 
815 	record__read_auxtrace_snapshot(rec, true);
816 	if (trigger_is_error(&auxtrace_snapshot_trigger))
817 		return -1;
818 
819 	return 0;
820 }
821 
822 static int record__auxtrace_init(struct record *rec)
823 {
824 	int err;
825 
826 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
827 	    && record__threads_enabled(rec)) {
828 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
829 		return -EINVAL;
830 	}
831 
832 	if (!rec->itr) {
833 		rec->itr = auxtrace_record__init(rec->evlist, &err);
834 		if (err)
835 			return err;
836 	}
837 
838 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
839 					      rec->opts.auxtrace_snapshot_opts);
840 	if (err)
841 		return err;
842 
843 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
844 					    rec->opts.auxtrace_sample_opts);
845 	if (err)
846 		return err;
847 
848 	auxtrace_regroup_aux_output(rec->evlist);
849 
850 	return auxtrace_parse_filters(rec->evlist);
851 }
852 
853 #else
854 
855 static inline
856 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
857 			       struct mmap *map __maybe_unused)
858 {
859 	return 0;
860 }
861 
862 static inline
863 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
864 				    bool on_exit __maybe_unused)
865 {
866 }
867 
868 static inline
869 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
870 {
871 	return 0;
872 }
873 
874 static inline
875 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
876 {
877 	return 0;
878 }
879 
880 static int record__auxtrace_init(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 #endif
886 
887 static int record__config_text_poke(struct evlist *evlist)
888 {
889 	struct evsel *evsel;
890 
891 	/* Nothing to do if text poke is already configured */
892 	evlist__for_each_entry(evlist, evsel) {
893 		if (evsel->core.attr.text_poke)
894 			return 0;
895 	}
896 
897 	evsel = evlist__add_dummy_on_all_cpus(evlist);
898 	if (!evsel)
899 		return -ENOMEM;
900 
901 	evsel->core.attr.text_poke = 1;
902 	evsel->core.attr.ksymbol = 1;
903 	evsel->immediate = true;
904 	evsel__set_sample_bit(evsel, TIME);
905 
906 	return 0;
907 }
908 
909 static int record__config_off_cpu(struct record *rec)
910 {
911 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
912 }
913 
914 static bool record__kcore_readable(struct machine *machine)
915 {
916 	char kcore[PATH_MAX];
917 	int fd;
918 
919 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
920 
921 	fd = open(kcore, O_RDONLY);
922 	if (fd < 0)
923 		return false;
924 
925 	close(fd);
926 
927 	return true;
928 }
929 
930 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
931 {
932 	char from_dir[PATH_MAX];
933 	char kcore_dir[PATH_MAX];
934 	int ret;
935 
936 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
937 
938 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
939 	if (ret)
940 		return ret;
941 
942 	return kcore_copy(from_dir, kcore_dir);
943 }
944 
945 static void record__thread_data_init_pipes(struct record_thread *thread_data)
946 {
947 	thread_data->pipes.msg[0] = -1;
948 	thread_data->pipes.msg[1] = -1;
949 	thread_data->pipes.ack[0] = -1;
950 	thread_data->pipes.ack[1] = -1;
951 }
952 
953 static int record__thread_data_open_pipes(struct record_thread *thread_data)
954 {
955 	if (pipe(thread_data->pipes.msg))
956 		return -EINVAL;
957 
958 	if (pipe(thread_data->pipes.ack)) {
959 		close(thread_data->pipes.msg[0]);
960 		thread_data->pipes.msg[0] = -1;
961 		close(thread_data->pipes.msg[1]);
962 		thread_data->pipes.msg[1] = -1;
963 		return -EINVAL;
964 	}
965 
966 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
967 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
968 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
969 
970 	return 0;
971 }
972 
973 static void record__thread_data_close_pipes(struct record_thread *thread_data)
974 {
975 	if (thread_data->pipes.msg[0] != -1) {
976 		close(thread_data->pipes.msg[0]);
977 		thread_data->pipes.msg[0] = -1;
978 	}
979 	if (thread_data->pipes.msg[1] != -1) {
980 		close(thread_data->pipes.msg[1]);
981 		thread_data->pipes.msg[1] = -1;
982 	}
983 	if (thread_data->pipes.ack[0] != -1) {
984 		close(thread_data->pipes.ack[0]);
985 		thread_data->pipes.ack[0] = -1;
986 	}
987 	if (thread_data->pipes.ack[1] != -1) {
988 		close(thread_data->pipes.ack[1]);
989 		thread_data->pipes.ack[1] = -1;
990 	}
991 }
992 
993 static bool evlist__per_thread(struct evlist *evlist)
994 {
995 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
996 }
997 
998 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
999 {
1000 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1001 	struct mmap *mmap = evlist->mmap;
1002 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1003 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1004 	bool per_thread = evlist__per_thread(evlist);
1005 
1006 	if (per_thread)
1007 		thread_data->nr_mmaps = nr_mmaps;
1008 	else
1009 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1010 						      thread_data->mask->maps.nbits);
1011 	if (mmap) {
1012 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1013 		if (!thread_data->maps)
1014 			return -ENOMEM;
1015 	}
1016 	if (overwrite_mmap) {
1017 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1018 		if (!thread_data->overwrite_maps) {
1019 			zfree(&thread_data->maps);
1020 			return -ENOMEM;
1021 		}
1022 	}
1023 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1024 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1025 
1026 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1027 		if (per_thread ||
1028 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1029 			if (thread_data->maps) {
1030 				thread_data->maps[tm] = &mmap[m];
1031 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1032 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1033 			}
1034 			if (thread_data->overwrite_maps) {
1035 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1036 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1037 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1038 			}
1039 			tm++;
1040 		}
1041 	}
1042 
1043 	return 0;
1044 }
1045 
1046 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1047 {
1048 	int f, tm, pos;
1049 	struct mmap *map, *overwrite_map;
1050 
1051 	fdarray__init(&thread_data->pollfd, 64);
1052 
1053 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1054 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1055 		overwrite_map = thread_data->overwrite_maps ?
1056 				thread_data->overwrite_maps[tm] : NULL;
1057 
1058 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1059 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1060 
1061 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1062 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1063 							      &evlist->core.pollfd);
1064 				if (pos < 0)
1065 					return pos;
1066 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1067 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1068 			}
1069 		}
1070 	}
1071 
1072 	return 0;
1073 }
1074 
1075 static void record__free_thread_data(struct record *rec)
1076 {
1077 	int t;
1078 	struct record_thread *thread_data = rec->thread_data;
1079 
1080 	if (thread_data == NULL)
1081 		return;
1082 
1083 	for (t = 0; t < rec->nr_threads; t++) {
1084 		record__thread_data_close_pipes(&thread_data[t]);
1085 		zfree(&thread_data[t].maps);
1086 		zfree(&thread_data[t].overwrite_maps);
1087 		fdarray__exit(&thread_data[t].pollfd);
1088 	}
1089 
1090 	zfree(&rec->thread_data);
1091 }
1092 
1093 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1094 						    int evlist_pollfd_index,
1095 						    int thread_pollfd_index)
1096 {
1097 	size_t x = rec->index_map_cnt;
1098 
1099 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1100 		return -ENOMEM;
1101 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1102 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1103 	rec->index_map_cnt += 1;
1104 	return 0;
1105 }
1106 
1107 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1108 						    struct evlist *evlist,
1109 						    struct record_thread *thread_data)
1110 {
1111 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1112 	struct pollfd *t_entries = thread_data->pollfd.entries;
1113 	int err = 0;
1114 	size_t i;
1115 
1116 	for (i = 0; i < rec->index_map_cnt; i++) {
1117 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1118 		int t_pos = rec->index_map[i].thread_pollfd_index;
1119 
1120 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1121 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1122 			pr_err("Thread and evlist pollfd index mismatch\n");
1123 			err = -EINVAL;
1124 			continue;
1125 		}
1126 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1127 	}
1128 	return err;
1129 }
1130 
1131 static int record__dup_non_perf_events(struct record *rec,
1132 				       struct evlist *evlist,
1133 				       struct record_thread *thread_data)
1134 {
1135 	struct fdarray *fda = &evlist->core.pollfd;
1136 	int i, ret;
1137 
1138 	for (i = 0; i < fda->nr; i++) {
1139 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1140 			continue;
1141 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1142 		if (ret < 0) {
1143 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1144 			return ret;
1145 		}
1146 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1147 			  thread_data, ret, fda->entries[i].fd);
1148 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1149 		if (ret < 0) {
1150 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1151 			return ret;
1152 		}
1153 	}
1154 	return 0;
1155 }
1156 
1157 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1158 {
1159 	int t, ret;
1160 	struct record_thread *thread_data;
1161 
1162 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1163 	if (!rec->thread_data) {
1164 		pr_err("Failed to allocate thread data\n");
1165 		return -ENOMEM;
1166 	}
1167 	thread_data = rec->thread_data;
1168 
1169 	for (t = 0; t < rec->nr_threads; t++)
1170 		record__thread_data_init_pipes(&thread_data[t]);
1171 
1172 	for (t = 0; t < rec->nr_threads; t++) {
1173 		thread_data[t].rec = rec;
1174 		thread_data[t].mask = &rec->thread_masks[t];
1175 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1176 		if (ret) {
1177 			pr_err("Failed to initialize thread[%d] maps\n", t);
1178 			goto out_free;
1179 		}
1180 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1181 		if (ret) {
1182 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1183 			goto out_free;
1184 		}
1185 		if (t) {
1186 			thread_data[t].tid = -1;
1187 			ret = record__thread_data_open_pipes(&thread_data[t]);
1188 			if (ret) {
1189 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1190 				goto out_free;
1191 			}
1192 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1193 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1194 			if (ret < 0) {
1195 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1196 				goto out_free;
1197 			}
1198 			thread_data[t].ctlfd_pos = ret;
1199 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1200 				 thread_data, thread_data[t].ctlfd_pos,
1201 				 thread_data[t].pipes.msg[0]);
1202 		} else {
1203 			thread_data[t].tid = gettid();
1204 
1205 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1206 			if (ret < 0)
1207 				goto out_free;
1208 
1209 			thread_data[t].ctlfd_pos = -1; /* Not used */
1210 		}
1211 	}
1212 
1213 	return 0;
1214 
1215 out_free:
1216 	record__free_thread_data(rec);
1217 
1218 	return ret;
1219 }
1220 
1221 static int record__mmap_evlist(struct record *rec,
1222 			       struct evlist *evlist)
1223 {
1224 	int i, ret;
1225 	struct record_opts *opts = &rec->opts;
1226 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1227 				  opts->auxtrace_sample_mode;
1228 	char msg[512];
1229 
1230 	if (opts->affinity != PERF_AFFINITY_SYS)
1231 		cpu__setup_cpunode_map();
1232 
1233 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1234 				 opts->auxtrace_mmap_pages,
1235 				 auxtrace_overwrite,
1236 				 opts->nr_cblocks, opts->affinity,
1237 				 opts->mmap_flush, opts->comp_level) < 0) {
1238 		if (errno == EPERM) {
1239 			pr_err("Permission error mapping pages.\n"
1240 			       "Consider increasing "
1241 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1242 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1243 			       "(current value: %u,%u)\n",
1244 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1245 			return -errno;
1246 		} else {
1247 			pr_err("failed to mmap with %d (%s)\n", errno,
1248 				str_error_r(errno, msg, sizeof(msg)));
1249 			if (errno)
1250 				return -errno;
1251 			else
1252 				return -EINVAL;
1253 		}
1254 	}
1255 
1256 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1257 		return -1;
1258 
1259 	ret = record__alloc_thread_data(rec, evlist);
1260 	if (ret)
1261 		return ret;
1262 
1263 	if (record__threads_enabled(rec)) {
1264 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1265 		if (ret) {
1266 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1267 			return ret;
1268 		}
1269 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1270 			if (evlist->mmap)
1271 				evlist->mmap[i].file = &rec->data.dir.files[i];
1272 			if (evlist->overwrite_mmap)
1273 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1274 		}
1275 	}
1276 
1277 	return 0;
1278 }
1279 
/* Run record__mmap_evlist() on rec->evlist and return its result. */
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
1284 
/*
 * Configure and open every event in rec->evlist, apply the event filters,
 * map the buffers and attach the evlist to the session.
 *
 * Returns 0 on success; -ENOMEM if the dummy event could not be added,
 * -errno if an event could not be opened, -1 if a filter could not be
 * applied, or the result of record__mmap().
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay, system wide or a hybrid system, we need to add a
	 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
	 * of waiting or event synthesis.
	 */
	if (opts->initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmu__has_hybrid()) {
		pos = evlist__get_tracking_event(evlist);
		if (!evsel__is_dummy_event(pos)) {
			/* Set up dummy event. */
			if (evlist__add_dummy(evlist))
				return -ENOMEM;
			pos = evlist__last(evlist);
			evlist__set_tracking_event(evlist, pos);
		}

		/*
		 * Enable the dummy event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->initial_delay && !pos->immediate &&
		    !target__has_cpu(&opts->target))
			pos->core.attr.enable_on_exec = 1;
		else
			pos->immediate = 1;
	}

	evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/* Retry when evsel__fallback() reports it adjusted the event. */
			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/*
			 * A non-leader member of a weak group failed with
			 * EINVAL/EBADF: reset the group and retry from the
			 * event evlist__reset_weak_group() hands back.
			 */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			/* Capture errno before the reporting helpers run. */
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure pos is used below to name the offending event and filter. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1374 
1375 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1376 {
1377 	if (rec->evlist->first_sample_time == 0)
1378 		rec->evlist->first_sample_time = sample_time;
1379 
1380 	if (sample_time)
1381 		rec->evlist->last_sample_time = sample_time;
1382 }
1383 
1384 static int process_sample_event(struct perf_tool *tool,
1385 				union perf_event *event,
1386 				struct perf_sample *sample,
1387 				struct evsel *evsel,
1388 				struct machine *machine)
1389 {
1390 	struct record *rec = container_of(tool, struct record, tool);
1391 
1392 	set_timestamp_boundary(rec, sample->time);
1393 
1394 	if (rec->buildid_all)
1395 		return 0;
1396 
1397 	rec->samples++;
1398 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1399 }
1400 
1401 static int process_buildids(struct record *rec)
1402 {
1403 	struct perf_session *session = rec->session;
1404 
1405 	if (perf_data__size(&rec->data) == 0)
1406 		return 0;
1407 
1408 	/*
1409 	 * During this process, it'll load kernel map and replace the
1410 	 * dso->long_name to a real pathname it found.  In this case
1411 	 * we prefer the vmlinux path like
1412 	 *   /lib/modules/3.16.4/build/vmlinux
1413 	 *
1414 	 * rather than build-id path (in debug directory).
1415 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1416 	 */
1417 	symbol_conf.ignore_vmlinux_buildid = true;
1418 
1419 	/*
1420 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1421 	 * so no need to process samples. But if timestamp_boundary is enabled,
1422 	 * it still needs to walk on all samples to get the timestamps of
1423 	 * first/last samples.
1424 	 */
1425 	if (rec->buildid_all && !rec->timestamp_boundary)
1426 		rec->tool.sample = NULL;
1427 
1428 	return perf_session__process_events(session);
1429 }
1430 
1431 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1432 {
1433 	int err;
1434 	struct perf_tool *tool = data;
1435 	/*
1436 	 *As for guest kernel when processing subcommand record&report,
1437 	 *we arrange module mmap prior to guest kernel mmap and trigger
1438 	 *a preload dso because default guest module symbols are loaded
1439 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1440 	 *method is used to avoid symbol missing when the first addr is
1441 	 *in module instead of in guest kernel.
1442 	 */
1443 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1444 					     machine);
1445 	if (err < 0)
1446 		pr_err("Couldn't record guest kernel [%d]'s reference"
1447 		       " relocation symbol.\n", machine->pid);
1448 
1449 	/*
1450 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1451 	 * have no _text sometimes.
1452 	 */
1453 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1454 						 machine);
1455 	if (err < 0)
1456 		pr_err("Couldn't record guest kernel [%d]'s reference"
1457 		       " relocation symbol.\n", machine->pid);
1458 }
1459 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event; its size is just the
 * header. Written by record__mmap_read_evlist() after a pass that wrote
 * data, when threaded recording is not enabled.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

/*
 * Header-only PERF_RECORD_FINISHED_INIT event; its size is just the
 * header. Written by write_finished_init().
 */
static struct perf_event_header finished_init_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_INIT,
};
1469 
1470 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1471 {
1472 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1473 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1474 			  thread->mask->affinity.nbits)) {
1475 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1476 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1477 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1478 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1479 					(cpu_set_t *)thread->mask->affinity.bits);
1480 		if (verbose == 2) {
1481 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1482 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1483 		}
1484 	}
1485 }
1486 
1487 static size_t process_comp_header(void *record, size_t increment)
1488 {
1489 	struct perf_record_compressed *event = record;
1490 	size_t size = sizeof(*event);
1491 
1492 	if (increment) {
1493 		event->header.size += increment;
1494 		return increment;
1495 	}
1496 
1497 	event->header.type = PERF_RECORD_COMPRESSED;
1498 	event->header.size = size;
1499 
1500 	return size;
1501 }
1502 
1503 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1504 			    void *dst, size_t dst_size, void *src, size_t src_size)
1505 {
1506 	size_t compressed;
1507 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1508 	struct zstd_data *zstd_data = &session->zstd_data;
1509 
1510 	if (map && map->file)
1511 		zstd_data = &map->zstd_data;
1512 
1513 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1514 						     max_record_size, process_comp_header);
1515 
1516 	if (map && map->file) {
1517 		thread->bytes_transferred += src_size;
1518 		thread->bytes_compressed  += compressed;
1519 	} else {
1520 		session->bytes_transferred += src_size;
1521 		session->bytes_compressed  += compressed;
1522 	}
1523 
1524 	return compressed;
1525 }
1526 
/*
 * Drain the ring buffers owned by the file-scope `thread`.
 *
 * @overwrite selects thread->overwrite_maps instead of thread->maps; the
 * overwrite maps are only read while @evlist is in the
 * BKW_MMAP_DATA_PENDING state. With @synch set, each map's flush value is
 * temporarily forced to 1 while it is pushed and then restored.
 *
 * Returns 0 on success (including when there is nothing to read), -1 if a
 * push or the auxtrace read failed, or the result of writing the
 * finished-round event.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	/* Snapshot to detect below whether this pass wrote anything. */
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	int nr_mmaps;
	struct mmap **maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	nr_mmaps = thread->nr_mmaps;
	maps = overwrite ? thread->overwrite_maps : thread->maps;

	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Save the flush value; restored on every exit path. */
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					/* Store the position reached before bailing out. */
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		/* AUX data is read here only outside snapshot and sample modes. */
		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1610 
1611 static int record__mmap_read_all(struct record *rec, bool synch)
1612 {
1613 	int err;
1614 
1615 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1616 	if (err)
1617 		return err;
1618 
1619 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1620 }
1621 
1622 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1623 					   void *arg __maybe_unused)
1624 {
1625 	struct perf_mmap *map = fda->priv[fd].ptr;
1626 
1627 	if (map)
1628 		perf_mmap__put(map);
1629 }
1630 
/*
 * Thread entry point. @arg is the record_thread describing this thread; it
 * is stored in the file-scope `thread` pointer.
 *
 * The thread acknowledges its start on its ack pipe, then alternates
 * between draining its maps and polling its descriptors until reading
 * fails, no descriptors are left after filtering, or a hang-up on the
 * control descriptor asks it to terminate. It then performs one final
 * synchronous read and acknowledges termination on the ack pipe.
 *
 * Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %s\n",
			   thread->tid, strerror(errno));

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		/* Snapshot of the sample counter, compared again after the read. */
		unsigned long long hits = thread->samples;

		/*
		 * Once terminate is set, one more read pass is done before
		 * leaving the loop.
		 */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* The counter did not move during the read: wait for events. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/* Leave when filtering reports 0 descriptors. */
			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Hang-up on the control descriptor: close it and terminate. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final read with synch set. */
	record__mmap_read_all(thread->rec, true);

	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %s\n",
			   thread->tid, strerror(errno));

	return NULL;
}
1692 
1693 static void record__init_features(struct record *rec)
1694 {
1695 	struct perf_session *session = rec->session;
1696 	int feat;
1697 
1698 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1699 		perf_header__set_feat(&session->header, feat);
1700 
1701 	if (rec->no_buildid)
1702 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1703 
1704 #ifdef HAVE_LIBTRACEEVENT
1705 	if (!have_tracepoints(&rec->evlist->core.entries))
1706 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1707 #endif
1708 
1709 	if (!rec->opts.branch_stack)
1710 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1711 
1712 	if (!rec->opts.full_auxtrace)
1713 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1714 
1715 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1716 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1717 
1718 	if (!rec->opts.use_clockid)
1719 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1720 
1721 	if (!record__threads_enabled(rec))
1722 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1723 
1724 	if (!record__comp_enabled(rec))
1725 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1726 
1727 	perf_header__clear_feat(&session->header, HEADER_STAT);
1728 }
1729 
1730 static void
1731 record__finish_output(struct record *rec)
1732 {
1733 	int i;
1734 	struct perf_data *data = &rec->data;
1735 	int fd = perf_data__fd(data);
1736 
1737 	if (data->is_pipe)
1738 		return;
1739 
1740 	rec->session->header.data_size += rec->bytes_written;
1741 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1742 	if (record__threads_enabled(rec)) {
1743 		for (i = 0; i < data->dir.nr; i++)
1744 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1745 	}
1746 
1747 	if (!rec->no_buildid) {
1748 		process_buildids(rec);
1749 
1750 		if (rec->buildid_all)
1751 			dsos__hit_all(rec->session);
1752 	}
1753 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1754 
1755 	return;
1756 }
1757 
1758 static int record__synthesize_workload(struct record *rec, bool tail)
1759 {
1760 	int err;
1761 	struct perf_thread_map *thread_map;
1762 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1763 
1764 	if (rec->opts.tail_synthesize != tail)
1765 		return 0;
1766 
1767 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1768 	if (thread_map == NULL)
1769 		return -1;
1770 
1771 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1772 						 process_synthesized_event,
1773 						 &rec->session->machines.host,
1774 						 needs_mmap,
1775 						 rec->opts.sample_address);
1776 	perf_thread_map__put(thread_map);
1777 	return err;
1778 }
1779 
1780 static int write_finished_init(struct record *rec, bool tail)
1781 {
1782 	if (rec->opts.tail_synthesize != tail)
1783 		return 0;
1784 
1785 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1786 }
1787 
1788 static int record__synthesize(struct record *rec, bool tail);
1789 
1790 static int
1791 record__switch_output(struct record *rec, bool at_exit)
1792 {
1793 	struct perf_data *data = &rec->data;
1794 	int fd, err;
1795 	char *new_filename;
1796 
1797 	/* Same Size:      "2015122520103046"*/
1798 	char timestamp[] = "InvalidTimestamp";
1799 
1800 	record__aio_mmap_read_sync(rec);
1801 
1802 	write_finished_init(rec, true);
1803 
1804 	record__synthesize(rec, true);
1805 	if (target__none(&rec->opts.target))
1806 		record__synthesize_workload(rec, true);
1807 
1808 	rec->samples = 0;
1809 	record__finish_output(rec);
1810 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1811 	if (err) {
1812 		pr_err("Failed to get current timestamp\n");
1813 		return -EINVAL;
1814 	}
1815 
1816 	fd = perf_data__switch(data, timestamp,
1817 				    rec->session->header.data_offset,
1818 				    at_exit, &new_filename);
1819 	if (fd >= 0 && !at_exit) {
1820 		rec->bytes_written = 0;
1821 		rec->session->header.data_size = 0;
1822 	}
1823 
1824 	if (!quiet)
1825 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1826 			data->path, timestamp);
1827 
1828 	if (rec->switch_output.num_files) {
1829 		int n = rec->switch_output.cur_file + 1;
1830 
1831 		if (n >= rec->switch_output.num_files)
1832 			n = 0;
1833 		rec->switch_output.cur_file = n;
1834 		if (rec->switch_output.filenames[n]) {
1835 			remove(rec->switch_output.filenames[n]);
1836 			zfree(&rec->switch_output.filenames[n]);
1837 		}
1838 		rec->switch_output.filenames[n] = new_filename;
1839 	} else {
1840 		free(new_filename);
1841 	}
1842 
1843 	/* Output tracking events */
1844 	if (!at_exit) {
1845 		record__synthesize(rec, false);
1846 
1847 		/*
1848 		 * In 'perf record --switch-output' without -a,
1849 		 * record__synthesize() in record__switch_output() won't
1850 		 * generate tracking events because there's no thread_map
1851 		 * in evlist. Which causes newly created perf.data doesn't
1852 		 * contain map and comm information.
1853 		 * Create a fake thread_map and directly call
1854 		 * perf_event__synthesize_thread_map() for those events.
1855 		 */
1856 		if (target__none(&rec->opts.target))
1857 			record__synthesize_workload(rec, false);
1858 		write_finished_init(rec, false);
1859 	}
1860 	return fd;
1861 }
1862 
1863 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1864 					struct perf_record_lost_samples *lost,
1865 					int cpu_idx, int thread_idx)
1866 {
1867 	struct perf_counts_values count;
1868 	struct perf_sample_id *sid;
1869 	struct perf_sample sample = {};
1870 	int id_hdr_size;
1871 
1872 	if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1873 		pr_err("read LOST count failed\n");
1874 		return;
1875 	}
1876 
1877 	if (count.lost == 0)
1878 		return;
1879 
1880 	lost->lost = count.lost;
1881 	if (evsel->core.ids) {
1882 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1883 		sample.id = sid->id;
1884 	}
1885 
1886 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1887 						       evsel->core.attr.sample_type, &sample);
1888 	lost->header.size = sizeof(*lost) + id_hdr_size;
1889 	record__write(rec, NULL, lost, lost->header.size);
1890 }
1891 
1892 static void record__read_lost_samples(struct record *rec)
1893 {
1894 	struct perf_session *session = rec->session;
1895 	struct perf_record_lost_samples *lost;
1896 	struct evsel *evsel;
1897 
1898 	/* there was an error during record__open */
1899 	if (session->evlist == NULL)
1900 		return;
1901 
1902 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1903 	if (lost == NULL) {
1904 		pr_debug("Memory allocation failed\n");
1905 		return;
1906 	}
1907 
1908 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1909 
1910 	evlist__for_each_entry(session->evlist, evsel) {
1911 		struct xyarray *xy = evsel->core.sample_id;
1912 
1913 		if (xy == NULL || evsel->core.fd == NULL)
1914 			continue;
1915 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1916 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1917 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1918 			continue;
1919 		}
1920 
1921 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1922 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1923 				__record__read_lost_samples(rec, evsel, lost, x, y);
1924 			}
1925 		}
1926 	}
1927 	free(lost);
1928 
1929 }
1930 
/* Value delivered with the signal; written only by the handler below. */
static volatile sig_atomic_t workload_exec_errno;

/*
 * evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * The handler stores the integer carried in the signal's si_value in
 * workload_exec_errno and sets both done and child_finished.
 *
 * NOTE(review): the handler and variable names suggest the signal reports
 * a failed exec rather than a failed fork - confirm against
 * evlist__prepare_workload().
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
1946 
1947 static void snapshot_sig_handler(int sig);
1948 static void alarm_sig_handler(int sig);
1949 
1950 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1951 {
1952 	if (evlist) {
1953 		if (evlist->mmap && evlist->mmap[0].core.base)
1954 			return evlist->mmap[0].core.base;
1955 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1956 			return evlist->overwrite_mmap[0].core.base;
1957 	}
1958 	return NULL;
1959 }
1960 
1961 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1962 {
1963 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1964 	if (pc)
1965 		return pc;
1966 	return NULL;
1967 }
1968 
/*
 * Synthesize the non-sample events describing the recording: pipe header
 * data, time conversion, id index, auxtrace info, kernel and module mmaps,
 * guest machines, extra attributes, thread and cpu maps, bpf and cgroup
 * events and finally the existing tasks.
 *
 * Only acts when @tail matches rec->opts.tail_synthesize.
 *
 * Returns 0 when skipped or on success, otherwise the error of the failing
 * step. Failures of the kernel/module mmap, bpf and cgroup steps are only
 * warned about.
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int err = 0;
	/* Callback for task synthesis; replaced by the locked variant below. */
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;

		/* A non-negative result is added to the bytes written. */
		rec->bytes_written += err;
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	err = perf_event__synthesize_id_index(tool,
					      process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!evlist__exclude_kernel(rec->evlist)) {
		/* Failures here only produce a one-time warning. */
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0) {
		pr_warning("Couldn't synthesize bpf events.\n");
		err = 0;
	}

	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
						     machine);
		if (err < 0) {
			pr_warning("Couldn't synthesize cgroup events.\n");
			err = 0;
		}
	}

	/*
	 * With more than one synthesis thread, set up synth_lock and switch
	 * to the locked callback for the task synthesis below.
	 */
	if (rec->opts.nr_threads_synthesize > 1) {
		mutex_init(&synth_lock);
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	if (rec->opts.synth & PERF_SYNTH_TASK) {
		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

		err = __machine__synthesize_threads(machine, tool, &opts->target,
						    rec->evlist->core.threads,
						    f, needs_mmap, opts->sample_address,
						    rec->opts.nr_threads_synthesize);
	}

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_singlethreaded();
		mutex_destroy(&synth_lock);
	}

out:
	return err;
}
2090 
2091 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2092 {
2093 	struct record *rec = data;
2094 	pthread_kill(rec->thread_id, SIGUSR2);
2095 	return 0;
2096 }
2097 
2098 static int record__setup_sb_evlist(struct record *rec)
2099 {
2100 	struct record_opts *opts = &rec->opts;
2101 
2102 	if (rec->sb_evlist != NULL) {
2103 		/*
2104 		 * We get here if --switch-output-event populated the
2105 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2106 		 * to the main thread.
2107 		 */
2108 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2109 		rec->thread_id = pthread_self();
2110 	}
2111 #ifdef HAVE_LIBBPF_SUPPORT
2112 	if (!opts->no_bpf_event) {
2113 		if (rec->sb_evlist == NULL) {
2114 			rec->sb_evlist = evlist__new();
2115 
2116 			if (rec->sb_evlist == NULL) {
2117 				pr_err("Couldn't create side band evlist.\n.");
2118 				return -1;
2119 			}
2120 		}
2121 
2122 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2123 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2124 			return -1;
2125 		}
2126 	}
2127 #endif
2128 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2129 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2130 		opts->no_bpf_event = true;
2131 	}
2132 
2133 	return 0;
2134 }
2135 
2136 static int record__init_clock(struct record *rec)
2137 {
2138 	struct perf_session *session = rec->session;
2139 	struct timespec ref_clockid;
2140 	struct timeval ref_tod;
2141 	u64 ref;
2142 
2143 	if (!rec->opts.use_clockid)
2144 		return 0;
2145 
2146 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2147 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2148 
2149 	session->header.env.clock.clockid = rec->opts.clockid;
2150 
2151 	if (gettimeofday(&ref_tod, NULL) != 0) {
2152 		pr_err("gettimeofday failed, cannot set reference time.\n");
2153 		return -1;
2154 	}
2155 
2156 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2157 		pr_err("clock_gettime failed, cannot set reference time.\n");
2158 		return -1;
2159 	}
2160 
2161 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2162 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2163 
2164 	session->header.env.clock.tod_ns = ref;
2165 
2166 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2167 	      (u64) ref_clockid.tv_nsec;
2168 
2169 	session->header.env.clock.clockid_ns = ref;
2170 	return 0;
2171 }
2172 
2173 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2174 {
2175 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2176 		trigger_hit(&auxtrace_snapshot_trigger);
2177 		auxtrace_record__snapshot_started = 1;
2178 		if (auxtrace_record__snapshot_start(rec->itr))
2179 			trigger_error(&auxtrace_snapshot_trigger);
2180 	}
2181 }
2182 
2183 static void record__uniquify_name(struct record *rec)
2184 {
2185 	struct evsel *pos;
2186 	struct evlist *evlist = rec->evlist;
2187 	char *new_name;
2188 	int ret;
2189 
2190 	if (!perf_pmu__has_hybrid())
2191 		return;
2192 
2193 	evlist__for_each_entry(evlist, pos) {
2194 		if (!evsel__is_hybrid(pos))
2195 			continue;
2196 
2197 		if (strchr(pos->name, '/'))
2198 			continue;
2199 
2200 		ret = asprintf(&new_name, "%s/%s/",
2201 			       pos->pmu_name, pos->name);
2202 		if (ret) {
2203 			free(pos->name);
2204 			pos->name = new_name;
2205 		}
2206 	}
2207 }
2208 
2209 static int record__terminate_thread(struct record_thread *thread_data)
2210 {
2211 	int err;
2212 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2213 	pid_t tid = thread_data->tid;
2214 
2215 	close(thread_data->pipes.msg[1]);
2216 	thread_data->pipes.msg[1] = -1;
2217 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2218 	if (err > 0)
2219 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2220 	else
2221 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2222 			   thread->tid, tid);
2223 
2224 	return 0;
2225 }
2226 
2227 static int record__start_threads(struct record *rec)
2228 {
2229 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2230 	struct record_thread *thread_data = rec->thread_data;
2231 	sigset_t full, mask;
2232 	pthread_t handle;
2233 	pthread_attr_t attrs;
2234 
2235 	thread = &thread_data[0];
2236 
2237 	if (!record__threads_enabled(rec))
2238 		return 0;
2239 
2240 	sigfillset(&full);
2241 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2242 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2243 		return -1;
2244 	}
2245 
2246 	pthread_attr_init(&attrs);
2247 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2248 
2249 	for (t = 1; t < nr_threads; t++) {
2250 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2251 
2252 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2253 		pthread_attr_setaffinity_np(&attrs,
2254 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2255 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2256 #endif
2257 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2258 			for (tt = 1; tt < t; tt++)
2259 				record__terminate_thread(&thread_data[t]);
2260 			pr_err("Failed to start threads: %s\n", strerror(errno));
2261 			ret = -1;
2262 			goto out_err;
2263 		}
2264 
2265 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2266 		if (err > 0)
2267 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2268 				  thread_msg_tags[msg]);
2269 		else
2270 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2271 				   thread->tid, rec->thread_data[t].tid);
2272 	}
2273 
2274 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2275 			(cpu_set_t *)thread->mask->affinity.bits);
2276 
2277 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2278 
2279 out_err:
2280 	pthread_attr_destroy(&attrs);
2281 
2282 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2283 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2284 		ret = -1;
2285 	}
2286 
2287 	return ret;
2288 }
2289 
2290 static int record__stop_threads(struct record *rec)
2291 {
2292 	int t;
2293 	struct record_thread *thread_data = rec->thread_data;
2294 
2295 	for (t = 1; t < rec->nr_threads; t++)
2296 		record__terminate_thread(&thread_data[t]);
2297 
2298 	for (t = 0; t < rec->nr_threads; t++) {
2299 		rec->samples += thread_data[t].samples;
2300 		if (!record__threads_enabled(rec))
2301 			continue;
2302 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2303 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2304 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2305 			 thread_data[t].samples, thread_data[t].waking);
2306 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2307 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2308 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2309 		else
2310 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2311 	}
2312 
2313 	return 0;
2314 }
2315 
2316 static unsigned long record__waking(struct record *rec)
2317 {
2318 	int t;
2319 	unsigned long waking = 0;
2320 	struct record_thread *thread_data = rec->thread_data;
2321 
2322 	for (t = 0; t < rec->nr_threads; t++)
2323 		waking += thread_data[t].waking;
2324 
2325 	return waking;
2326 }
2327 
2328 static int __cmd_record(struct record *rec, int argc, const char **argv)
2329 {
2330 	int err;
2331 	int status = 0;
2332 	const bool forks = argc > 0;
2333 	struct perf_tool *tool = &rec->tool;
2334 	struct record_opts *opts = &rec->opts;
2335 	struct perf_data *data = &rec->data;
2336 	struct perf_session *session;
2337 	bool disabled = false, draining = false;
2338 	int fd;
2339 	float ratio = 0;
2340 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2341 
2342 	atexit(record__sig_exit);
2343 	signal(SIGCHLD, sig_handler);
2344 	signal(SIGINT, sig_handler);
2345 	signal(SIGTERM, sig_handler);
2346 	signal(SIGSEGV, sigsegv_handler);
2347 
2348 	if (rec->opts.record_namespaces)
2349 		tool->namespace_events = true;
2350 
2351 	if (rec->opts.record_cgroup) {
2352 #ifdef HAVE_FILE_HANDLE
2353 		tool->cgroup_events = true;
2354 #else
2355 		pr_err("cgroup tracking is not supported\n");
2356 		return -1;
2357 #endif
2358 	}
2359 
2360 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2361 		signal(SIGUSR2, snapshot_sig_handler);
2362 		if (rec->opts.auxtrace_snapshot_mode)
2363 			trigger_on(&auxtrace_snapshot_trigger);
2364 		if (rec->switch_output.enabled)
2365 			trigger_on(&switch_output_trigger);
2366 	} else {
2367 		signal(SIGUSR2, SIG_IGN);
2368 	}
2369 
2370 	session = perf_session__new(data, tool);
2371 	if (IS_ERR(session)) {
2372 		pr_err("Perf session creation failed.\n");
2373 		return PTR_ERR(session);
2374 	}
2375 
2376 	if (record__threads_enabled(rec)) {
2377 		if (perf_data__is_pipe(&rec->data)) {
2378 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2379 			return -1;
2380 		}
2381 		if (rec->opts.full_auxtrace) {
2382 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2383 			return -1;
2384 		}
2385 	}
2386 
2387 	fd = perf_data__fd(data);
2388 	rec->session = session;
2389 
2390 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2391 		pr_err("Compression initialization failed.\n");
2392 		return -1;
2393 	}
2394 #ifdef HAVE_EVENTFD_SUPPORT
2395 	done_fd = eventfd(0, EFD_NONBLOCK);
2396 	if (done_fd < 0) {
2397 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2398 		status = -1;
2399 		goto out_delete_session;
2400 	}
2401 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2402 	if (err < 0) {
2403 		pr_err("Failed to add wakeup eventfd to poll list\n");
2404 		status = err;
2405 		goto out_delete_session;
2406 	}
2407 #endif // HAVE_EVENTFD_SUPPORT
2408 
2409 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2410 	session->header.env.comp_level = rec->opts.comp_level;
2411 
2412 	if (rec->opts.kcore &&
2413 	    !record__kcore_readable(&session->machines.host)) {
2414 		pr_err("ERROR: kcore is not readable.\n");
2415 		return -1;
2416 	}
2417 
2418 	if (record__init_clock(rec))
2419 		return -1;
2420 
2421 	record__init_features(rec);
2422 
2423 	if (forks) {
2424 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2425 					       workload_exec_failed_signal);
2426 		if (err < 0) {
2427 			pr_err("Couldn't run the workload!\n");
2428 			status = err;
2429 			goto out_delete_session;
2430 		}
2431 	}
2432 
2433 	/*
2434 	 * If we have just single event and are sending data
2435 	 * through pipe, we need to force the ids allocation,
2436 	 * because we synthesize event name through the pipe
2437 	 * and need the id for that.
2438 	 */
2439 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2440 		rec->opts.sample_id = true;
2441 
2442 	record__uniquify_name(rec);
2443 
2444 	/* Debug message used by test scripts */
2445 	pr_debug3("perf record opening and mmapping events\n");
2446 	if (record__open(rec) != 0) {
2447 		err = -1;
2448 		goto out_free_threads;
2449 	}
2450 	/* Debug message used by test scripts */
2451 	pr_debug3("perf record done opening and mmapping events\n");
2452 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2453 
2454 	if (rec->opts.kcore) {
2455 		err = record__kcore_copy(&session->machines.host, data);
2456 		if (err) {
2457 			pr_err("ERROR: Failed to copy kcore\n");
2458 			goto out_free_threads;
2459 		}
2460 	}
2461 
2462 	err = bpf__apply_obj_config();
2463 	if (err) {
2464 		char errbuf[BUFSIZ];
2465 
2466 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2467 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2468 			 errbuf);
2469 		goto out_free_threads;
2470 	}
2471 
2472 	/*
2473 	 * Normally perf_session__new would do this, but it doesn't have the
2474 	 * evlist.
2475 	 */
2476 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2477 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2478 		rec->tool.ordered_events = false;
2479 	}
2480 
2481 	if (!rec->evlist->core.nr_groups)
2482 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2483 
2484 	if (data->is_pipe) {
2485 		err = perf_header__write_pipe(fd);
2486 		if (err < 0)
2487 			goto out_free_threads;
2488 	} else {
2489 		err = perf_session__write_header(session, rec->evlist, fd, false);
2490 		if (err < 0)
2491 			goto out_free_threads;
2492 	}
2493 
2494 	err = -1;
2495 	if (!rec->no_buildid
2496 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2497 		pr_err("Couldn't generate buildids. "
2498 		       "Use --no-buildid to profile anyway.\n");
2499 		goto out_free_threads;
2500 	}
2501 
2502 	err = record__setup_sb_evlist(rec);
2503 	if (err)
2504 		goto out_free_threads;
2505 
2506 	err = record__synthesize(rec, false);
2507 	if (err < 0)
2508 		goto out_free_threads;
2509 
2510 	if (rec->realtime_prio) {
2511 		struct sched_param param;
2512 
2513 		param.sched_priority = rec->realtime_prio;
2514 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2515 			pr_err("Could not set realtime priority.\n");
2516 			err = -1;
2517 			goto out_free_threads;
2518 		}
2519 	}
2520 
2521 	if (record__start_threads(rec))
2522 		goto out_free_threads;
2523 
2524 	/*
2525 	 * When perf is starting the traced process, all the events
2526 	 * (apart from group members) have enable_on_exec=1 set,
2527 	 * so don't spoil it by prematurely enabling them.
2528 	 */
2529 	if (!target__none(&opts->target) && !opts->initial_delay)
2530 		evlist__enable(rec->evlist);
2531 
2532 	/*
2533 	 * Let the child rip
2534 	 */
2535 	if (forks) {
2536 		struct machine *machine = &session->machines.host;
2537 		union perf_event *event;
2538 		pid_t tgid;
2539 
2540 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2541 		if (event == NULL) {
2542 			err = -ENOMEM;
2543 			goto out_child;
2544 		}
2545 
2546 		/*
2547 		 * Some H/W events are generated before COMM event
2548 		 * which is emitted during exec(), so perf script
2549 		 * cannot see a correct process name for those events.
2550 		 * Synthesize COMM event to prevent it.
2551 		 */
2552 		tgid = perf_event__synthesize_comm(tool, event,
2553 						   rec->evlist->workload.pid,
2554 						   process_synthesized_event,
2555 						   machine);
2556 		free(event);
2557 
2558 		if (tgid == -1)
2559 			goto out_child;
2560 
2561 		event = malloc(sizeof(event->namespaces) +
2562 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2563 			       machine->id_hdr_size);
2564 		if (event == NULL) {
2565 			err = -ENOMEM;
2566 			goto out_child;
2567 		}
2568 
2569 		/*
2570 		 * Synthesize NAMESPACES event for the command specified.
2571 		 */
2572 		perf_event__synthesize_namespaces(tool, event,
2573 						  rec->evlist->workload.pid,
2574 						  tgid, process_synthesized_event,
2575 						  machine);
2576 		free(event);
2577 
2578 		evlist__start_workload(rec->evlist);
2579 	}
2580 
2581 	if (opts->initial_delay) {
2582 		pr_info(EVLIST_DISABLED_MSG);
2583 		if (opts->initial_delay > 0) {
2584 			usleep(opts->initial_delay * USEC_PER_MSEC);
2585 			evlist__enable(rec->evlist);
2586 			pr_info(EVLIST_ENABLED_MSG);
2587 		}
2588 	}
2589 
2590 	err = event_enable_timer__start(rec->evlist->eet);
2591 	if (err)
2592 		goto out_child;
2593 
2594 	/* Debug message used by test scripts */
2595 	pr_debug3("perf record has started\n");
2596 	fflush(stderr);
2597 
2598 	trigger_ready(&auxtrace_snapshot_trigger);
2599 	trigger_ready(&switch_output_trigger);
2600 	perf_hooks__invoke_record_start();
2601 
2602 	/*
2603 	 * Must write FINISHED_INIT so it will be seen after all other
2604 	 * synthesized user events, but before any regular events.
2605 	 */
2606 	err = write_finished_init(rec, false);
2607 	if (err < 0)
2608 		goto out_child;
2609 
2610 	for (;;) {
2611 		unsigned long long hits = thread->samples;
2612 
2613 		/*
2614 		 * rec->evlist->bkw_mmap_state is possible to be
2615 		 * BKW_MMAP_EMPTY here: when done == true and
2616 		 * hits != rec->samples in previous round.
2617 		 *
2618 		 * evlist__toggle_bkw_mmap ensure we never
2619 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2620 		 */
2621 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2622 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2623 
2624 		if (record__mmap_read_all(rec, false) < 0) {
2625 			trigger_error(&auxtrace_snapshot_trigger);
2626 			trigger_error(&switch_output_trigger);
2627 			err = -1;
2628 			goto out_child;
2629 		}
2630 
2631 		if (auxtrace_record__snapshot_started) {
2632 			auxtrace_record__snapshot_started = 0;
2633 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2634 				record__read_auxtrace_snapshot(rec, false);
2635 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2636 				pr_err("AUX area tracing snapshot failed\n");
2637 				err = -1;
2638 				goto out_child;
2639 			}
2640 		}
2641 
2642 		if (trigger_is_hit(&switch_output_trigger)) {
2643 			/*
2644 			 * If switch_output_trigger is hit, the data in
2645 			 * overwritable ring buffer should have been collected,
2646 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2647 			 *
2648 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2649 			 * record__mmap_read_all() didn't collect data from
2650 			 * overwritable ring buffer. Read again.
2651 			 */
2652 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2653 				continue;
2654 			trigger_ready(&switch_output_trigger);
2655 
2656 			/*
2657 			 * Reenable events in overwrite ring buffer after
2658 			 * record__mmap_read_all(): we should have collected
2659 			 * data from it.
2660 			 */
2661 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2662 
2663 			if (!quiet)
2664 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2665 					record__waking(rec));
2666 			thread->waking = 0;
2667 			fd = record__switch_output(rec, false);
2668 			if (fd < 0) {
2669 				pr_err("Failed to switch to new file\n");
2670 				trigger_error(&switch_output_trigger);
2671 				err = fd;
2672 				goto out_child;
2673 			}
2674 
2675 			/* re-arm the alarm */
2676 			if (rec->switch_output.time)
2677 				alarm(rec->switch_output.time);
2678 		}
2679 
2680 		if (hits == thread->samples) {
2681 			if (done || draining)
2682 				break;
2683 			err = fdarray__poll(&thread->pollfd, -1);
2684 			/*
2685 			 * Propagate error, only if there's any. Ignore positive
2686 			 * number of returned events and interrupt error.
2687 			 */
2688 			if (err > 0 || (err < 0 && errno == EINTR))
2689 				err = 0;
2690 			thread->waking++;
2691 
2692 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2693 					    record__thread_munmap_filtered, NULL) == 0)
2694 				draining = true;
2695 
2696 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2697 			if (err)
2698 				goto out_child;
2699 		}
2700 
2701 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2702 			switch (cmd) {
2703 			case EVLIST_CTL_CMD_SNAPSHOT:
2704 				hit_auxtrace_snapshot_trigger(rec);
2705 				evlist__ctlfd_ack(rec->evlist);
2706 				break;
2707 			case EVLIST_CTL_CMD_STOP:
2708 				done = 1;
2709 				break;
2710 			case EVLIST_CTL_CMD_ACK:
2711 			case EVLIST_CTL_CMD_UNSUPPORTED:
2712 			case EVLIST_CTL_CMD_ENABLE:
2713 			case EVLIST_CTL_CMD_DISABLE:
2714 			case EVLIST_CTL_CMD_EVLIST:
2715 			case EVLIST_CTL_CMD_PING:
2716 			default:
2717 				break;
2718 			}
2719 		}
2720 
2721 		err = event_enable_timer__process(rec->evlist->eet);
2722 		if (err < 0)
2723 			goto out_child;
2724 		if (err) {
2725 			err = 0;
2726 			done = 1;
2727 		}
2728 
2729 		/*
2730 		 * When perf is starting the traced process, at the end events
2731 		 * die with the process and we wait for that. Thus no need to
2732 		 * disable events in this case.
2733 		 */
2734 		if (done && !disabled && !target__none(&opts->target)) {
2735 			trigger_off(&auxtrace_snapshot_trigger);
2736 			evlist__disable(rec->evlist);
2737 			disabled = true;
2738 		}
2739 	}
2740 
2741 	trigger_off(&auxtrace_snapshot_trigger);
2742 	trigger_off(&switch_output_trigger);
2743 
2744 	if (opts->auxtrace_snapshot_on_exit)
2745 		record__auxtrace_snapshot_exit(rec);
2746 
2747 	if (forks && workload_exec_errno) {
2748 		char msg[STRERR_BUFSIZE], strevsels[2048];
2749 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2750 
2751 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2752 
2753 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2754 			strevsels, argv[0], emsg);
2755 		err = -1;
2756 		goto out_child;
2757 	}
2758 
2759 	if (!quiet)
2760 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2761 			record__waking(rec));
2762 
2763 	write_finished_init(rec, true);
2764 
2765 	if (target__none(&rec->opts.target))
2766 		record__synthesize_workload(rec, true);
2767 
2768 out_child:
2769 	record__stop_threads(rec);
2770 	record__mmap_read_all(rec, true);
2771 out_free_threads:
2772 	record__free_thread_data(rec);
2773 	evlist__finalize_ctlfd(rec->evlist);
2774 	record__aio_mmap_read_sync(rec);
2775 
2776 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2777 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2778 		session->header.env.comp_ratio = ratio + 0.5;
2779 	}
2780 
2781 	if (forks) {
2782 		int exit_status;
2783 
2784 		if (!child_finished)
2785 			kill(rec->evlist->workload.pid, SIGTERM);
2786 
2787 		wait(&exit_status);
2788 
2789 		if (err < 0)
2790 			status = err;
2791 		else if (WIFEXITED(exit_status))
2792 			status = WEXITSTATUS(exit_status);
2793 		else if (WIFSIGNALED(exit_status))
2794 			signr = WTERMSIG(exit_status);
2795 	} else
2796 		status = err;
2797 
2798 	if (rec->off_cpu)
2799 		rec->bytes_written += off_cpu_write(rec->session);
2800 
2801 	record__read_lost_samples(rec);
2802 	record__synthesize(rec, true);
2803 	/* this will be recalculated during process_buildids() */
2804 	rec->samples = 0;
2805 
2806 	if (!err) {
2807 		if (!rec->timestamp_filename) {
2808 			record__finish_output(rec);
2809 		} else {
2810 			fd = record__switch_output(rec, true);
2811 			if (fd < 0) {
2812 				status = fd;
2813 				goto out_delete_session;
2814 			}
2815 		}
2816 	}
2817 
2818 	perf_hooks__invoke_record_end();
2819 
2820 	if (!err && !quiet) {
2821 		char samples[128];
2822 		const char *postfix = rec->timestamp_filename ?
2823 					".<timestamp>" : "";
2824 
2825 		if (rec->samples && !rec->opts.full_auxtrace)
2826 			scnprintf(samples, sizeof(samples),
2827 				  " (%" PRIu64 " samples)", rec->samples);
2828 		else
2829 			samples[0] = '\0';
2830 
2831 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2832 			perf_data__size(data) / 1024.0 / 1024.0,
2833 			data->path, postfix, samples);
2834 		if (ratio) {
2835 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2836 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2837 					ratio);
2838 		}
2839 		fprintf(stderr, " ]\n");
2840 	}
2841 
2842 out_delete_session:
2843 #ifdef HAVE_EVENTFD_SUPPORT
2844 	if (done_fd >= 0) {
2845 		fd = done_fd;
2846 		done_fd = -1;
2847 
2848 		close(fd);
2849 	}
2850 #endif
2851 	zstd_fini(&session->zstd_data);
2852 	perf_session__delete(session);
2853 
2854 	if (!opts->no_bpf_event)
2855 		evlist__stop_sb_thread(rec->sb_evlist);
2856 	return status;
2857 }
2858 
2859 static void callchain_debug(struct callchain_param *callchain)
2860 {
2861 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2862 
2863 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2864 
2865 	if (callchain->record_mode == CALLCHAIN_DWARF)
2866 		pr_debug("callchain: stack dump size %d\n",
2867 			 callchain->dump_size);
2868 }
2869 
2870 int record_opts__parse_callchain(struct record_opts *record,
2871 				 struct callchain_param *callchain,
2872 				 const char *arg, bool unset)
2873 {
2874 	int ret;
2875 	callchain->enabled = !unset;
2876 
2877 	/* --no-call-graph */
2878 	if (unset) {
2879 		callchain->record_mode = CALLCHAIN_NONE;
2880 		pr_debug("callchain: disabled\n");
2881 		return 0;
2882 	}
2883 
2884 	ret = parse_callchain_record_opt(arg, callchain);
2885 	if (!ret) {
2886 		/* Enable data address sampling for DWARF unwind. */
2887 		if (callchain->record_mode == CALLCHAIN_DWARF)
2888 			record->sample_address = true;
2889 		callchain_debug(callchain);
2890 	}
2891 
2892 	return ret;
2893 }
2894 
2895 int record_parse_callchain_opt(const struct option *opt,
2896 			       const char *arg,
2897 			       int unset)
2898 {
2899 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2900 }
2901 
2902 int record_callchain_opt(const struct option *opt,
2903 			 const char *arg __maybe_unused,
2904 			 int unset __maybe_unused)
2905 {
2906 	struct callchain_param *callchain = opt->value;
2907 
2908 	callchain->enabled = true;
2909 
2910 	if (callchain->record_mode == CALLCHAIN_NONE)
2911 		callchain->record_mode = CALLCHAIN_FP;
2912 
2913 	callchain_debug(callchain);
2914 	return 0;
2915 }
2916 
2917 static int perf_record_config(const char *var, const char *value, void *cb)
2918 {
2919 	struct record *rec = cb;
2920 
2921 	if (!strcmp(var, "record.build-id")) {
2922 		if (!strcmp(value, "cache"))
2923 			rec->no_buildid_cache = false;
2924 		else if (!strcmp(value, "no-cache"))
2925 			rec->no_buildid_cache = true;
2926 		else if (!strcmp(value, "skip"))
2927 			rec->no_buildid = true;
2928 		else if (!strcmp(value, "mmap"))
2929 			rec->buildid_mmap = true;
2930 		else
2931 			return -1;
2932 		return 0;
2933 	}
2934 	if (!strcmp(var, "record.call-graph")) {
2935 		var = "call-graph.record-mode";
2936 		return perf_default_config(var, value, cb);
2937 	}
2938 #ifdef HAVE_AIO_SUPPORT
2939 	if (!strcmp(var, "record.aio")) {
2940 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2941 		if (!rec->opts.nr_cblocks)
2942 			rec->opts.nr_cblocks = nr_cblocks_default;
2943 	}
2944 #endif
2945 	if (!strcmp(var, "record.debuginfod")) {
2946 		rec->debuginfod.urls = strdup(value);
2947 		if (!rec->debuginfod.urls)
2948 			return -ENOMEM;
2949 		rec->debuginfod.set = true;
2950 	}
2951 
2952 	return 0;
2953 }
2954 
2955 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2956 {
2957 	struct record *rec = (struct record *)opt->value;
2958 
2959 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2960 }
2961 
2962 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2963 {
2964 	struct record_opts *opts = (struct record_opts *)opt->value;
2965 
2966 	if (unset || !str)
2967 		return 0;
2968 
2969 	if (!strcasecmp(str, "node"))
2970 		opts->affinity = PERF_AFFINITY_NODE;
2971 	else if (!strcasecmp(str, "cpu"))
2972 		opts->affinity = PERF_AFFINITY_CPU;
2973 
2974 	return 0;
2975 }
2976 
2977 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2978 {
2979 	mask->nbits = nr_bits;
2980 	mask->bits = bitmap_zalloc(mask->nbits);
2981 	if (!mask->bits)
2982 		return -ENOMEM;
2983 
2984 	return 0;
2985 }
2986 
2987 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2988 {
2989 	bitmap_free(mask->bits);
2990 	mask->nbits = 0;
2991 }
2992 
2993 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2994 {
2995 	int ret;
2996 
2997 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2998 	if (ret) {
2999 		mask->affinity.bits = NULL;
3000 		return ret;
3001 	}
3002 
3003 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3004 	if (ret) {
3005 		record__mmap_cpu_mask_free(&mask->maps);
3006 		mask->maps.bits = NULL;
3007 	}
3008 
3009 	return ret;
3010 }
3011 
3012 static void record__thread_mask_free(struct thread_mask *mask)
3013 {
3014 	record__mmap_cpu_mask_free(&mask->maps);
3015 	record__mmap_cpu_mask_free(&mask->affinity);
3016 }
3017 
3018 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3019 {
3020 	int s;
3021 	struct record_opts *opts = opt->value;
3022 
3023 	if (unset || !str || !strlen(str)) {
3024 		opts->threads_spec = THREAD_SPEC__CPU;
3025 	} else {
3026 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3027 			if (s == THREAD_SPEC__USER) {
3028 				opts->threads_user_spec = strdup(str);
3029 				if (!opts->threads_user_spec)
3030 					return -ENOMEM;
3031 				opts->threads_spec = THREAD_SPEC__USER;
3032 				break;
3033 			}
3034 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3035 				opts->threads_spec = s;
3036 				break;
3037 			}
3038 		}
3039 	}
3040 
3041 	if (opts->threads_spec == THREAD_SPEC__USER)
3042 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3043 	else
3044 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3045 
3046 	return 0;
3047 }
3048 
3049 static int parse_output_max_size(const struct option *opt,
3050 				 const char *str, int unset)
3051 {
3052 	unsigned long *s = (unsigned long *)opt->value;
3053 	static struct parse_tag tags_size[] = {
3054 		{ .tag  = 'B', .mult = 1       },
3055 		{ .tag  = 'K', .mult = 1 << 10 },
3056 		{ .tag  = 'M', .mult = 1 << 20 },
3057 		{ .tag  = 'G', .mult = 1 << 30 },
3058 		{ .tag  = 0 },
3059 	};
3060 	unsigned long val;
3061 
3062 	if (unset) {
3063 		*s = 0;
3064 		return 0;
3065 	}
3066 
3067 	val = parse_tag_value(str, tags_size);
3068 	if (val != (unsigned long) -1) {
3069 		*s = val;
3070 		return 0;
3071 	}
3072 
3073 	return -1;
3074 }
3075 
3076 static int record__parse_mmap_pages(const struct option *opt,
3077 				    const char *str,
3078 				    int unset __maybe_unused)
3079 {
3080 	struct record_opts *opts = opt->value;
3081 	char *s, *p;
3082 	unsigned int mmap_pages;
3083 	int ret;
3084 
3085 	if (!str)
3086 		return -EINVAL;
3087 
3088 	s = strdup(str);
3089 	if (!s)
3090 		return -ENOMEM;
3091 
3092 	p = strchr(s, ',');
3093 	if (p)
3094 		*p = '\0';
3095 
3096 	if (*s) {
3097 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3098 		if (ret)
3099 			goto out_free;
3100 		opts->mmap_pages = mmap_pages;
3101 	}
3102 
3103 	if (!p) {
3104 		ret = 0;
3105 		goto out_free;
3106 	}
3107 
3108 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3109 	if (ret)
3110 		goto out_free;
3111 
3112 	opts->auxtrace_mmap_pages = mmap_pages;
3113 
3114 out_free:
3115 	free(s);
3116 	return ret;
3117 }
3118 
/*
 * Default implementation that does nothing with opts.
 * NOTE(review): declared __weak, so presumably architecture code may supply
 * its own definition - confirm against the arch directories.
 */
void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
{
}
3122 
3123 static int parse_control_option(const struct option *opt,
3124 				const char *str,
3125 				int unset __maybe_unused)
3126 {
3127 	struct record_opts *opts = opt->value;
3128 
3129 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3130 }
3131 
3132 static void switch_output_size_warn(struct record *rec)
3133 {
3134 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3135 	struct switch_output *s = &rec->switch_output;
3136 
3137 	wakeup_size /= 2;
3138 
3139 	if (s->size < wakeup_size) {
3140 		char buf[100];
3141 
3142 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3143 		pr_warning("WARNING: switch-output data size lower than "
3144 			   "wakeup kernel buffer size (%s) "
3145 			   "expect bigger perf.data sizes\n", buf);
3146 	}
3147 }
3148 
3149 static int switch_output_setup(struct record *rec)
3150 {
3151 	struct switch_output *s = &rec->switch_output;
3152 	static struct parse_tag tags_size[] = {
3153 		{ .tag  = 'B', .mult = 1       },
3154 		{ .tag  = 'K', .mult = 1 << 10 },
3155 		{ .tag  = 'M', .mult = 1 << 20 },
3156 		{ .tag  = 'G', .mult = 1 << 30 },
3157 		{ .tag  = 0 },
3158 	};
3159 	static struct parse_tag tags_time[] = {
3160 		{ .tag  = 's', .mult = 1        },
3161 		{ .tag  = 'm', .mult = 60       },
3162 		{ .tag  = 'h', .mult = 60*60    },
3163 		{ .tag  = 'd', .mult = 60*60*24 },
3164 		{ .tag  = 0 },
3165 	};
3166 	unsigned long val;
3167 
3168 	/*
3169 	 * If we're using --switch-output-events, then we imply its
3170 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3171 	 *  thread to its parent.
3172 	 */
3173 	if (rec->switch_output_event_set) {
3174 		if (record__threads_enabled(rec)) {
3175 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3176 			return 0;
3177 		}
3178 		goto do_signal;
3179 	}
3180 
3181 	if (!s->set)
3182 		return 0;
3183 
3184 	if (record__threads_enabled(rec)) {
3185 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3186 		return 0;
3187 	}
3188 
3189 	if (!strcmp(s->str, "signal")) {
3190 do_signal:
3191 		s->signal = true;
3192 		pr_debug("switch-output with SIGUSR2 signal\n");
3193 		goto enabled;
3194 	}
3195 
3196 	val = parse_tag_value(s->str, tags_size);
3197 	if (val != (unsigned long) -1) {
3198 		s->size = val;
3199 		pr_debug("switch-output with %s size threshold\n", s->str);
3200 		goto enabled;
3201 	}
3202 
3203 	val = parse_tag_value(s->str, tags_time);
3204 	if (val != (unsigned long) -1) {
3205 		s->time = val;
3206 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3207 			 s->str, s->time);
3208 		goto enabled;
3209 	}
3210 
3211 	return -1;
3212 
3213 enabled:
3214 	rec->timestamp_filename = true;
3215 	s->enabled              = true;
3216 
3217 	if (s->size && !rec->opts.no_buffering)
3218 		switch_output_size_warn(rec);
3219 
3220 	return 0;
3221 }
3222 
/* NULL-terminated synopsis lines for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias of the usage table; cmd_record() hands it to parse_options(). */
const char * const *record_usage = __record_usage;
3229 
3230 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3231 				  struct perf_sample *sample, struct machine *machine)
3232 {
3233 	/*
3234 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3235 	 * no need to add them twice.
3236 	 */
3237 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3238 		return 0;
3239 	return perf_event__process_mmap(tool, event, sample, machine);
3240 }
3241 
3242 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3243 				   struct perf_sample *sample, struct machine *machine)
3244 {
3245 	/*
3246 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3247 	 * no need to add them twice.
3248 	 */
3249 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3250 		return 0;
3251 
3252 	return perf_event__process_mmap2(tool, event, sample, machine);
3253 }
3254 
3255 static int process_timestamp_boundary(struct perf_tool *tool,
3256 				      union perf_event *event __maybe_unused,
3257 				      struct perf_sample *sample,
3258 				      struct machine *machine __maybe_unused)
3259 {
3260 	struct record *rec = container_of(tool, struct record, tool);
3261 
3262 	set_timestamp_boundary(rec, sample->time);
3263 	return 0;
3264 }
3265 
3266 static int parse_record_synth_option(const struct option *opt,
3267 				     const char *str,
3268 				     int unset __maybe_unused)
3269 {
3270 	struct record_opts *opts = opt->value;
3271 	char *p = strdup(str);
3272 
3273 	if (p == NULL)
3274 		return -1;
3275 
3276 	opts->synth = parse_synth_opt(p);
3277 	free(p);
3278 
3279 	if (opts->synth < 0) {
3280 		pr_err("Invalid synth option: %s\n", str);
3281 		return -1;
3282 	}
3283 	return 0;
3284 }
3285 
/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	/* Defaults in effect before perf_config() and parse_options() run. */
	.opts = {
		.sample_time	     = true,
		/* NOTE(review): the *_MAX values look like "not set by the user" sentinels - confirm. */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
	},
	/* Event handlers of the embedded perf_tool. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		/* mmap/mmap2 go through the wrappers that drop non-user maps. */
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.itrace_start	= process_timestamp_boundary,
		.aux		= process_timestamp_boundary,
		.ordered_events	= true,
	},
};
3326 
/* Help text of the --call-graph option: the generic text plus the default mode. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Set by --dry-run; cmd_record() checks it to stop before starting any recording. */
static bool dry_run;
3331 
3332 /*
3333  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3334  * with it and switch to use the library functions in perf_evlist that came
3335  * from builtin-record.c, i.e. use record_opts,
3336  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3337  * using pipes, etc.
3338  */
3339 static struct option __record_options[] = {
3340 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3341 		     "event selector. use 'perf list' to list available events",
3342 		     parse_events_option),
3343 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3344 		     "event filter", parse_filter),
3345 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3346 			   NULL, "don't record events from perf itself",
3347 			   exclude_perf),
3348 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3349 		    "record events on existing process id"),
3350 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3351 		    "record events on existing thread id"),
3352 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3353 		    "collect data with this RT SCHED_FIFO priority"),
3354 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3355 		    "collect data without buffering"),
3356 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3357 		    "collect raw sample records from all opened counters"),
3358 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3359 			    "system-wide collection from all CPUs"),
3360 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3361 		    "list of cpus to monitor"),
3362 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3363 	OPT_STRING('o', "output", &record.data.path, "file",
3364 		    "output file name"),
3365 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3366 			&record.opts.no_inherit_set,
3367 			"child tasks do not inherit counters"),
3368 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3369 		    "synthesize non-sample events at the end of output"),
3370 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3371 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3372 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3373 		    "Fail if the specified frequency can't be used"),
3374 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3375 		     "profile at this frequency",
3376 		      record__parse_freq),
3377 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3378 		     "number of mmap data pages and AUX area tracing mmap pages",
3379 		     record__parse_mmap_pages),
3380 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3381 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3382 		     record__mmap_flush_parse),
3383 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3384 			   NULL, "enables call-graph recording" ,
3385 			   &record_callchain_opt),
3386 	OPT_CALLBACK(0, "call-graph", &record.opts,
3387 		     "record_mode[,record_size]", record_callchain_help,
3388 		     &record_parse_callchain_opt),
3389 	OPT_INCR('v', "verbose", &verbose,
3390 		    "be more verbose (show counter open errors, etc)"),
3391 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3392 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3393 		    "per thread counts"),
3394 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3395 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3396 		    "Record the sample physical addresses"),
3397 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3398 		    "Record the sampled data address data page size"),
3399 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3400 		    "Record the sampled code address (ip) page size"),
3401 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3402 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3403 		    "Record the sample identifier"),
3404 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3405 			&record.opts.sample_time_set,
3406 			"Record the sample timestamps"),
3407 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3408 			"Record the sample period"),
3409 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3410 		    "don't sample"),
3411 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3412 			&record.no_buildid_cache_set,
3413 			"do not update the buildid cache"),
3414 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3415 			&record.no_buildid_set,
3416 			"do not collect buildids in perf.data"),
3417 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3418 		     "monitor event in cgroup name only",
3419 		     parse_cgroups),
3420 	OPT_CALLBACK('D', "delay", &record, "ms",
3421 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3422 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3423 		     record__parse_event_enable_time),
3424 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3425 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3426 		   "user to profile"),
3427 
3428 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3429 		     "branch any", "sample any taken branches",
3430 		     parse_branch_stack),
3431 
3432 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3433 		     "branch filter mask", "branch stack filter modes",
3434 		     parse_branch_stack),
3435 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3436 		    "sample by weight (on special events only)"),
3437 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3438 		    "sample transaction flags (special events only)"),
3439 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3440 		    "use per-thread mmaps"),
3441 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3442 		    "sample selected machine registers on interrupt,"
3443 		    " use '-I?' to list register names", parse_intr_regs),
3444 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3445 		    "sample selected machine registers on interrupt,"
3446 		    " use '--user-regs=?' to list register names", parse_user_regs),
3447 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3448 		    "Record running/enabled time of read (:S) events"),
3449 	OPT_CALLBACK('k', "clockid", &record.opts,
3450 	"clockid", "clockid to use for events, see clock_gettime()",
3451 	parse_clockid),
3452 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3453 			  "opts", "AUX area tracing Snapshot Mode", ""),
3454 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3455 			  "opts", "sample AUX area", ""),
3456 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3457 			"per thread proc mmap processing timeout in ms"),
3458 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3459 		    "Record namespaces events"),
3460 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3461 		    "Record cgroup events"),
3462 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3463 			&record.opts.record_switch_events_set,
3464 			"Record context switch events"),
3465 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3466 			 "Configure all used events to run in kernel space.",
3467 			 PARSE_OPT_EXCLUSIVE),
3468 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3469 			 "Configure all used events to run in user space.",
3470 			 PARSE_OPT_EXCLUSIVE),
3471 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3472 		    "collect kernel callchains"),
3473 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3474 		    "collect user callchains"),
3475 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3476 		   "clang binary to use for compiling BPF scriptlets"),
3477 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3478 		   "options passed to clang when compiling BPF scriptlets"),
3479 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3480 		   "file", "vmlinux pathname"),
3481 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3482 		    "Record build-id of all DSOs regardless of hits"),
3483 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3484 		    "Record build-id in map events"),
3485 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3486 		    "append timestamp to output filename"),
3487 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3488 		    "Record timestamp boundary (time of first/last samples)"),
3489 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3490 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3491 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3492 			  "signal"),
3493 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3494 			 "switch output event selector. use 'perf list' to list available events",
3495 			 parse_events_option_new_evlist),
3496 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3497 		   "Limit number of switch output generated files"),
3498 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3499 		    "Parse options then exit"),
3500 #ifdef HAVE_AIO_SUPPORT
3501 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3502 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3503 		     record__aio_parse),
3504 #endif
3505 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3506 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3507 		     record__parse_affinity),
3508 #ifdef HAVE_ZSTD_SUPPORT
3509 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3510 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3511 			    record__parse_comp_level),
3512 #endif
3513 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3514 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3515 	OPT_UINTEGER(0, "num-thread-synthesize",
3516 		     &record.opts.nr_threads_synthesize,
3517 		     "number of threads to run for event synthesis"),
3518 #ifdef HAVE_LIBPFM
3519 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3520 		"libpfm4 event selector. use 'perf list' to list available events",
3521 		parse_libpfm_events_option),
3522 #endif
3523 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3524 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3525 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3526 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3527 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3528 		      parse_control_option),
3529 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3530 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3531 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3532 			  &record.debuginfod.set, "debuginfod urls",
3533 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3534 			  "system"),
3535 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3536 			    "write collected trace data into several data files using parallel threads",
3537 			    record__parse_threads),
3538 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3539 	OPT_END()
3540 };
3541 
/* Non-static handle on the option table; cmd_record() passes it to parse_options(). */
struct option *record_options = __record_options;
3543 
3544 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3545 {
3546 	struct perf_cpu cpu;
3547 	int idx;
3548 
3549 	if (cpu_map__is_dummy(cpus))
3550 		return 0;
3551 
3552 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3553 		if (cpu.cpu == -1)
3554 			continue;
3555 		/* Return ENODEV is input cpu is greater than max cpu */
3556 		if ((unsigned long)cpu.cpu > mask->nbits)
3557 			return -ENODEV;
3558 		__set_bit(cpu.cpu, mask->bits);
3559 	}
3560 
3561 	return 0;
3562 }
3563 
3564 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3565 {
3566 	struct perf_cpu_map *cpus;
3567 
3568 	cpus = perf_cpu_map__new(mask_spec);
3569 	if (!cpus)
3570 		return -ENOMEM;
3571 
3572 	bitmap_zero(mask->bits, mask->nbits);
3573 	if (record__mmap_cpu_mask_init(mask, cpus))
3574 		return -ENODEV;
3575 
3576 	perf_cpu_map__put(cpus);
3577 
3578 	return 0;
3579 }
3580 
3581 static void record__free_thread_masks(struct record *rec, int nr_threads)
3582 {
3583 	int t;
3584 
3585 	if (rec->thread_masks)
3586 		for (t = 0; t < nr_threads; t++)
3587 			record__thread_mask_free(&rec->thread_masks[t]);
3588 
3589 	zfree(&rec->thread_masks);
3590 }
3591 
3592 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3593 {
3594 	int t, ret;
3595 
3596 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3597 	if (!rec->thread_masks) {
3598 		pr_err("Failed to allocate thread masks\n");
3599 		return -ENOMEM;
3600 	}
3601 
3602 	for (t = 0; t < nr_threads; t++) {
3603 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3604 		if (ret) {
3605 			pr_err("Failed to allocate thread masks[%d]\n", t);
3606 			goto out_free;
3607 		}
3608 	}
3609 
3610 	return 0;
3611 
3612 out_free:
3613 	record__free_thread_masks(rec, nr_threads);
3614 
3615 	return ret;
3616 }
3617 
3618 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3619 {
3620 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3621 
3622 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3623 	if (ret)
3624 		return ret;
3625 
3626 	rec->nr_threads = nr_cpus;
3627 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3628 
3629 	for (t = 0; t < rec->nr_threads; t++) {
3630 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3631 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3632 		if (verbose > 0) {
3633 			pr_debug("thread_masks[%d]: ", t);
3634 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3635 			pr_debug("thread_masks[%d]: ", t);
3636 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3637 		}
3638 	}
3639 
3640 	return 0;
3641 }
3642 
3643 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3644 					  const char **maps_spec, const char **affinity_spec,
3645 					  u32 nr_spec)
3646 {
3647 	u32 s;
3648 	int ret = 0, t = 0;
3649 	struct mmap_cpu_mask cpus_mask;
3650 	struct thread_mask thread_mask, full_mask, *thread_masks;
3651 
3652 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3653 	if (ret) {
3654 		pr_err("Failed to allocate CPUs mask\n");
3655 		return ret;
3656 	}
3657 
3658 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3659 	if (ret) {
3660 		pr_err("Failed to init cpu mask\n");
3661 		goto out_free_cpu_mask;
3662 	}
3663 
3664 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3665 	if (ret) {
3666 		pr_err("Failed to allocate full mask\n");
3667 		goto out_free_cpu_mask;
3668 	}
3669 
3670 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3671 	if (ret) {
3672 		pr_err("Failed to allocate thread mask\n");
3673 		goto out_free_full_and_cpu_masks;
3674 	}
3675 
3676 	for (s = 0; s < nr_spec; s++) {
3677 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3678 		if (ret) {
3679 			pr_err("Failed to initialize maps thread mask\n");
3680 			goto out_free;
3681 		}
3682 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3683 		if (ret) {
3684 			pr_err("Failed to initialize affinity thread mask\n");
3685 			goto out_free;
3686 		}
3687 
3688 		/* ignore invalid CPUs but do not allow empty masks */
3689 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3690 				cpus_mask.bits, thread_mask.maps.nbits)) {
3691 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3692 			ret = -EINVAL;
3693 			goto out_free;
3694 		}
3695 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3696 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3697 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3698 			ret = -EINVAL;
3699 			goto out_free;
3700 		}
3701 
3702 		/* do not allow intersection with other masks (full_mask) */
3703 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3704 				      thread_mask.maps.nbits)) {
3705 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3706 			ret = -EINVAL;
3707 			goto out_free;
3708 		}
3709 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3710 				      thread_mask.affinity.nbits)) {
3711 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3712 			ret = -EINVAL;
3713 			goto out_free;
3714 		}
3715 
3716 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3717 			  thread_mask.maps.bits, full_mask.maps.nbits);
3718 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3719 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3720 
3721 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3722 		if (!thread_masks) {
3723 			pr_err("Failed to reallocate thread masks\n");
3724 			ret = -ENOMEM;
3725 			goto out_free;
3726 		}
3727 		rec->thread_masks = thread_masks;
3728 		rec->thread_masks[t] = thread_mask;
3729 		if (verbose > 0) {
3730 			pr_debug("thread_masks[%d]: ", t);
3731 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3732 			pr_debug("thread_masks[%d]: ", t);
3733 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3734 		}
3735 		t++;
3736 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3737 		if (ret) {
3738 			pr_err("Failed to allocate thread mask\n");
3739 			goto out_free_full_and_cpu_masks;
3740 		}
3741 	}
3742 	rec->nr_threads = t;
3743 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3744 	if (!rec->nr_threads)
3745 		ret = -EINVAL;
3746 
3747 out_free:
3748 	record__thread_mask_free(&thread_mask);
3749 out_free_full_and_cpu_masks:
3750 	record__thread_mask_free(&full_mask);
3751 out_free_cpu_mask:
3752 	record__mmap_cpu_mask_free(&cpus_mask);
3753 
3754 	return ret;
3755 }
3756 
3757 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3758 {
3759 	int ret;
3760 	struct cpu_topology *topo;
3761 
3762 	topo = cpu_topology__new();
3763 	if (!topo) {
3764 		pr_err("Failed to allocate CPU topology\n");
3765 		return -ENOMEM;
3766 	}
3767 
3768 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3769 					     topo->core_cpus_list, topo->core_cpus_lists);
3770 	cpu_topology__delete(topo);
3771 
3772 	return ret;
3773 }
3774 
3775 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3776 {
3777 	int ret;
3778 	struct cpu_topology *topo;
3779 
3780 	topo = cpu_topology__new();
3781 	if (!topo) {
3782 		pr_err("Failed to allocate CPU topology\n");
3783 		return -ENOMEM;
3784 	}
3785 
3786 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3787 					     topo->package_cpus_list, topo->package_cpus_lists);
3788 	cpu_topology__delete(topo);
3789 
3790 	return ret;
3791 }
3792 
3793 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3794 {
3795 	u32 s;
3796 	int ret;
3797 	const char **spec;
3798 	struct numa_topology *topo;
3799 
3800 	topo = numa_topology__new();
3801 	if (!topo) {
3802 		pr_err("Failed to allocate NUMA topology\n");
3803 		return -ENOMEM;
3804 	}
3805 
3806 	spec = zalloc(topo->nr * sizeof(char *));
3807 	if (!spec) {
3808 		pr_err("Failed to allocate NUMA spec\n");
3809 		ret = -ENOMEM;
3810 		goto out_delete_topo;
3811 	}
3812 	for (s = 0; s < topo->nr; s++)
3813 		spec[s] = topo->nodes[s].cpus;
3814 
3815 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3816 
3817 	zfree(&spec);
3818 
3819 out_delete_topo:
3820 	numa_topology__delete(topo);
3821 
3822 	return ret;
3823 }
3824 
3825 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3826 {
3827 	int t, ret;
3828 	u32 s, nr_spec = 0;
3829 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3830 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3831 
3832 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3833 		spec = strtok_r(user_spec, ":", &spec_ptr);
3834 		if (spec == NULL)
3835 			break;
3836 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3837 		mask = strtok_r(spec, "/", &mask_ptr);
3838 		if (mask == NULL)
3839 			break;
3840 		pr_debug2("  maps mask: %s\n", mask);
3841 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3842 		if (!tmp_spec) {
3843 			pr_err("Failed to reallocate maps spec\n");
3844 			ret = -ENOMEM;
3845 			goto out_free;
3846 		}
3847 		maps_spec = tmp_spec;
3848 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3849 		if (!maps_spec[nr_spec]) {
3850 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3851 			ret = -ENOMEM;
3852 			goto out_free;
3853 		}
3854 		mask = strtok_r(NULL, "/", &mask_ptr);
3855 		if (mask == NULL) {
3856 			pr_err("Invalid thread maps or affinity specs\n");
3857 			ret = -EINVAL;
3858 			goto out_free;
3859 		}
3860 		pr_debug2("  affinity mask: %s\n", mask);
3861 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3862 		if (!tmp_spec) {
3863 			pr_err("Failed to reallocate affinity spec\n");
3864 			ret = -ENOMEM;
3865 			goto out_free;
3866 		}
3867 		affinity_spec = tmp_spec;
3868 		affinity_spec[nr_spec] = strdup(mask);
3869 		if (!affinity_spec[nr_spec]) {
3870 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3871 			ret = -ENOMEM;
3872 			goto out_free;
3873 		}
3874 		dup_mask = NULL;
3875 		nr_spec++;
3876 	}
3877 
3878 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3879 					     (const char **)affinity_spec, nr_spec);
3880 
3881 out_free:
3882 	free(dup_mask);
3883 	for (s = 0; s < nr_spec; s++) {
3884 		if (maps_spec)
3885 			free(maps_spec[s]);
3886 		if (affinity_spec)
3887 			free(affinity_spec[s]);
3888 	}
3889 	free(affinity_spec);
3890 	free(maps_spec);
3891 
3892 	return ret;
3893 }
3894 
3895 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3896 {
3897 	int ret;
3898 
3899 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3900 	if (ret)
3901 		return ret;
3902 
3903 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3904 		return -ENODEV;
3905 
3906 	rec->nr_threads = 1;
3907 
3908 	return 0;
3909 }
3910 
3911 static int record__init_thread_masks(struct record *rec)
3912 {
3913 	int ret = 0;
3914 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3915 
3916 	if (!record__threads_enabled(rec))
3917 		return record__init_thread_default_masks(rec, cpus);
3918 
3919 	if (evlist__per_thread(rec->evlist)) {
3920 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3921 		return -EINVAL;
3922 	}
3923 
3924 	switch (rec->opts.threads_spec) {
3925 	case THREAD_SPEC__CPU:
3926 		ret = record__init_thread_cpu_masks(rec, cpus);
3927 		break;
3928 	case THREAD_SPEC__CORE:
3929 		ret = record__init_thread_core_masks(rec, cpus);
3930 		break;
3931 	case THREAD_SPEC__PACKAGE:
3932 		ret = record__init_thread_package_masks(rec, cpus);
3933 		break;
3934 	case THREAD_SPEC__NUMA:
3935 		ret = record__init_thread_numa_masks(rec, cpus);
3936 		break;
3937 	case THREAD_SPEC__USER:
3938 		ret = record__init_thread_user_masks(rec, cpus);
3939 		break;
3940 	default:
3941 		break;
3942 	}
3943 
3944 	return ret;
3945 }
3946 
3947 int cmd_record(int argc, const char **argv)
3948 {
3949 	int err;
3950 	struct record *rec = &record;
3951 	char errbuf[BUFSIZ];
3952 
3953 	setlocale(LC_ALL, "");
3954 
3955 #ifndef HAVE_LIBBPF_SUPPORT
3956 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3957 	set_nobuild('\0', "clang-path", true);
3958 	set_nobuild('\0', "clang-opt", true);
3959 # undef set_nobuild
3960 #endif
3961 
3962 #ifndef HAVE_BPF_PROLOGUE
3963 # if !defined (HAVE_DWARF_SUPPORT)
3964 #  define REASON  "NO_DWARF=1"
3965 # elif !defined (HAVE_LIBBPF_SUPPORT)
3966 #  define REASON  "NO_LIBBPF=1"
3967 # else
3968 #  define REASON  "this architecture doesn't support BPF prologue"
3969 # endif
3970 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3971 	set_nobuild('\0', "vmlinux", true);
3972 # undef set_nobuild
3973 # undef REASON
3974 #endif
3975 
3976 #ifndef HAVE_BPF_SKEL
3977 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3978 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3979 # undef set_nobuild
3980 #endif
3981 
3982 	rec->opts.affinity = PERF_AFFINITY_SYS;
3983 
3984 	rec->evlist = evlist__new();
3985 	if (rec->evlist == NULL)
3986 		return -ENOMEM;
3987 
3988 	err = perf_config(perf_record_config, rec);
3989 	if (err)
3990 		return err;
3991 
3992 	argc = parse_options(argc, argv, record_options, record_usage,
3993 			    PARSE_OPT_STOP_AT_NON_OPTION);
3994 	if (quiet)
3995 		perf_quiet_option();
3996 
3997 	err = symbol__validate_sym_arguments();
3998 	if (err)
3999 		return err;
4000 
4001 	perf_debuginfod_setup(&record.debuginfod);
4002 
4003 	/* Make system wide (-a) the default target. */
4004 	if (!argc && target__none(&rec->opts.target))
4005 		rec->opts.target.system_wide = true;
4006 
4007 	if (nr_cgroups && !rec->opts.target.system_wide) {
4008 		usage_with_options_msg(record_usage, record_options,
4009 			"cgroup monitoring only available in system-wide mode");
4010 
4011 	}
4012 
4013 	if (rec->buildid_mmap) {
4014 		if (!perf_can_record_build_id()) {
4015 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4016 			err = -EINVAL;
4017 			goto out_opts;
4018 		}
4019 		pr_debug("Enabling build id in mmap2 events.\n");
4020 		/* Enable mmap build id synthesizing. */
4021 		symbol_conf.buildid_mmap2 = true;
4022 		/* Enable perf_event_attr::build_id bit. */
4023 		rec->opts.build_id = true;
4024 		/* Disable build id cache. */
4025 		rec->no_buildid = true;
4026 	}
4027 
4028 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4029 		pr_err("Kernel has no cgroup sampling support.\n");
4030 		err = -EINVAL;
4031 		goto out_opts;
4032 	}
4033 
4034 	if (rec->opts.kcore)
4035 		rec->opts.text_poke = true;
4036 
4037 	if (rec->opts.kcore || record__threads_enabled(rec))
4038 		rec->data.is_dir = true;
4039 
4040 	if (record__threads_enabled(rec)) {
4041 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4042 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4043 			goto out_opts;
4044 		}
4045 		if (record__aio_enabled(rec)) {
4046 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4047 			goto out_opts;
4048 		}
4049 	}
4050 
4051 	if (rec->opts.comp_level != 0) {
4052 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4053 		rec->no_buildid = true;
4054 	}
4055 
4056 	if (rec->opts.record_switch_events &&
4057 	    !perf_can_record_switch_events()) {
4058 		ui__error("kernel does not support recording context switch events\n");
4059 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4060 		err = -EINVAL;
4061 		goto out_opts;
4062 	}
4063 
4064 	if (switch_output_setup(rec)) {
4065 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4066 		err = -EINVAL;
4067 		goto out_opts;
4068 	}
4069 
4070 	if (rec->switch_output.time) {
4071 		signal(SIGALRM, alarm_sig_handler);
4072 		alarm(rec->switch_output.time);
4073 	}
4074 
4075 	if (rec->switch_output.num_files) {
4076 		rec->switch_output.filenames = calloc(sizeof(char *),
4077 						      rec->switch_output.num_files);
4078 		if (!rec->switch_output.filenames) {
4079 			err = -EINVAL;
4080 			goto out_opts;
4081 		}
4082 	}
4083 
4084 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4085 		rec->timestamp_filename = false;
4086 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4087 	}
4088 
4089 	/*
4090 	 * Allow aliases to facilitate the lookup of symbols for address
4091 	 * filters. Refer to auxtrace_parse_filters().
4092 	 */
4093 	symbol_conf.allow_aliases = true;
4094 
4095 	symbol__init(NULL);
4096 
4097 	err = record__auxtrace_init(rec);
4098 	if (err)
4099 		goto out;
4100 
4101 	if (dry_run)
4102 		goto out;
4103 
4104 	err = bpf__setup_stdout(rec->evlist);
4105 	if (err) {
4106 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4107 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
4108 			 errbuf);
4109 		goto out;
4110 	}
4111 
4112 	err = -ENOMEM;
4113 
4114 	if (rec->no_buildid_cache || rec->no_buildid) {
4115 		disable_buildid_cache();
4116 	} else if (rec->switch_output.enabled) {
4117 		/*
4118 		 * In 'perf record --switch-output', disable buildid
4119 		 * generation by default to reduce data file switching
4120 		 * overhead. Still generate buildid if they are required
4121 		 * explicitly using
4122 		 *
4123 		 *  perf record --switch-output --no-no-buildid \
4124 		 *              --no-no-buildid-cache
4125 		 *
4126 		 * Following code equals to:
4127 		 *
4128 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4129 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4130 		 *         disable_buildid_cache();
4131 		 */
4132 		bool disable = true;
4133 
4134 		if (rec->no_buildid_set && !rec->no_buildid)
4135 			disable = false;
4136 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4137 			disable = false;
4138 		if (disable) {
4139 			rec->no_buildid = true;
4140 			rec->no_buildid_cache = true;
4141 			disable_buildid_cache();
4142 		}
4143 	}
4144 
4145 	if (record.opts.overwrite)
4146 		record.opts.tail_synthesize = true;
4147 
4148 	if (rec->evlist->core.nr_entries == 0) {
4149 		if (perf_pmu__has_hybrid()) {
4150 			err = evlist__add_default_hybrid(rec->evlist,
4151 							 !record.opts.no_samples);
4152 		} else {
4153 			err = __evlist__add_default(rec->evlist,
4154 						    !record.opts.no_samples);
4155 		}
4156 
4157 		if (err < 0) {
4158 			pr_err("Not enough memory for event selector list\n");
4159 			goto out;
4160 		}
4161 	}
4162 
4163 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4164 		rec->opts.no_inherit = true;
4165 
4166 	err = target__validate(&rec->opts.target);
4167 	if (err) {
4168 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4169 		ui__warning("%s\n", errbuf);
4170 	}
4171 
4172 	err = target__parse_uid(&rec->opts.target);
4173 	if (err) {
4174 		int saved_errno = errno;
4175 
4176 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4177 		ui__error("%s", errbuf);
4178 
4179 		err = -saved_errno;
4180 		goto out;
4181 	}
4182 
4183 	/* Enable ignoring missing threads when -u/-p option is defined. */
4184 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4185 
4186 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
4187 		pr_err("failed to use cpu list %s\n",
4188 		       rec->opts.target.cpu_list);
4189 		goto out;
4190 	}
4191 
4192 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
4193 
4194 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4195 		arch__add_leaf_frame_record_opts(&rec->opts);
4196 
4197 	err = -ENOMEM;
4198 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4199 		if (rec->opts.target.pid != NULL) {
4200 			pr_err("Couldn't create thread/CPU maps: %s\n",
4201 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4202 			goto out;
4203 		}
4204 		else
4205 			usage_with_options(record_usage, record_options);
4206 	}
4207 
4208 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4209 	if (err)
4210 		goto out;
4211 
4212 	/*
4213 	 * We take all buildids when the file contains
4214 	 * AUX area tracing data because we do not decode the
4215 	 * trace because it would take too long.
4216 	 */
4217 	if (rec->opts.full_auxtrace)
4218 		rec->buildid_all = true;
4219 
4220 	if (rec->opts.text_poke) {
4221 		err = record__config_text_poke(rec->evlist);
4222 		if (err) {
4223 			pr_err("record__config_text_poke failed, error %d\n", err);
4224 			goto out;
4225 		}
4226 	}
4227 
4228 	if (rec->off_cpu) {
4229 		err = record__config_off_cpu(rec);
4230 		if (err) {
4231 			pr_err("record__config_off_cpu failed, error %d\n", err);
4232 			goto out;
4233 		}
4234 	}
4235 
4236 	if (record_opts__config(&rec->opts)) {
4237 		err = -EINVAL;
4238 		goto out;
4239 	}
4240 
4241 	err = record__init_thread_masks(rec);
4242 	if (err) {
4243 		pr_err("Failed to initialize parallel data streaming masks\n");
4244 		goto out;
4245 	}
4246 
4247 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4248 		rec->opts.nr_cblocks = nr_cblocks_max;
4249 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4250 
4251 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4252 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4253 
4254 	if (rec->opts.comp_level > comp_level_max)
4255 		rec->opts.comp_level = comp_level_max;
4256 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4257 
4258 	err = __cmd_record(&record, argc, argv);
4259 out:
4260 	evlist__delete(rec->evlist);
4261 	symbol__exit();
4262 	auxtrace_record__free(rec->itr);
4263 out_opts:
4264 	record__free_thread_masks(rec, rec->nr_threads);
4265 	rec->nr_threads = 0;
4266 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4267 	return err;
4268 }
4269 
4270 static void snapshot_sig_handler(int sig __maybe_unused)
4271 {
4272 	struct record *rec = &record;
4273 
4274 	hit_auxtrace_snapshot_trigger(rec);
4275 
4276 	if (switch_output_signal(rec))
4277 		trigger_hit(&switch_output_trigger);
4278 }
4279 
4280 static void alarm_sig_handler(int sig __maybe_unused)
4281 {
4282 	struct record *rec = &record;
4283 
4284 	if (switch_output_time(rec))
4285 		trigger_hit(&switch_output_trigger);
4286 }
4287