xref: /linux/tools/perf/builtin-record.c (revision ea518afc992032f7570c0a89ac9240b387dc0faf)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
84 struct switch_output {
85 	bool		 enabled;
86 	bool		 signal;
87 	unsigned long	 size;
88 	unsigned long	 time;
89 	const char	*str;
90 	bool		 set;
91 	char		 **filenames;
92 	int		 num_files;
93 	int		 cur_file;
94 };
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124 	THREAD_MSG__UNDEFINED = 0,
125 	THREAD_MSG__READY,
126 	THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130 	"UNDEFINED", "READY"
131 };
132 
133 enum thread_spec {
134 	THREAD_SPEC__UNDEFINED = 0,
135 	THREAD_SPEC__CPU,
136 	THREAD_SPEC__CORE,
137 	THREAD_SPEC__PACKAGE,
138 	THREAD_SPEC__NUMA,
139 	THREAD_SPEC__USER,
140 	THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 	"undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
147 struct pollfd_index_map {
148 	int evlist_pollfd_index;
149 	int thread_pollfd_index;
150 };
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199 	return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
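/*
 * Write @size bytes from @bf to the record output. When @map has its own
 * output file (threaded/directory mode), the data goes there and is
 * accounted in the per-thread and aggregated thread byte counters,
 * otherwise it goes to the session's perf.data file. Also stops the
 * session once rec->output_max_size is exceeded and arms the
 * switch-output trigger when the size threshold is reached.
 */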
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274 			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278 		void *buf, size_t size, off_t off)
279 {
280 	int rc;
281 
282 	cblock->aio_fildes = trace_fd;
283 	cblock->aio_buf    = buf;
284 	cblock->aio_nbytes = size;
285 	cblock->aio_offset = off;
286 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287 
288 	do {
289 		rc = aio_write(cblock);
290 		if (rc == 0) {
291 			break;
292 		} else if (errno != EAGAIN) {
293 			cblock->aio_fildes = -1;
294 			pr_err("failed to queue perf data, error: %m\n");
295 			break;
296 		}
297 	} while (1);
298 
299 	return rc;
300 }
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * The aio write request may need to be restarted with the
335 		 * remainder if the kernel didn't write the whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
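/*
 * Reap completed aio write requests for @md. With @sync_all, keep
 * waiting until every in-flight request has completed and return -1;
 * otherwise return the index of the first free control block so that
 * its aio.data[] buffer can be reused for the next write.
 */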
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * The started aio write is not complete yet,
366 				 * so it has to be waited on before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
383 struct record_aio {
384 	struct record	*rec;
385 	void		*data;
386 	size_t		size;
387 };
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * The map->core.base data pointed to by buf is copied into a free
395 	 * map->aio.data[] buffer to release space in the kernel buffer as fast as
396 	 * possible, by calling perf_mmap__consume() from the perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Copying may be done in two steps when the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In that case we first move
403 	 * the data from map->start up to the upper bound and then the remainder
404 	 * from the beginning of the kernel buffer up to the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 						   mmap__mmap_len(map) - aio->size,
410 						   buf, size);
411 		if (compressed < 0)
412 			return (int)compressed;
413 
414 		size = compressed;
415 	} else {
416 		memcpy(aio->data + aio->size, buf, size);
417 	}
418 
419 	if (!aio->size) {
420 		/*
421 		 * Increment map->refcount to guard the map->aio.data[] buffer
422 		 * from premature deallocation, because the map object can be
423 		 * released before the aio write request started on the
424 		 * map->aio.data[] buffer has completed.
425 		 *
426 		 * perf_mmap__put() is done at record__aio_complete() after the
427 		 * started aio request completes, or at record__aio_push()
428 		 * if the request failed to start.
429 		 */
430 		perf_mmap__get(&map->core);
431 	}
432 
433 	aio->size += size;
434 
435 	return size;
436 }
437 
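/*
 * Push the ready data from @map to the output file using POSIX aio:
 * wait for a free aio buffer, copy (and optionally compress) the data
 * into it via record__aio_pushfn() and queue an asynchronous write at
 * offset *off, advancing the offset on success.
 */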
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440 	int ret, idx;
441 	int trace_fd = rec->session->data->file.fd;
442 	struct record_aio aio = { .rec = rec, .size = 0 };
443 
444 	/*
445 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
446 	 * becomes available after the previous aio write operation.
447 	 */
448 
449 	idx = record__aio_sync(map, false);
450 	aio.data = map->aio.data[idx];
451 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 		return ret;
454 
455 	rec->samples++;
456 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457 	if (!ret) {
458 		*off += aio.size;
459 		rec->bytes_written += aio.size;
460 		if (switch_output_size(rec))
461 			trigger_hit(&switch_output_trigger);
462 	} else {
463 		/*
464 		 * Decrement the map->refcount incremented in record__aio_pushfn()
465 		 * if the record__aio_write() operation failed to start; otherwise
466 		 * map->refcount is decremented in record__aio_complete() after
467 		 * the aio write operation finishes successfully.
468 		 */
469 		perf_mmap__put(&map->core);
470 	}
471 
472 	return ret;
473 }
474 
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477 	return lseek(trace_fd, 0, SEEK_CUR);
478 }
479 
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482 	lseek(trace_fd, pos, SEEK_SET);
483 }
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487 	int i;
488 	struct evlist *evlist = rec->evlist;
489 	struct mmap *maps = evlist->mmap;
490 
491 	if (!record__aio_enabled(rec))
492 		return;
493 
494 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
495 		struct mmap *map = &maps[i];
496 
497 		if (map->core.base)
498 			record__aio_sync(map, true);
499 	}
500 }
501 
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506 			     const char *str,
507 			     int unset)
508 {
509 	struct record_opts *opts = (struct record_opts *)opt->value;
510 
511 	if (unset) {
512 		opts->nr_cblocks = 0;
513 	} else {
514 		if (str)
515 			opts->nr_cblocks = strtol(str, NULL, 0);
516 		if (!opts->nr_cblocks)
517 			opts->nr_cblocks = nr_cblocks_default;
518 	}
519 
520 	return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524 
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526 			    off_t *off __maybe_unused)
527 {
528 	return -1;
529 }
530 
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533 	return -1;
534 }
535 
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539 
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547 	return rec->opts.nr_cblocks > 0;
548 }
549 
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552 				    const char *str,
553 				    int unset)
554 {
555 	int flush_max;
556 	struct record_opts *opts = (struct record_opts *)opt->value;
557 	static struct parse_tag tags[] = {
558 			{ .tag  = 'B', .mult = 1       },
559 			{ .tag  = 'K', .mult = 1 << 10 },
560 			{ .tag  = 'M', .mult = 1 << 20 },
561 			{ .tag  = 'G', .mult = 1 << 30 },
562 			{ .tag  = 0 },
563 	};
564 
565 	if (unset)
566 		return 0;
567 
568 	if (str) {
569 		opts->mmap_flush = parse_tag_value(str, tags);
570 		if (opts->mmap_flush == (int)-1)
571 			opts->mmap_flush = strtol(str, NULL, 0);
572 	}
573 
574 	if (!opts->mmap_flush)
575 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577 	flush_max = evlist__mmap_size(opts->mmap_pages);
578 	flush_max /= 4;
579 	if (opts->mmap_flush > flush_max)
580 		opts->mmap_flush = flush_max;
581 
582 	return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590 	struct record_opts *opts = opt->value;
591 
592 	if (unset) {
593 		opts->comp_level = 0;
594 	} else {
595 		if (str)
596 			opts->comp_level = strtol(str, NULL, 0);
597 		if (!opts->comp_level)
598 			opts->comp_level = comp_level_default;
599 	}
600 
601 	return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608 	return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	struct record *rec = container_of(tool, struct record, tool);
617 	return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623 				     union perf_event *event,
624 				     struct perf_sample *sample __maybe_unused,
625 				     struct machine *machine __maybe_unused)
626 {
627 	int ret;
628 
629 	mutex_lock(&synth_lock);
630 	ret = process_synthesized_event(tool, event, sample, machine);
631 	mutex_unlock(&synth_lock);
632 	return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637 	struct record *rec = to;
638 
639 	if (record__comp_enabled(rec)) {
640 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
641 						   mmap__mmap_len(map), bf, size);
642 
643 		if (compressed < 0)
644 			return (int)compressed;
645 
646 		size = compressed;
647 		bf   = map->data;
648 	}
649 
650 	thread->samples++;
651 	return record__write(rec, map, bf, size);
652 }
653 
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659 
660 static void sig_handler(int sig)
661 {
662 	if (sig == SIGCHLD)
663 		child_finished = 1;
664 	else
665 		signr = sig;
666 
667 	done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 	if (done_fd >= 0) {
670 		u64 tmp = 1;
671 		int orig_errno = errno;
672 
673 		/*
674 		 * It is possible for this signal handler to run after done is
675 		 * checked in the main loop, but before the perf counter fds are
676 		 * polled. If this happens, the poll() will continue to wait
677 		 * even though done is set, and will only break out if either
678 		 * another signal is received, or the counters are ready for
679 		 * read. To ensure the poll() doesn't sleep when done is set,
680 		 * use an eventfd (done_fd) to wake up the poll().
681 		 */
682 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683 			pr_err("failed to signal wakeup fd, error: %m\n");
684 
685 		errno = orig_errno;
686 	}
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
690 static void sigsegv_handler(int sig)
691 {
692 	perf_hooks__recover();
693 	sighandler_dump_stack(sig);
694 }
695 
696 static void record__sig_exit(void)
697 {
698 	if (signr == -1)
699 		return;
700 
701 	signal(signr, SIG_DFL);
702 	raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708 				    struct mmap *map,
709 				    union perf_event *event, void *data1,
710 				    size_t len1, void *data2, size_t len2)
711 {
712 	struct record *rec = container_of(tool, struct record, tool);
713 	struct perf_data *data = &rec->data;
714 	size_t padding;
715 	u8 pad[8] = {0};
716 
717 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718 		off_t file_offset;
719 		int fd = perf_data__fd(data);
720 		int err;
721 
722 		file_offset = lseek(fd, 0, SEEK_CUR);
723 		if (file_offset == -1)
724 			return -1;
725 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 						     event, file_offset);
727 		if (err)
728 			return err;
729 	}
730 
731 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732 	padding = (len1 + len2) & 7;
733 	if (padding)
734 		padding = 8 - padding;
735 
736 	record__write(rec, map, event, event->header.size);
737 	record__write(rec, map, data1, len1);
738 	if (len2)
739 		record__write(rec, map, data2, len2);
740 	record__write(rec, map, &pad, padding);
741 
742 	return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746 				      struct mmap *map)
747 {
748 	int ret;
749 
750 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751 				  record__process_auxtrace);
752 	if (ret < 0)
753 		return ret;
754 
755 	if (ret)
756 		rec->samples++;
757 
758 	return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762 					       struct mmap *map)
763 {
764 	int ret;
765 
766 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767 					   record__process_auxtrace,
768 					   rec->opts.auxtrace_snapshot_size);
769 	if (ret < 0)
770 		return ret;
771 
772 	if (ret)
773 		rec->samples++;
774 
775 	return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780 	int i;
781 	int rc = 0;
782 
783 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784 		struct mmap *map = &rec->evlist->mmap[i];
785 
786 		if (!map->auxtrace_mmap.base)
787 			continue;
788 
789 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790 			rc = -1;
791 			goto out;
792 		}
793 	}
794 out:
795 	return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800 	pr_debug("Recording AUX area tracing snapshot\n");
801 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
802 		trigger_error(&auxtrace_snapshot_trigger);
803 	} else {
804 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805 			trigger_error(&auxtrace_snapshot_trigger);
806 		else
807 			trigger_ready(&auxtrace_snapshot_trigger);
808 	}
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return 0;
815 
816 	if (!auxtrace_record__snapshot_started &&
817 	    auxtrace_record__snapshot_start(rec->itr))
818 		return -1;
819 
820 	record__read_auxtrace_snapshot(rec, true);
821 	if (trigger_is_error(&auxtrace_snapshot_trigger))
822 		return -1;
823 
824 	return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829 	int err;
830 
831 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832 	    && record__threads_enabled(rec)) {
833 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834 		return -EINVAL;
835 	}
836 
837 	if (!rec->itr) {
838 		rec->itr = auxtrace_record__init(rec->evlist, &err);
839 		if (err)
840 			return err;
841 	}
842 
843 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844 					      rec->opts.auxtrace_snapshot_opts);
845 	if (err)
846 		return err;
847 
848 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849 					    rec->opts.auxtrace_sample_opts);
850 	if (err)
851 		return err;
852 
853 	auxtrace_regroup_aux_output(rec->evlist);
854 
855 	return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862 			       struct mmap *map __maybe_unused)
863 {
864 	return 0;
865 }
866 
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869 				    bool on_exit __maybe_unused)
870 {
871 }
872 
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876 	return 0;
877 }
878 
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887 	return 0;
888 }
889 
890 #endif
891 
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If a non-dummy evsel exists, system_wide sideband is needed to
926 	 * help parse sample information.
927 	 * For example, the PERF_RECORD_MMAP event helps parse symbols,
928 	 * and the PERF_RECORD_COMM event helps parse the task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * a tracking event so that we can track PERF_RECORD_MMAP events to cover
948 	 * the delay of waiting or of event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 
953 		/*
954 		 * User space tasks can migrate between CPUs, so when tracing
955 		 * selected CPUs, sideband for all CPUs is still needed.
956 		 */
957 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958 			system_wide = true;
959 
960 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
961 		if (!evsel)
962 			return -ENOMEM;
963 
964 		/*
965 		 * Enable the tracking event when the process is forked for
966 		 * initial_delay, or immediately for system wide.
967 		 */
968 		if (opts->target.initial_delay && !evsel->immediate &&
969 		    !target__has_cpu(&opts->target))
970 			evsel->core.attr.enable_on_exec = 1;
971 		else
972 			evsel->immediate = 1;
973 	}
974 
975 	return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980 	char kcore[PATH_MAX];
981 	int fd;
982 
983 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985 	fd = open(kcore, O_RDONLY);
986 	if (fd < 0)
987 		return false;
988 
989 	close(fd);
990 
991 	return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996 	char from_dir[PATH_MAX];
997 	char kcore_dir[PATH_MAX];
998 	int ret;
999 
1000 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003 	if (ret)
1004 		return ret;
1005 
1006 	return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011 	thread_data->pipes.msg[0] = -1;
1012 	thread_data->pipes.msg[1] = -1;
1013 	thread_data->pipes.ack[0] = -1;
1014 	thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019 	if (pipe(thread_data->pipes.msg))
1020 		return -EINVAL;
1021 
1022 	if (pipe(thread_data->pipes.ack)) {
1023 		close(thread_data->pipes.msg[0]);
1024 		thread_data->pipes.msg[0] = -1;
1025 		close(thread_data->pipes.msg[1]);
1026 		thread_data->pipes.msg[1] = -1;
1027 		return -EINVAL;
1028 	}
1029 
1030 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034 	return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039 	if (thread_data->pipes.msg[0] != -1) {
1040 		close(thread_data->pipes.msg[0]);
1041 		thread_data->pipes.msg[0] = -1;
1042 	}
1043 	if (thread_data->pipes.msg[1] != -1) {
1044 		close(thread_data->pipes.msg[1]);
1045 		thread_data->pipes.msg[1] = -1;
1046 	}
1047 	if (thread_data->pipes.ack[0] != -1) {
1048 		close(thread_data->pipes.ack[0]);
1049 		thread_data->pipes.ack[0] = -1;
1050 	}
1051 	if (thread_data->pipes.ack[1] != -1) {
1052 		close(thread_data->pipes.ack[1]);
1053 		thread_data->pipes.ack[1] = -1;
1054 	}
1055 }
1056 
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
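/*
 * Assign mmaps to @thread_data: in per-thread mode the thread gets all
 * of the evlist's mmaps, otherwise only the mmaps whose CPU is set in
 * the thread's maps mask.
 */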
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065 	struct mmap *mmap = evlist->mmap;
1066 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068 	bool per_thread = evlist__per_thread(evlist);
1069 
1070 	if (per_thread)
1071 		thread_data->nr_mmaps = nr_mmaps;
1072 	else
1073 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074 						      thread_data->mask->maps.nbits);
1075 	if (mmap) {
1076 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077 		if (!thread_data->maps)
1078 			return -ENOMEM;
1079 	}
1080 	if (overwrite_mmap) {
1081 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082 		if (!thread_data->overwrite_maps) {
1083 			zfree(&thread_data->maps);
1084 			return -ENOMEM;
1085 		}
1086 	}
1087 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091 		if (per_thread ||
1092 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093 			if (thread_data->maps) {
1094 				thread_data->maps[tm] = &mmap[m];
1095 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097 			}
1098 			if (thread_data->overwrite_maps) {
1099 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102 			}
1103 			tm++;
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
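/*
 * Build the thread's private pollfd array by duplicating, from the
 * evlist's pollfd array, every entry whose private pointer refers to
 * one of the mmaps assigned to this thread.
 */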
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112 	int f, tm, pos;
1113 	struct mmap *map, *overwrite_map;
1114 
1115 	fdarray__init(&thread_data->pollfd, 64);
1116 
1117 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119 		overwrite_map = thread_data->overwrite_maps ?
1120 				thread_data->overwrite_maps[tm] : NULL;
1121 
1122 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127 							      &evlist->core.pollfd);
1128 				if (pos < 0)
1129 					return pos;
1130 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132 			}
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141 	int t;
1142 	struct record_thread *thread_data = rec->thread_data;
1143 
1144 	if (thread_data == NULL)
1145 		return;
1146 
1147 	for (t = 0; t < rec->nr_threads; t++) {
1148 		record__thread_data_close_pipes(&thread_data[t]);
1149 		zfree(&thread_data[t].maps);
1150 		zfree(&thread_data[t].overwrite_maps);
1151 		fdarray__exit(&thread_data[t].pollfd);
1152 	}
1153 
1154 	zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158 						    int evlist_pollfd_index,
1159 						    int thread_pollfd_index)
1160 {
1161 	size_t x = rec->index_map_cnt;
1162 
1163 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164 		return -ENOMEM;
1165 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167 	rec->index_map_cnt += 1;
1168 	return 0;
1169 }
1170 
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172 						    struct evlist *evlist,
1173 						    struct record_thread *thread_data)
1174 {
1175 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1176 	struct pollfd *t_entries = thread_data->pollfd.entries;
1177 	int err = 0;
1178 	size_t i;
1179 
1180 	for (i = 0; i < rec->index_map_cnt; i++) {
1181 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1182 		int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1186 			pr_err("Thread and evlist pollfd index mismatch\n");
1187 			err = -EINVAL;
1188 			continue;
1189 		}
1190 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1191 	}
1192 	return err;
1193 }
1194 
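/*
 * Duplicate the non-perf-event descriptors from the evlist's pollfd into
 * the main thread's pollfd and record the index mapping so that revents
 * can later be copied back by record__update_evlist_pollfd_from_thread().
 */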
1195 static int record__dup_non_perf_events(struct record *rec,
1196 				       struct evlist *evlist,
1197 				       struct record_thread *thread_data)
1198 {
1199 	struct fdarray *fda = &evlist->core.pollfd;
1200 	int i, ret;
1201 
1202 	for (i = 0; i < fda->nr; i++) {
1203 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204 			continue;
1205 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206 		if (ret < 0) {
1207 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208 			return ret;
1209 		}
1210 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211 			  thread_data, ret, fda->entries[i].fd);
1212 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213 		if (ret < 0) {
1214 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1215 			return ret;
1216 		}
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223 	int t, ret;
1224 	struct record_thread *thread_data;
1225 
1226 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227 	if (!rec->thread_data) {
1228 		pr_err("Failed to allocate thread data\n");
1229 		return -ENOMEM;
1230 	}
1231 	thread_data = rec->thread_data;
1232 
1233 	for (t = 0; t < rec->nr_threads; t++)
1234 		record__thread_data_init_pipes(&thread_data[t]);
1235 
1236 	for (t = 0; t < rec->nr_threads; t++) {
1237 		thread_data[t].rec = rec;
1238 		thread_data[t].mask = &rec->thread_masks[t];
1239 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240 		if (ret) {
1241 			pr_err("Failed to initialize thread[%d] maps\n", t);
1242 			goto out_free;
1243 		}
1244 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245 		if (ret) {
1246 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247 			goto out_free;
1248 		}
1249 		if (t) {
1250 			thread_data[t].tid = -1;
1251 			ret = record__thread_data_open_pipes(&thread_data[t]);
1252 			if (ret) {
1253 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1254 				goto out_free;
1255 			}
1256 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258 			if (ret < 0) {
1259 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260 				goto out_free;
1261 			}
1262 			thread_data[t].ctlfd_pos = ret;
1263 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264 				 thread_data, thread_data[t].ctlfd_pos,
1265 				 thread_data[t].pipes.msg[0]);
1266 		} else {
1267 			thread_data[t].tid = gettid();
1268 
1269 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270 			if (ret < 0)
1271 				goto out_free;
1272 
1273 			thread_data[t].ctlfd_pos = -1; /* Not used */
1274 		}
1275 	}
1276 
1277 	return 0;
1278 
1279 out_free:
1280 	record__free_thread_data(rec);
1281 
1282 	return ret;
1283 }
1284 
1285 static int record__mmap_evlist(struct record *rec,
1286 			       struct evlist *evlist)
1287 {
1288 	int i, ret;
1289 	struct record_opts *opts = &rec->opts;
1290 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291 				  opts->auxtrace_sample_mode;
1292 	char msg[512];
1293 
1294 	if (opts->affinity != PERF_AFFINITY_SYS)
1295 		cpu__setup_cpunode_map();
1296 
1297 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298 				 opts->auxtrace_mmap_pages,
1299 				 auxtrace_overwrite,
1300 				 opts->nr_cblocks, opts->affinity,
1301 				 opts->mmap_flush, opts->comp_level) < 0) {
1302 		if (errno == EPERM) {
1303 			pr_err("Permission error mapping pages.\n"
1304 			       "Consider increasing "
1305 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1307 			       "(current value: %u,%u)\n",
1308 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1309 			return -errno;
1310 		} else {
1311 			pr_err("failed to mmap with %d (%s)\n", errno,
1312 				str_error_r(errno, msg, sizeof(msg)));
1313 			if (errno)
1314 				return -errno;
1315 			else
1316 				return -EINVAL;
1317 		}
1318 	}
1319 
1320 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321 		return -1;
1322 
1323 	ret = record__alloc_thread_data(rec, evlist);
1324 	if (ret)
1325 		return ret;
1326 
1327 	if (record__threads_enabled(rec)) {
1328 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329 		if (ret) {
1330 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331 			return ret;
1332 		}
1333 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334 			if (evlist->mmap)
1335 				evlist->mmap[i].file = &rec->data.dir.files[i];
1336 			if (evlist->overwrite_mmap)
1337 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338 		}
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346 	return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
1349 static int record__open(struct record *rec)
1350 {
1351 	char msg[BUFSIZ];
1352 	struct evsel *pos;
1353 	struct evlist *evlist = rec->evlist;
1354 	struct perf_session *session = rec->session;
1355 	struct record_opts *opts = &rec->opts;
1356 	int rc = 0;
1357 
1358 	evlist__config(evlist, opts, &callchain_param);
1359 
1360 	evlist__for_each_entry(evlist, pos) {
1361 try_again:
1362 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1363 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1364 				if (verbose > 0)
1365 					ui__warning("%s\n", msg);
1366 				goto try_again;
1367 			}
1368 			if ((errno == EINVAL || errno == EBADF) &&
1369 			    pos->core.leader != &pos->core &&
1370 			    pos->weak_group) {
1371 				pos = evlist__reset_weak_group(evlist, pos, true);
1372 				goto try_again;
1373 			}
1374 			rc = -errno;
1375 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1376 			ui__error("%s\n", msg);
1377 			goto out;
1378 		}
1379 
1380 		pos->supported = true;
1381 	}
1382 
1383 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1384 		pr_warning(
1385 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1386 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1387 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1388 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1389 "Samples in kernel modules won't be resolved at all.\n\n"
1390 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1391 "even with a suitable vmlinux or kallsyms file.\n\n");
1392 	}
1393 
1394 	if (evlist__apply_filters(evlist, &pos)) {
1395 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1396 			pos->filter ?: "BPF", evsel__name(pos), errno,
1397 			str_error_r(errno, msg, sizeof(msg)));
1398 		rc = -1;
1399 		goto out;
1400 	}
1401 
1402 	rc = record__mmap(rec);
1403 	if (rc)
1404 		goto out;
1405 
1406 	session->evlist = evlist;
1407 	perf_session__set_id_hdr_size(session);
1408 out:
1409 	return rc;
1410 }
1411 
1412 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1413 {
1414 	if (rec->evlist->first_sample_time == 0)
1415 		rec->evlist->first_sample_time = sample_time;
1416 
1417 	if (sample_time)
1418 		rec->evlist->last_sample_time = sample_time;
1419 }
1420 
1421 static int process_sample_event(struct perf_tool *tool,
1422 				union perf_event *event,
1423 				struct perf_sample *sample,
1424 				struct evsel *evsel,
1425 				struct machine *machine)
1426 {
1427 	struct record *rec = container_of(tool, struct record, tool);
1428 
1429 	set_timestamp_boundary(rec, sample->time);
1430 
1431 	if (rec->buildid_all)
1432 		return 0;
1433 
1434 	rec->samples++;
1435 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1436 }
1437 
1438 static int process_buildids(struct record *rec)
1439 {
1440 	struct perf_session *session = rec->session;
1441 
1442 	if (perf_data__size(&rec->data) == 0)
1443 		return 0;
1444 
1445 	/*
1446 	 * During this process, it'll load the kernel map and replace the
1447 	 * dso->long_name with the real pathname it found.  In this case
1448 	 * we prefer the vmlinux path like
1449 	 *   /lib/modules/3.16.4/build/vmlinux
1450 	 *
1451 	 * rather than the build-id path (in the debug directory):
1452 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1453 	 */
1454 	symbol_conf.ignore_vmlinux_buildid = true;
1455 
1456 	/*
1457 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1458 	 * so there is no need to process samples. But if timestamp_boundary is
1459 	 * enabled, it still needs to walk all samples to get the timestamps of
1460 	 * the first/last samples.
1461 	 */
1462 	if (rec->buildid_all && !rec->timestamp_boundary)
1463 		rec->tool.sample = NULL;
1464 
1465 	return perf_session__process_events(session);
1466 }
1467 
1468 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1469 {
1470 	int err;
1471 	struct perf_tool *tool = data;
1472 	/*
1473 	 * As for the guest kernel, when processing the record & report subcommands,
1474 	 * we arrange the module mmap prior to the guest kernel mmap and trigger
1475 	 * a dso preload because, by default, guest module symbols are loaded
1476 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1477 	 * method avoids missing symbols when the first address is in a
1478 	 * module instead of in the guest kernel.
1479 	 */
1480 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1481 					     machine);
1482 	if (err < 0)
1483 		pr_err("Couldn't record guest kernel [%d]'s reference"
1484 		       " relocation symbol.\n", machine->pid);
1485 
1486 	/*
1487 	 * We use _stext for the guest kernel because the guest kernel's
1488 	 * /proc/kallsyms sometimes has no _text.
1489 	 */
1490 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1491 						 machine);
1492 	if (err < 0)
1493 		pr_err("Couldn't record guest kernel [%d]'s reference"
1494 		       " relocation symbol.\n", machine->pid);
1495 }
1496 
1497 static struct perf_event_header finished_round_event = {
1498 	.size = sizeof(struct perf_event_header),
1499 	.type = PERF_RECORD_FINISHED_ROUND,
1500 };
1501 
1502 static struct perf_event_header finished_init_event = {
1503 	.size = sizeof(struct perf_event_header),
1504 	.type = PERF_RECORD_FINISHED_INIT,
1505 };
1506 
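/*
 * If an affinity mode other than PERF_AFFINITY_SYS was requested and
 * @map's affinity mask differs from the current one, migrate this thread
 * onto the CPUs in @map's affinity mask before reading its data.
 */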
1507 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1508 {
1509 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1510 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1511 			  thread->mask->affinity.nbits)) {
1512 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1513 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1514 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1515 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1516 					(cpu_set_t *)thread->mask->affinity.bits);
1517 		if (verbose == 2) {
1518 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1519 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1520 		}
1521 	}
1522 }
1523 
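/*
 * Callback passed to zstd_compress_stream_to_records(): with a zero
 * increment it initializes a PERF_RECORD_COMPRESSED header and reserves
 * room for it, otherwise it grows the record size by @increment.
 */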
1524 static size_t process_comp_header(void *record, size_t increment)
1525 {
1526 	struct perf_record_compressed *event = record;
1527 	size_t size = sizeof(*event);
1528 
1529 	if (increment) {
1530 		event->header.size += increment;
1531 		return increment;
1532 	}
1533 
1534 	event->header.type = PERF_RECORD_COMPRESSED;
1535 	event->header.size = size;
1536 
1537 	return size;
1538 }
1539 
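/*
 * Compress @src_size bytes from @src into @dst as PERF_RECORD_COMPRESSED
 * records. Per-thread output files use the mmap's own zstd state and
 * byte counters, the serial path uses the session-wide ones.
 */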
1540 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1541 			    void *dst, size_t dst_size, void *src, size_t src_size)
1542 {
1543 	ssize_t compressed;
1544 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1545 	struct zstd_data *zstd_data = &session->zstd_data;
1546 
1547 	if (map && map->file)
1548 		zstd_data = &map->zstd_data;
1549 
1550 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1551 						     max_record_size, process_comp_header);
1552 	if (compressed < 0)
1553 		return compressed;
1554 
1555 	if (map && map->file) {
1556 		thread->bytes_transferred += src_size;
1557 		thread->bytes_compressed  += compressed;
1558 	} else {
1559 		session->bytes_transferred += src_size;
1560 		session->bytes_compressed  += compressed;
1561 	}
1562 
1563 	return compressed;
1564 }
1565 
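/*
 * Push the data accumulated in this thread's mmaps (overwrite or regular,
 * depending on @overwrite) to the output, via aio when enabled. With
 * @synch, each map's flush threshold is temporarily lowered to 1 so that
 * all remaining data is drained.
 */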
1566 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1567 				    bool overwrite, bool synch)
1568 {
1569 	u64 bytes_written = rec->bytes_written;
1570 	int i;
1571 	int rc = 0;
1572 	int nr_mmaps;
1573 	struct mmap **maps;
1574 	int trace_fd = rec->data.file.fd;
1575 	off_t off = 0;
1576 
1577 	if (!evlist)
1578 		return 0;
1579 
1580 	nr_mmaps = thread->nr_mmaps;
1581 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1582 
1583 	if (!maps)
1584 		return 0;
1585 
1586 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1587 		return 0;
1588 
1589 	if (record__aio_enabled(rec))
1590 		off = record__aio_get_pos(trace_fd);
1591 
1592 	for (i = 0; i < nr_mmaps; i++) {
1593 		u64 flush = 0;
1594 		struct mmap *map = maps[i];
1595 
1596 		if (map->core.base) {
1597 			record__adjust_affinity(rec, map);
1598 			if (synch) {
1599 				flush = map->core.flush;
1600 				map->core.flush = 1;
1601 			}
1602 			if (!record__aio_enabled(rec)) {
1603 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1604 					if (synch)
1605 						map->core.flush = flush;
1606 					rc = -1;
1607 					goto out;
1608 				}
1609 			} else {
1610 				if (record__aio_push(rec, map, &off) < 0) {
1611 					record__aio_set_pos(trace_fd, off);
1612 					if (synch)
1613 						map->core.flush = flush;
1614 					rc = -1;
1615 					goto out;
1616 				}
1617 			}
1618 			if (synch)
1619 				map->core.flush = flush;
1620 		}
1621 
1622 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1623 		    !rec->opts.auxtrace_sample_mode &&
1624 		    record__auxtrace_mmap_read(rec, map) != 0) {
1625 			rc = -1;
1626 			goto out;
1627 		}
1628 	}
1629 
1630 	if (record__aio_enabled(rec))
1631 		record__aio_set_pos(trace_fd, off);
1632 
1633 	/*
1634 	 * Mark the round finished in case we wrote
1635 	 * at least one event.
1636 	 *
1637 	 * No need for round events in directory mode,
1638 	 * because per-cpu maps and files have their data
1639 	 * sorted by the kernel.
1640 	 */
1641 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1642 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1643 
1644 	if (overwrite)
1645 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1646 out:
1647 	return rc;
1648 }
1649 
1650 static int record__mmap_read_all(struct record *rec, bool synch)
1651 {
1652 	int err;
1653 
1654 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1655 	if (err)
1656 		return err;
1657 
1658 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1659 }
1660 
1661 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1662 					   void *arg __maybe_unused)
1663 {
1664 	struct perf_mmap *map = fda->priv[fd].ptr;
1665 
1666 	if (map)
1667 		perf_mmap__put(map);
1668 }
1669 
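/*
 * Body of a reader thread in parallel (--threads) mode: drain the
 * assigned mmaps, poll when no new samples arrived, and terminate once
 * the main thread closes the message pipe (POLLHUP on the control
 * entry). Start-up and termination are acknowledged over the ack pipe.
 */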
1670 static void *record__thread(void *arg)
1671 {
1672 	enum thread_msg msg = THREAD_MSG__READY;
1673 	bool terminate = false;
1674 	struct fdarray *pollfd;
1675 	int err, ctlfd_pos;
1676 
1677 	thread = arg;
1678 	thread->tid = gettid();
1679 
1680 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681 	if (err == -1)
1682 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1683 			   thread->tid, strerror(errno));
1684 
1685 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1686 
1687 	pollfd = &thread->pollfd;
1688 	ctlfd_pos = thread->ctlfd_pos;
1689 
1690 	for (;;) {
1691 		unsigned long long hits = thread->samples;
1692 
1693 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1694 			break;
1695 
1696 		if (hits == thread->samples) {
1697 
1698 			err = fdarray__poll(pollfd, -1);
1699 			/*
1700 			 * Propagate an error only if there is one. Ignore a positive
1701 			 * number of returned events and interrupt (EINTR) errors.
1702 			 */
1703 			if (err > 0 || (err < 0 && errno == EINTR))
1704 				err = 0;
1705 			thread->waking++;
1706 
1707 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1708 					    record__thread_munmap_filtered, NULL) == 0)
1709 				break;
1710 		}
1711 
1712 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1713 			terminate = true;
1714 			close(thread->pipes.msg[0]);
1715 			thread->pipes.msg[0] = -1;
1716 			pollfd->entries[ctlfd_pos].fd = -1;
1717 			pollfd->entries[ctlfd_pos].events = 0;
1718 		}
1719 
1720 		pollfd->entries[ctlfd_pos].revents = 0;
1721 	}
1722 	record__mmap_read_all(thread->rec, true);
1723 
1724 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1725 	if (err == -1)
1726 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1727 			   thread->tid, strerror(errno));
1728 
1729 	return NULL;
1730 }
1731 
1732 static void record__init_features(struct record *rec)
1733 {
1734 	struct perf_session *session = rec->session;
1735 	int feat;
1736 
1737 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1738 		perf_header__set_feat(&session->header, feat);
1739 
1740 	if (rec->no_buildid)
1741 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1742 
1743 #ifdef HAVE_LIBTRACEEVENT
1744 	if (!have_tracepoints(&rec->evlist->core.entries))
1745 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1746 #endif
1747 
1748 	if (!rec->opts.branch_stack)
1749 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1750 
1751 	if (!rec->opts.full_auxtrace)
1752 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1753 
1754 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1755 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1756 
1757 	if (!rec->opts.use_clockid)
1758 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1759 
1760 	if (!record__threads_enabled(rec))
1761 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1762 
1763 	if (!record__comp_enabled(rec))
1764 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1765 
1766 	perf_header__clear_feat(&session->header, HEADER_STAT);
1767 }
1768 
1769 static void
1770 record__finish_output(struct record *rec)
1771 {
1772 	int i;
1773 	struct perf_data *data = &rec->data;
1774 	int fd = perf_data__fd(data);
1775 
1776 	if (data->is_pipe) {
1777 		/* Just to display approx. size */
1778 		data->file.size = rec->bytes_written;
1779 		return;
1780 	}
1781 
1782 	rec->session->header.data_size += rec->bytes_written;
1783 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1784 	if (record__threads_enabled(rec)) {
1785 		for (i = 0; i < data->dir.nr; i++)
1786 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1787 	}
1788 
1789 	if (!rec->no_buildid) {
1790 		process_buildids(rec);
1791 
1792 		if (rec->buildid_all)
1793 			dsos__hit_all(rec->session);
1794 	}
1795 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1796 
1797 	return;
1798 }
1799 
1800 static int record__synthesize_workload(struct record *rec, bool tail)
1801 {
1802 	int err;
1803 	struct perf_thread_map *thread_map;
1804 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1805 
1806 	if (rec->opts.tail_synthesize != tail)
1807 		return 0;
1808 
1809 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1810 	if (thread_map == NULL)
1811 		return -1;
1812 
1813 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1814 						 process_synthesized_event,
1815 						 &rec->session->machines.host,
1816 						 needs_mmap,
1817 						 rec->opts.sample_address);
1818 	perf_thread_map__put(thread_map);
1819 	return err;
1820 }
1821 
1822 static int write_finished_init(struct record *rec, bool tail)
1823 {
1824 	if (rec->opts.tail_synthesize != tail)
1825 		return 0;
1826 
1827 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1828 }
1829 
1830 static int record__synthesize(struct record *rec, bool tail);
1831 
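/*
 * Rotate the output: finalize the current perf.data and switch to a new
 * timestamped file. Unless called at exit, counters are reset and the
 * tracking events are synthesized again into the new file; when a
 * maximum number of switch-output files is configured, the oldest
 * rotated file is removed.
 */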
1832 static int
1833 record__switch_output(struct record *rec, bool at_exit)
1834 {
1835 	struct perf_data *data = &rec->data;
1836 	char *new_filename = NULL;
1837 	int fd, err;
1838 
1839 	/* Same Size:      "2015122520103046"*/
1840 	char timestamp[] = "InvalidTimestamp";
1841 
1842 	record__aio_mmap_read_sync(rec);
1843 
1844 	write_finished_init(rec, true);
1845 
1846 	record__synthesize(rec, true);
1847 	if (target__none(&rec->opts.target))
1848 		record__synthesize_workload(rec, true);
1849 
1850 	rec->samples = 0;
1851 	record__finish_output(rec);
1852 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1853 	if (err) {
1854 		pr_err("Failed to get current timestamp\n");
1855 		return -EINVAL;
1856 	}
1857 
1858 	fd = perf_data__switch(data, timestamp,
1859 			       rec->session->header.data_offset,
1860 			       at_exit, &new_filename);
1861 	if (fd >= 0 && !at_exit) {
1862 		rec->bytes_written = 0;
1863 		rec->session->header.data_size = 0;
1864 	}
1865 
1866 	if (!quiet) {
1867 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1868 			data->path, timestamp);
1869 	}
1870 
1871 	if (rec->switch_output.num_files) {
1872 		int n = rec->switch_output.cur_file + 1;
1873 
1874 		if (n >= rec->switch_output.num_files)
1875 			n = 0;
1876 		rec->switch_output.cur_file = n;
1877 		if (rec->switch_output.filenames[n]) {
1878 			remove(rec->switch_output.filenames[n]);
1879 			zfree(&rec->switch_output.filenames[n]);
1880 		}
1881 		rec->switch_output.filenames[n] = new_filename;
1882 	} else {
1883 		free(new_filename);
1884 	}
1885 
1886 	/* Output tracking events */
1887 	if (!at_exit) {
1888 		record__synthesize(rec, false);
1889 
1890 		/*
1891 		 * In 'perf record --switch-output' without -a,
1892 		 * record__synthesize() in record__switch_output() won't
1893 		 * generate tracking events because there's no thread_map
1894 		 * in the evlist, so the newly created perf.data wouldn't
1895 		 * contain mmap and comm information.
1896 		 * Create a fake thread_map and directly call
1897 		 * perf_event__synthesize_thread_map() for those events.
1898 		 */
1899 		if (target__none(&rec->opts.target))
1900 			record__synthesize_workload(rec, false);
1901 		write_finished_init(rec, false);
1902 	}
1903 	return fd;
1904 }
1905 
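/*
 * Emit a synthetic PERF_RECORD_LOST_SAMPLES event reporting @lost_count
 * lost samples for @evsel, appending an id sample so the record can be
 * attributed to the right cpu/thread stream.
 */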
1906 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1907 					struct perf_record_lost_samples *lost,
1908 					int cpu_idx, int thread_idx, u64 lost_count,
1909 					u16 misc_flag)
1910 {
1911 	struct perf_sample_id *sid;
1912 	struct perf_sample sample = {};
1913 	int id_hdr_size;
1914 
1915 	lost->lost = lost_count;
1916 	if (evsel->core.ids) {
1917 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1918 		sample.id = sid->id;
1919 	}
1920 
1921 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1922 						       evsel->core.attr.sample_type, &sample);
1923 	lost->header.size = sizeof(*lost) + id_hdr_size;
1924 	lost->header.misc = misc_flag;
1925 	record__write(rec, NULL, lost, lost->header.size);
1926 }
1927 
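/*
 * Called at the end of the record session: read the lost-sample counts of
 * every counter (from the kernel via perf_evsel__read() and from BPF filters
 * via perf_bpf_filter__lost_count()) and append PERF_RECORD_LOST_SAMPLES
 * records describing them to the output.
 */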
1928 static void record__read_lost_samples(struct record *rec)
1929 {
1930 	struct perf_session *session = rec->session;
1931 	struct perf_record_lost_samples *lost = NULL;
1932 	struct evsel *evsel;
1933 
1934 	/* there was an error during record__open */
1935 	if (session->evlist == NULL)
1936 		return;
1937 
1938 	evlist__for_each_entry(session->evlist, evsel) {
1939 		struct xyarray *xy = evsel->core.sample_id;
1940 		u64 lost_count;
1941 
1942 		if (xy == NULL || evsel->core.fd == NULL)
1943 			continue;
1944 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1945 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1946 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1947 			continue;
1948 		}
1949 
1950 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1951 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1952 				struct perf_counts_values count;
1953 
1954 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1955 					pr_debug("read LOST count failed\n");
1956 					goto out;
1957 				}
1958 
1959 				if (count.lost) {
1960 					if (!lost) {
1961 						lost = zalloc(sizeof(*lost) +
1962 							      session->machines.host.id_hdr_size);
1963 						if (!lost) {
1964 							pr_debug("Memory allocation failed\n");
1965 							return;
1966 						}
1967 						lost->header.type = PERF_RECORD_LOST_SAMPLES;
1968 					}
1969 					__record__save_lost_samples(rec, evsel, lost,
1970 								    x, y, count.lost, 0);
1971 				}
1972 			}
1973 		}
1974 
1975 		lost_count = perf_bpf_filter__lost_count(evsel);
1976 		if (lost_count) {
1977 			if (!lost) {
1978 				lost = zalloc(sizeof(*lost) +
1979 					      session->machines.host.id_hdr_size);
1980 				if (!lost) {
1981 					pr_debug("Memory allocation failed\n");
1982 					return;
1983 				}
1984 				lost->header.type = PERF_RECORD_LOST_SAMPLES;
1985 			}
1986 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1987 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1988 		}
1989 	}
1990 out:
1991 	free(lost);
1992 }
1993 
1994 static volatile sig_atomic_t workload_exec_errno;
1995 
1996 /*
1997  * evlist__prepare_workload() will send a SIGUSR1 if exec'ing
1998  * the workload fails, since we asked for that by setting its
1999  * want_signal parameter to true.
2000  */
2001 static void workload_exec_failed_signal(int signo __maybe_unused,
2002 					siginfo_t *info,
2003 					void *ucontext __maybe_unused)
2004 {
2005 	workload_exec_errno = info->si_value.sival_int;
2006 	done = 1;
2007 	child_finished = 1;
2008 }
2009 
2010 static void snapshot_sig_handler(int sig);
2011 static void alarm_sig_handler(int sig);
2012 
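/*
 * Pick the control page (struct perf_event_mmap_page) of any mmapped event;
 * it is only used as the source of the time conversion parameters synthesized
 * in record__synthesize() below.
 */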
2013 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2014 {
2015 	if (evlist) {
2016 		if (evlist->mmap && evlist->mmap[0].core.base)
2017 			return evlist->mmap[0].core.base;
2018 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2019 			return evlist->overwrite_mmap[0].core.base;
2020 	}
2021 	return NULL;
2022 }
2023 
2024 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2025 {
2026 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2027 	if (pc)
2028 		return pc;
2029 	return NULL;
2030 }
2031 
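/*
 * Synthesize the non-sample events describing the system at record time:
 * time conversion data, id index, auxtrace info, kernel and module mmaps,
 * extra attributes, thread and cpu maps, BPF and cgroup info, and the
 * existing threads of the target.  Runs before the workload starts and,
 * with --tail-synthesize, once more at the end.
 */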
2032 static int record__synthesize(struct record *rec, bool tail)
2033 {
2034 	struct perf_session *session = rec->session;
2035 	struct machine *machine = &session->machines.host;
2036 	struct perf_data *data = &rec->data;
2037 	struct record_opts *opts = &rec->opts;
2038 	struct perf_tool *tool = &rec->tool;
2039 	int err = 0;
2040 	event_op f = process_synthesized_event;
2041 
2042 	if (rec->opts.tail_synthesize != tail)
2043 		return 0;
2044 
2045 	if (data->is_pipe) {
2046 		err = perf_event__synthesize_for_pipe(tool, session, data,
2047 						      process_synthesized_event);
2048 		if (err < 0)
2049 			goto out;
2050 
2051 		rec->bytes_written += err;
2052 	}
2053 
2054 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2055 					  process_synthesized_event, machine);
2056 	if (err)
2057 		goto out;
2058 
2059 	/* Synthesize id_index before auxtrace_info */
2060 	err = perf_event__synthesize_id_index(tool,
2061 					      process_synthesized_event,
2062 					      session->evlist, machine);
2063 	if (err)
2064 		goto out;
2065 
2066 	if (rec->opts.full_auxtrace) {
2067 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2068 					session, process_synthesized_event);
2069 		if (err)
2070 			goto out;
2071 	}
2072 
2073 	if (!evlist__exclude_kernel(rec->evlist)) {
2074 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2075 							 machine);
2076 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2077 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2078 				   "Check /proc/kallsyms permission or run as root.\n");
2079 
2080 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2081 						     machine);
2082 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2083 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2084 				   "Check /proc/modules permission or run as root.\n");
2085 	}
2086 
2087 	if (perf_guest) {
2088 		machines__process_guests(&session->machines,
2089 					 perf_event__synthesize_guest_os, tool);
2090 	}
2091 
2092 	err = perf_event__synthesize_extra_attr(&rec->tool,
2093 						rec->evlist,
2094 						process_synthesized_event,
2095 						data->is_pipe);
2096 	if (err)
2097 		goto out;
2098 
2099 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2100 						 process_synthesized_event,
2101 						NULL);
2102 	if (err < 0) {
2103 		pr_err("Couldn't synthesize thread map.\n");
2104 		return err;
2105 	}
2106 
2107 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2108 					     process_synthesized_event, NULL);
2109 	if (err < 0) {
2110 		pr_err("Couldn't synthesize cpu map.\n");
2111 		return err;
2112 	}
2113 
2114 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2115 						machine, opts);
2116 	if (err < 0) {
2117 		pr_warning("Couldn't synthesize bpf events.\n");
2118 		err = 0;
2119 	}
2120 
2121 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2122 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2123 						     machine);
2124 		if (err < 0) {
2125 			pr_warning("Couldn't synthesize cgroup events.\n");
2126 			err = 0;
2127 		}
2128 	}
2129 
2130 	if (rec->opts.nr_threads_synthesize > 1) {
2131 		mutex_init(&synth_lock);
2132 		perf_set_multithreaded();
2133 		f = process_locked_synthesized_event;
2134 	}
2135 
2136 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2137 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2138 
2139 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2140 						    rec->evlist->core.threads,
2141 						    f, needs_mmap, opts->sample_address,
2142 						    rec->opts.nr_threads_synthesize);
2143 	}
2144 
2145 	if (rec->opts.nr_threads_synthesize > 1) {
2146 		perf_set_singlethreaded();
2147 		mutex_destroy(&synth_lock);
2148 	}
2149 
2150 out:
2151 	return err;
2152 }
2153 
2154 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2155 {
2156 	struct record *rec = data;
2157 	pthread_kill(rec->thread_id, SIGUSR2);
2158 	return 0;
2159 }
2160 
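/*
 * Prepare the side-band evlist: hook up --switch-output-event so it sends
 * SIGUSR2 to the main thread, add the PERF_RECORD_BPF_EVENT side-band event
 * (unless --no-bpf-event) used to keep BPF programs annotatable, and start
 * the side-band thread.
 */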
2161 static int record__setup_sb_evlist(struct record *rec)
2162 {
2163 	struct record_opts *opts = &rec->opts;
2164 
2165 	if (rec->sb_evlist != NULL) {
2166 		/*
2167 		 * We get here if --switch-output-event populated the
2168 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2169 		 * to the main thread.
2170 		 */
2171 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2172 		rec->thread_id = pthread_self();
2173 	}
2174 #ifdef HAVE_LIBBPF_SUPPORT
2175 	if (!opts->no_bpf_event) {
2176 		if (rec->sb_evlist == NULL) {
2177 			rec->sb_evlist = evlist__new();
2178 
2179 			if (rec->sb_evlist == NULL) {
2180 				pr_err("Couldn't create side band evlist.\n");
2181 				return -1;
2182 			}
2183 		}
2184 
2185 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2186 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2187 			return -1;
2188 		}
2189 	}
2190 #endif
2191 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2192 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2193 		opts->no_bpf_event = true;
2194 	}
2195 
2196 	return 0;
2197 }
2198 
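/*
 * With --clockid/-k, store the selected clock id plus a pair of reference
 * timestamps (wall clock from gettimeofday() and the same instant from
 * clock_gettime(clockid)) in the header, so sample times can later be
 * related to wall-clock time.
 */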
2199 static int record__init_clock(struct record *rec)
2200 {
2201 	struct perf_session *session = rec->session;
2202 	struct timespec ref_clockid;
2203 	struct timeval ref_tod;
2204 	u64 ref;
2205 
2206 	if (!rec->opts.use_clockid)
2207 		return 0;
2208 
2209 	if (rec->opts.clockid_res_ns)
2210 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2211 
2212 	session->header.env.clock.clockid = rec->opts.clockid;
2213 
2214 	if (gettimeofday(&ref_tod, NULL) != 0) {
2215 		pr_err("gettimeofday failed, cannot set reference time.\n");
2216 		return -1;
2217 	}
2218 
2219 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2220 		pr_err("clock_gettime failed, cannot set reference time.\n");
2221 		return -1;
2222 	}
2223 
2224 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2225 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2226 
2227 	session->header.env.clock.tod_ns = ref;
2228 
2229 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2230 	      (u64) ref_clockid.tv_nsec;
2231 
2232 	session->header.env.clock.clockid_ns = ref;
2233 	return 0;
2234 }
2235 
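/*
 * Arm an AUX area snapshot if the trigger is ready: flag it so the main loop
 * will read the snapshot, and mark the trigger as errored if starting the
 * snapshot fails.
 */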
2236 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2237 {
2238 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2239 		trigger_hit(&auxtrace_snapshot_trigger);
2240 		auxtrace_record__snapshot_started = 1;
2241 		if (auxtrace_record__snapshot_start(rec->itr))
2242 			trigger_error(&auxtrace_snapshot_trigger);
2243 	}
2244 }
2245 
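/*
 * Ask a worker thread to stop by closing the write end of its message pipe,
 * then wait for the acknowledgement it writes back on its ack pipe.
 */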
2246 static int record__terminate_thread(struct record_thread *thread_data)
2247 {
2248 	int err;
2249 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2250 	pid_t tid = thread_data->tid;
2251 
2252 	close(thread_data->pipes.msg[1]);
2253 	thread_data->pipes.msg[1] = -1;
2254 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2255 	if (err > 0)
2256 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2257 	else
2258 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2259 			   thread->tid, tid);
2260 
2261 	return 0;
2262 }
2263 
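/*
 * Start the reader threads used for parallel (--threads) trace streaming.
 * Signals are blocked around pthread_create() so only the main thread handles
 * them, every thread is pinned to its affinity mask, and the main thread
 * waits for a READY message from each worker before creating the next one.
 */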
2264 static int record__start_threads(struct record *rec)
2265 {
2266 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2267 	struct record_thread *thread_data = rec->thread_data;
2268 	sigset_t full, mask;
2269 	pthread_t handle;
2270 	pthread_attr_t attrs;
2271 
2272 	thread = &thread_data[0];
2273 
2274 	if (!record__threads_enabled(rec))
2275 		return 0;
2276 
2277 	sigfillset(&full);
2278 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2279 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2280 		return -1;
2281 	}
2282 
2283 	pthread_attr_init(&attrs);
2284 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2285 
2286 	for (t = 1; t < nr_threads; t++) {
2287 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2288 
2289 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2290 		pthread_attr_setaffinity_np(&attrs,
2291 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2292 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2293 #endif
2294 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2295 			for (tt = 1; tt < t; tt++)
2296 				record__terminate_thread(&thread_data[tt]);
2297 			pr_err("Failed to start threads: %s\n", strerror(errno));
2298 			ret = -1;
2299 			goto out_err;
2300 		}
2301 
2302 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2303 		if (err > 0)
2304 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2305 				  thread_msg_tags[msg]);
2306 		else
2307 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2308 				   thread->tid, rec->thread_data[t].tid);
2309 	}
2310 
2311 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2312 			(cpu_set_t *)thread->mask->affinity.bits);
2313 
2314 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2315 
2316 out_err:
2317 	pthread_attr_destroy(&attrs);
2318 
2319 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2320 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2321 		ret = -1;
2322 	}
2323 
2324 	return ret;
2325 }
2326 
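/*
 * Tell all worker threads to finish and fold their per-thread sample, wakeup
 * and byte counters into the session-wide totals used for the final summary.
 */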
2327 static int record__stop_threads(struct record *rec)
2328 {
2329 	int t;
2330 	struct record_thread *thread_data = rec->thread_data;
2331 
2332 	for (t = 1; t < rec->nr_threads; t++)
2333 		record__terminate_thread(&thread_data[t]);
2334 
2335 	for (t = 0; t < rec->nr_threads; t++) {
2336 		rec->samples += thread_data[t].samples;
2337 		if (!record__threads_enabled(rec))
2338 			continue;
2339 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2340 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2341 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2342 			 thread_data[t].samples, thread_data[t].waking);
2343 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2344 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2345 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2346 		else
2347 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2348 	}
2349 
2350 	return 0;
2351 }
2352 
2353 static unsigned long record__waking(struct record *rec)
2354 {
2355 	int t;
2356 	unsigned long waking = 0;
2357 	struct record_thread *thread_data = rec->thread_data;
2358 
2359 	for (t = 0; t < rec->nr_threads; t++)
2360 		waking += thread_data[t].waking;
2361 
2362 	return waking;
2363 }
2364 
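/*
 * The body of 'perf record': install signal handlers, create the session and
 * write the file header, synthesize the initial metadata, fork and enable the
 * workload if one was given, then poll and drain the mmap buffers until told
 * to stop, finishing with lost-sample accounting, tail synthesis and the
 * summary line.
 */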
2365 static int __cmd_record(struct record *rec, int argc, const char **argv)
2366 {
2367 	int err;
2368 	int status = 0;
2369 	const bool forks = argc > 0;
2370 	struct perf_tool *tool = &rec->tool;
2371 	struct record_opts *opts = &rec->opts;
2372 	struct perf_data *data = &rec->data;
2373 	struct perf_session *session;
2374 	bool disabled = false, draining = false;
2375 	int fd;
2376 	float ratio = 0;
2377 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2378 
2379 	atexit(record__sig_exit);
2380 	signal(SIGCHLD, sig_handler);
2381 	signal(SIGINT, sig_handler);
2382 	signal(SIGTERM, sig_handler);
2383 	signal(SIGSEGV, sigsegv_handler);
2384 
2385 	if (rec->opts.record_namespaces)
2386 		tool->namespace_events = true;
2387 
2388 	if (rec->opts.record_cgroup) {
2389 #ifdef HAVE_FILE_HANDLE
2390 		tool->cgroup_events = true;
2391 #else
2392 		pr_err("cgroup tracking is not supported\n");
2393 		return -1;
2394 #endif
2395 	}
2396 
2397 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2398 		signal(SIGUSR2, snapshot_sig_handler);
2399 		if (rec->opts.auxtrace_snapshot_mode)
2400 			trigger_on(&auxtrace_snapshot_trigger);
2401 		if (rec->switch_output.enabled)
2402 			trigger_on(&switch_output_trigger);
2403 	} else {
2404 		signal(SIGUSR2, SIG_IGN);
2405 	}
2406 
2407 	session = perf_session__new(data, tool);
2408 	if (IS_ERR(session)) {
2409 		pr_err("Perf session creation failed.\n");
2410 		return PTR_ERR(session);
2411 	}
2412 
2413 	if (record__threads_enabled(rec)) {
2414 		if (perf_data__is_pipe(&rec->data)) {
2415 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2416 			return -1;
2417 		}
2418 		if (rec->opts.full_auxtrace) {
2419 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2420 			return -1;
2421 		}
2422 	}
2423 
2424 	fd = perf_data__fd(data);
2425 	rec->session = session;
2426 
2427 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2428 		pr_err("Compression initialization failed.\n");
2429 		return -1;
2430 	}
2431 #ifdef HAVE_EVENTFD_SUPPORT
2432 	done_fd = eventfd(0, EFD_NONBLOCK);
2433 	if (done_fd < 0) {
2434 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2435 		status = -1;
2436 		goto out_delete_session;
2437 	}
2438 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2439 	if (err < 0) {
2440 		pr_err("Failed to add wakeup eventfd to poll list\n");
2441 		status = err;
2442 		goto out_delete_session;
2443 	}
2444 #endif // HAVE_EVENTFD_SUPPORT
2445 
2446 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2447 	session->header.env.comp_level = rec->opts.comp_level;
2448 
2449 	if (rec->opts.kcore &&
2450 	    !record__kcore_readable(&session->machines.host)) {
2451 		pr_err("ERROR: kcore is not readable.\n");
2452 		return -1;
2453 	}
2454 
2455 	if (record__init_clock(rec))
2456 		return -1;
2457 
2458 	record__init_features(rec);
2459 
2460 	if (forks) {
2461 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2462 					       workload_exec_failed_signal);
2463 		if (err < 0) {
2464 			pr_err("Couldn't run the workload!\n");
2465 			status = err;
2466 			goto out_delete_session;
2467 		}
2468 	}
2469 
2470 	/*
2471 	 * If we have just a single event and are sending data
2472 	 * through a pipe, we need to force the ids allocation,
2473 	 * because we synthesize the event name through the pipe
2474 	 * and need the id for that.
2475 	 */
2476 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2477 		rec->opts.sample_id = true;
2478 
2479 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2480 		rec->timestamp_filename = false;
2481 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2482 	}
2483 
2484 	evlist__uniquify_name(rec->evlist);
2485 
2486 	/* Debug message used by test scripts */
2487 	pr_debug3("perf record opening and mmapping events\n");
2488 	if (record__open(rec) != 0) {
2489 		err = -1;
2490 		goto out_free_threads;
2491 	}
2492 	/* Debug message used by test scripts */
2493 	pr_debug3("perf record done opening and mmapping events\n");
2494 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2495 
2496 	if (rec->opts.kcore) {
2497 		err = record__kcore_copy(&session->machines.host, data);
2498 		if (err) {
2499 			pr_err("ERROR: Failed to copy kcore\n");
2500 			goto out_free_threads;
2501 		}
2502 	}
2503 
2504 	/*
2505 	 * Normally perf_session__new would do this, but it doesn't have the
2506 	 * evlist.
2507 	 */
2508 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2509 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2510 		rec->tool.ordered_events = false;
2511 	}
2512 
2513 	if (evlist__nr_groups(rec->evlist) == 0)
2514 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2515 
2516 	if (data->is_pipe) {
2517 		err = perf_header__write_pipe(fd);
2518 		if (err < 0)
2519 			goto out_free_threads;
2520 	} else {
2521 		err = perf_session__write_header(session, rec->evlist, fd, false);
2522 		if (err < 0)
2523 			goto out_free_threads;
2524 	}
2525 
2526 	err = -1;
2527 	if (!rec->no_buildid
2528 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2529 		pr_err("Couldn't generate buildids. "
2530 		       "Use --no-buildid to profile anyway.\n");
2531 		goto out_free_threads;
2532 	}
2533 
2534 	err = record__setup_sb_evlist(rec);
2535 	if (err)
2536 		goto out_free_threads;
2537 
2538 	err = record__synthesize(rec, false);
2539 	if (err < 0)
2540 		goto out_free_threads;
2541 
2542 	if (rec->realtime_prio) {
2543 		struct sched_param param;
2544 
2545 		param.sched_priority = rec->realtime_prio;
2546 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2547 			pr_err("Could not set realtime priority.\n");
2548 			err = -1;
2549 			goto out_free_threads;
2550 		}
2551 	}
2552 
2553 	if (record__start_threads(rec))
2554 		goto out_free_threads;
2555 
2556 	/*
2557 	 * When perf is starting the traced process, all the events
2558 	 * (apart from group members) have enable_on_exec=1 set,
2559 	 * so don't spoil it by prematurely enabling them.
2560 	 */
2561 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2562 		evlist__enable(rec->evlist);
2563 
2564 	/*
2565 	 * Let the child rip
2566 	 */
2567 	if (forks) {
2568 		struct machine *machine = &session->machines.host;
2569 		union perf_event *event;
2570 		pid_t tgid;
2571 
2572 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2573 		if (event == NULL) {
2574 			err = -ENOMEM;
2575 			goto out_child;
2576 		}
2577 
2578 		/*
2579 		 * Some H/W events are generated before the COMM event,
2580 		 * which is emitted during exec(), so perf script
2581 		 * cannot see a correct process name for those events.
2582 		 * Synthesize a COMM event up front to prevent that.
2583 		 */
2584 		tgid = perf_event__synthesize_comm(tool, event,
2585 						   rec->evlist->workload.pid,
2586 						   process_synthesized_event,
2587 						   machine);
2588 		free(event);
2589 
2590 		if (tgid == -1)
2591 			goto out_child;
2592 
2593 		event = malloc(sizeof(event->namespaces) +
2594 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2595 			       machine->id_hdr_size);
2596 		if (event == NULL) {
2597 			err = -ENOMEM;
2598 			goto out_child;
2599 		}
2600 
2601 		/*
2602 		 * Synthesize NAMESPACES event for the command specified.
2603 		 */
2604 		perf_event__synthesize_namespaces(tool, event,
2605 						  rec->evlist->workload.pid,
2606 						  tgid, process_synthesized_event,
2607 						  machine);
2608 		free(event);
2609 
2610 		evlist__start_workload(rec->evlist);
2611 	}
2612 
2613 	if (opts->target.initial_delay) {
2614 		pr_info(EVLIST_DISABLED_MSG);
2615 		if (opts->target.initial_delay > 0) {
2616 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2617 			evlist__enable(rec->evlist);
2618 			pr_info(EVLIST_ENABLED_MSG);
2619 		}
2620 	}
2621 
2622 	err = event_enable_timer__start(rec->evlist->eet);
2623 	if (err)
2624 		goto out_child;
2625 
2626 	/* Debug message used by test scripts */
2627 	pr_debug3("perf record has started\n");
2628 	fflush(stderr);
2629 
2630 	trigger_ready(&auxtrace_snapshot_trigger);
2631 	trigger_ready(&switch_output_trigger);
2632 	perf_hooks__invoke_record_start();
2633 
2634 	/*
2635 	 * Must write FINISHED_INIT so it will be seen after all other
2636 	 * synthesized user events, but before any regular events.
2637 	 */
2638 	err = write_finished_init(rec, false);
2639 	if (err < 0)
2640 		goto out_child;
2641 
2642 	for (;;) {
2643 		unsigned long long hits = thread->samples;
2644 
2645 		/*
2646 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2647 		 * here: when done == true and hits != rec->samples
2648 		 * in the previous round.
2649 		 *
2650 		 * evlist__toggle_bkw_mmap() ensures we never
2651 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2652 		 */
2653 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2654 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2655 
2656 		if (record__mmap_read_all(rec, false) < 0) {
2657 			trigger_error(&auxtrace_snapshot_trigger);
2658 			trigger_error(&switch_output_trigger);
2659 			err = -1;
2660 			goto out_child;
2661 		}
2662 
2663 		if (auxtrace_record__snapshot_started) {
2664 			auxtrace_record__snapshot_started = 0;
2665 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2666 				record__read_auxtrace_snapshot(rec, false);
2667 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2668 				pr_err("AUX area tracing snapshot failed\n");
2669 				err = -1;
2670 				goto out_child;
2671 			}
2672 		}
2673 
2674 		if (trigger_is_hit(&switch_output_trigger)) {
2675 			/*
2676 			 * If switch_output_trigger is hit, the data in
2677 			 * overwritable ring buffer should have been collected,
2678 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2679 			 *
2680 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2681 			 * record__mmap_read_all() didn't collect data from the
2682 			 * overwritable ring buffer. Read again.
2683 			 */
2684 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2685 				continue;
2686 			trigger_ready(&switch_output_trigger);
2687 
2688 			/*
2689 			 * Reenable events in overwrite ring buffer after
2690 			 * record__mmap_read_all(): we should have collected
2691 			 * data from it.
2692 			 */
2693 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2694 
2695 			if (!quiet)
2696 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2697 					record__waking(rec));
2698 			thread->waking = 0;
2699 			fd = record__switch_output(rec, false);
2700 			if (fd < 0) {
2701 				pr_err("Failed to switch to new file\n");
2702 				trigger_error(&switch_output_trigger);
2703 				err = fd;
2704 				goto out_child;
2705 			}
2706 
2707 			/* re-arm the alarm */
2708 			if (rec->switch_output.time)
2709 				alarm(rec->switch_output.time);
2710 		}
2711 
2712 		if (hits == thread->samples) {
2713 			if (done || draining)
2714 				break;
2715 			err = fdarray__poll(&thread->pollfd, -1);
2716 			/*
2717 			 * Propagate the error only if there is one. Ignore a positive
2718 			 * number of returned events and interrupt errors.
2719 			 */
2720 			if (err > 0 || (err < 0 && errno == EINTR))
2721 				err = 0;
2722 			thread->waking++;
2723 
2724 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2725 					    record__thread_munmap_filtered, NULL) == 0)
2726 				draining = true;
2727 
2728 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2729 			if (err)
2730 				goto out_child;
2731 		}
2732 
2733 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2734 			switch (cmd) {
2735 			case EVLIST_CTL_CMD_SNAPSHOT:
2736 				hit_auxtrace_snapshot_trigger(rec);
2737 				evlist__ctlfd_ack(rec->evlist);
2738 				break;
2739 			case EVLIST_CTL_CMD_STOP:
2740 				done = 1;
2741 				break;
2742 			case EVLIST_CTL_CMD_ACK:
2743 			case EVLIST_CTL_CMD_UNSUPPORTED:
2744 			case EVLIST_CTL_CMD_ENABLE:
2745 			case EVLIST_CTL_CMD_DISABLE:
2746 			case EVLIST_CTL_CMD_EVLIST:
2747 			case EVLIST_CTL_CMD_PING:
2748 			default:
2749 				break;
2750 			}
2751 		}
2752 
2753 		err = event_enable_timer__process(rec->evlist->eet);
2754 		if (err < 0)
2755 			goto out_child;
2756 		if (err) {
2757 			err = 0;
2758 			done = 1;
2759 		}
2760 
2761 		/*
2762 		 * When perf is starting the traced process, the events die
2763 		 * with the process at the end and we wait for that, so there
2764 		 * is no need to disable them in this case.
2765 		 */
2766 		if (done && !disabled && !target__none(&opts->target)) {
2767 			trigger_off(&auxtrace_snapshot_trigger);
2768 			evlist__disable(rec->evlist);
2769 			disabled = true;
2770 		}
2771 	}
2772 
2773 	trigger_off(&auxtrace_snapshot_trigger);
2774 	trigger_off(&switch_output_trigger);
2775 
2776 	if (opts->auxtrace_snapshot_on_exit)
2777 		record__auxtrace_snapshot_exit(rec);
2778 
2779 	if (forks && workload_exec_errno) {
2780 		char msg[STRERR_BUFSIZE], strevsels[2048];
2781 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2782 
2783 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2784 
2785 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2786 			strevsels, argv[0], emsg);
2787 		err = -1;
2788 		goto out_child;
2789 	}
2790 
2791 	if (!quiet)
2792 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2793 			record__waking(rec));
2794 
2795 	write_finished_init(rec, true);
2796 
2797 	if (target__none(&rec->opts.target))
2798 		record__synthesize_workload(rec, true);
2799 
2800 out_child:
2801 	record__stop_threads(rec);
2802 	record__mmap_read_all(rec, true);
2803 out_free_threads:
2804 	record__free_thread_data(rec);
2805 	evlist__finalize_ctlfd(rec->evlist);
2806 	record__aio_mmap_read_sync(rec);
2807 
2808 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2809 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2810 		session->header.env.comp_ratio = ratio + 0.5;
2811 	}
2812 
2813 	if (forks) {
2814 		int exit_status;
2815 
2816 		if (!child_finished)
2817 			kill(rec->evlist->workload.pid, SIGTERM);
2818 
2819 		wait(&exit_status);
2820 
2821 		if (err < 0)
2822 			status = err;
2823 		else if (WIFEXITED(exit_status))
2824 			status = WEXITSTATUS(exit_status);
2825 		else if (WIFSIGNALED(exit_status))
2826 			signr = WTERMSIG(exit_status);
2827 	} else
2828 		status = err;
2829 
2830 	if (rec->off_cpu)
2831 		rec->bytes_written += off_cpu_write(rec->session);
2832 
2833 	record__read_lost_samples(rec);
2834 	record__synthesize(rec, true);
2835 	/* this will be recalculated during process_buildids() */
2836 	rec->samples = 0;
2837 
2838 	if (!err) {
2839 		if (!rec->timestamp_filename) {
2840 			record__finish_output(rec);
2841 		} else {
2842 			fd = record__switch_output(rec, true);
2843 			if (fd < 0) {
2844 				status = fd;
2845 				goto out_delete_session;
2846 			}
2847 		}
2848 	}
2849 
2850 	perf_hooks__invoke_record_end();
2851 
2852 	if (!err && !quiet) {
2853 		char samples[128];
2854 		const char *postfix = rec->timestamp_filename ?
2855 					".<timestamp>" : "";
2856 
2857 		if (rec->samples && !rec->opts.full_auxtrace)
2858 			scnprintf(samples, sizeof(samples),
2859 				  " (%" PRIu64 " samples)", rec->samples);
2860 		else
2861 			samples[0] = '\0';
2862 
2863 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2864 			perf_data__size(data) / 1024.0 / 1024.0,
2865 			data->path, postfix, samples);
2866 		if (ratio) {
2867 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2868 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2869 					ratio);
2870 		}
2871 		fprintf(stderr, " ]\n");
2872 	}
2873 
2874 out_delete_session:
2875 #ifdef HAVE_EVENTFD_SUPPORT
2876 	if (done_fd >= 0) {
2877 		fd = done_fd;
2878 		done_fd = -1;
2879 
2880 		close(fd);
2881 	}
2882 #endif
2883 	zstd_fini(&session->zstd_data);
2884 	perf_session__delete(session);
2885 
2886 	if (!opts->no_bpf_event)
2887 		evlist__stop_sb_thread(rec->sb_evlist);
2888 	return status;
2889 }
2890 
2891 static void callchain_debug(struct callchain_param *callchain)
2892 {
2893 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2894 
2895 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2896 
2897 	if (callchain->record_mode == CALLCHAIN_DWARF)
2898 		pr_debug("callchain: stack dump size %d\n",
2899 			 callchain->dump_size);
2900 }
2901 
2902 int record_opts__parse_callchain(struct record_opts *record,
2903 				 struct callchain_param *callchain,
2904 				 const char *arg, bool unset)
2905 {
2906 	int ret;
2907 	callchain->enabled = !unset;
2908 
2909 	/* --no-call-graph */
2910 	if (unset) {
2911 		callchain->record_mode = CALLCHAIN_NONE;
2912 		pr_debug("callchain: disabled\n");
2913 		return 0;
2914 	}
2915 
2916 	ret = parse_callchain_record_opt(arg, callchain);
2917 	if (!ret) {
2918 		/* Enable data address sampling for DWARF unwind. */
2919 		if (callchain->record_mode == CALLCHAIN_DWARF)
2920 			record->sample_address = true;
2921 		callchain_debug(callchain);
2922 	}
2923 
2924 	return ret;
2925 }
2926 
2927 int record_parse_callchain_opt(const struct option *opt,
2928 			       const char *arg,
2929 			       int unset)
2930 {
2931 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2932 }
2933 
2934 int record_callchain_opt(const struct option *opt,
2935 			 const char *arg __maybe_unused,
2936 			 int unset __maybe_unused)
2937 {
2938 	struct callchain_param *callchain = opt->value;
2939 
2940 	callchain->enabled = true;
2941 
2942 	if (callchain->record_mode == CALLCHAIN_NONE)
2943 		callchain->record_mode = CALLCHAIN_FP;
2944 
2945 	callchain_debug(callchain);
2946 	return 0;
2947 }
2948 
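/*
 * Handle perf config keys specific to record: record.build-id
 * (cache/no-cache/skip/mmap), record.aio, record.debuginfod, and
 * record.call-graph, which is mapped onto the generic
 * call-graph.record-mode handling.
 */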
2949 static int perf_record_config(const char *var, const char *value, void *cb)
2950 {
2951 	struct record *rec = cb;
2952 
2953 	if (!strcmp(var, "record.build-id")) {
2954 		if (!strcmp(value, "cache"))
2955 			rec->no_buildid_cache = false;
2956 		else if (!strcmp(value, "no-cache"))
2957 			rec->no_buildid_cache = true;
2958 		else if (!strcmp(value, "skip"))
2959 			rec->no_buildid = true;
2960 		else if (!strcmp(value, "mmap"))
2961 			rec->buildid_mmap = true;
2962 		else
2963 			return -1;
2964 		return 0;
2965 	}
2966 	if (!strcmp(var, "record.call-graph")) {
2967 		var = "call-graph.record-mode";
2968 		return perf_default_config(var, value, cb);
2969 	}
2970 #ifdef HAVE_AIO_SUPPORT
2971 	if (!strcmp(var, "record.aio")) {
2972 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2973 		if (!rec->opts.nr_cblocks)
2974 			rec->opts.nr_cblocks = nr_cblocks_default;
2975 	}
2976 #endif
2977 	if (!strcmp(var, "record.debuginfod")) {
2978 		rec->debuginfod.urls = strdup(value);
2979 		if (!rec->debuginfod.urls)
2980 			return -ENOMEM;
2981 		rec->debuginfod.set = true;
2982 	}
2983 
2984 	return 0;
2985 }
2986 
2987 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2988 {
2989 	struct record *rec = (struct record *)opt->value;
2990 
2991 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2992 }
2993 
2994 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2995 {
2996 	struct record_opts *opts = (struct record_opts *)opt->value;
2997 
2998 	if (unset || !str)
2999 		return 0;
3000 
3001 	if (!strcasecmp(str, "node"))
3002 		opts->affinity = PERF_AFFINITY_NODE;
3003 	else if (!strcasecmp(str, "cpu"))
3004 		opts->affinity = PERF_AFFINITY_CPU;
3005 
3006 	return 0;
3007 }
3008 
3009 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3010 {
3011 	mask->nbits = nr_bits;
3012 	mask->bits = bitmap_zalloc(mask->nbits);
3013 	if (!mask->bits)
3014 		return -ENOMEM;
3015 
3016 	return 0;
3017 }
3018 
3019 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3020 {
3021 	bitmap_free(mask->bits);
3022 	mask->nbits = 0;
3023 }
3024 
3025 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3026 {
3027 	int ret;
3028 
3029 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3030 	if (ret) {
3031 		mask->affinity.bits = NULL;
3032 		return ret;
3033 	}
3034 
3035 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3036 	if (ret) {
3037 		record__mmap_cpu_mask_free(&mask->maps);
3038 		mask->maps.bits = NULL;
3039 	}
3040 
3041 	return ret;
3042 }
3043 
3044 static void record__thread_mask_free(struct thread_mask *mask)
3045 {
3046 	record__mmap_cpu_mask_free(&mask->maps);
3047 	record__mmap_cpu_mask_free(&mask->affinity);
3048 }
3049 
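/*
 * Parse the --threads specification: no value defaults to one thread per CPU,
 * a recognized tag (e.g. cpu, core, package, numa) selects that grouping, and
 * anything else is kept verbatim as a user defined masks specification.
 */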
3050 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3051 {
3052 	int s;
3053 	struct record_opts *opts = opt->value;
3054 
3055 	if (unset || !str || !strlen(str)) {
3056 		opts->threads_spec = THREAD_SPEC__CPU;
3057 	} else {
3058 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3059 			if (s == THREAD_SPEC__USER) {
3060 				opts->threads_user_spec = strdup(str);
3061 				if (!opts->threads_user_spec)
3062 					return -ENOMEM;
3063 				opts->threads_spec = THREAD_SPEC__USER;
3064 				break;
3065 			}
3066 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3067 				opts->threads_spec = s;
3068 				break;
3069 			}
3070 		}
3071 	}
3072 
3073 	if (opts->threads_spec == THREAD_SPEC__USER)
3074 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3075 	else
3076 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3077 
3078 	return 0;
3079 }
3080 
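/*
 * Parse --max-size, accepting an optional B/K/M/G suffix (e.g. --max-size=2G,
 * illustrative), used to limit the maximum size of the output file.
 */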
3081 static int parse_output_max_size(const struct option *opt,
3082 				 const char *str, int unset)
3083 {
3084 	unsigned long *s = (unsigned long *)opt->value;
3085 	static struct parse_tag tags_size[] = {
3086 		{ .tag  = 'B', .mult = 1       },
3087 		{ .tag  = 'K', .mult = 1 << 10 },
3088 		{ .tag  = 'M', .mult = 1 << 20 },
3089 		{ .tag  = 'G', .mult = 1 << 30 },
3090 		{ .tag  = 0 },
3091 	};
3092 	unsigned long val;
3093 
3094 	if (unset) {
3095 		*s = 0;
3096 		return 0;
3097 	}
3098 
3099 	val = parse_tag_value(str, tags_size);
3100 	if (val != (unsigned long) -1) {
3101 		*s = val;
3102 		return 0;
3103 	}
3104 
3105 	return -1;
3106 }
3107 
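/*
 * Parse -m/--mmap-pages "pages[,pages]": the first value sizes the regular
 * mmap data buffers, the optional second one the AUX area tracing buffers,
 * e.g. -m 512,128 (illustrative values).
 */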
3108 static int record__parse_mmap_pages(const struct option *opt,
3109 				    const char *str,
3110 				    int unset __maybe_unused)
3111 {
3112 	struct record_opts *opts = opt->value;
3113 	char *s, *p;
3114 	unsigned int mmap_pages;
3115 	int ret;
3116 
3117 	if (!str)
3118 		return -EINVAL;
3119 
3120 	s = strdup(str);
3121 	if (!s)
3122 		return -ENOMEM;
3123 
3124 	p = strchr(s, ',');
3125 	if (p)
3126 		*p = '\0';
3127 
3128 	if (*s) {
3129 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3130 		if (ret)
3131 			goto out_free;
3132 		opts->mmap_pages = mmap_pages;
3133 	}
3134 
3135 	if (!p) {
3136 		ret = 0;
3137 		goto out_free;
3138 	}
3139 
3140 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3141 	if (ret)
3142 		goto out_free;
3143 
3144 	opts->auxtrace_mmap_pages = mmap_pages;
3145 
3146 out_free:
3147 	free(s);
3148 	return ret;
3149 }
3150 
3151 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3152 {
3153 }
3154 
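/*
 * Parse --control, which takes either already-open descriptors
 * ("fd:ctl-fd[,ack-fd]") or fifo paths ("fifo:ctl-fifo[,ack-fifo]") over
 * which 'enable'/'disable'/'snapshot' commands are received, e.g.
 * (illustrative): perf record --control=fifo:ctl.fifo,ack.fifo ...
 */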
3155 static int parse_control_option(const struct option *opt,
3156 				const char *str,
3157 				int unset __maybe_unused)
3158 {
3159 	struct record_opts *opts = opt->value;
3160 
3161 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3162 }
3163 
3164 static void switch_output_size_warn(struct record *rec)
3165 {
3166 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3167 	struct switch_output *s = &rec->switch_output;
3168 
3169 	wakeup_size /= 2;
3170 
3171 	if (s->size < wakeup_size) {
3172 		char buf[100];
3173 
3174 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3175 		pr_warning("WARNING: switch-output data size lower than "
3176 			   "wakeup kernel buffer size (%s), "
3177 			   "expect bigger perf.data sizes\n", buf);
3178 	}
3179 }
3180 
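/*
 * Decode the --switch-output argument into signal, size or time mode, e.g.
 * (illustrative): --switch-output (or =signal), --switch-output=2G,
 * --switch-output=30s.  Not available in parallel streaming mode, and
 * --switch-output-event implies the signal variant.
 */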
3181 static int switch_output_setup(struct record *rec)
3182 {
3183 	struct switch_output *s = &rec->switch_output;
3184 	static struct parse_tag tags_size[] = {
3185 		{ .tag  = 'B', .mult = 1       },
3186 		{ .tag  = 'K', .mult = 1 << 10 },
3187 		{ .tag  = 'M', .mult = 1 << 20 },
3188 		{ .tag  = 'G', .mult = 1 << 30 },
3189 		{ .tag  = 0 },
3190 	};
3191 	static struct parse_tag tags_time[] = {
3192 		{ .tag  = 's', .mult = 1        },
3193 		{ .tag  = 'm', .mult = 60       },
3194 		{ .tag  = 'h', .mult = 60*60    },
3195 		{ .tag  = 'd', .mult = 60*60*24 },
3196 		{ .tag  = 0 },
3197 	};
3198 	unsigned long val;
3199 
3200 	/*
3201 	 * If we're using --switch-output-event, then we imply
3202 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3203 	 * thread to its parent.
3204 	 */
3205 	if (rec->switch_output_event_set) {
3206 		if (record__threads_enabled(rec)) {
3207 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3208 			return 0;
3209 		}
3210 		goto do_signal;
3211 	}
3212 
3213 	if (!s->set)
3214 		return 0;
3215 
3216 	if (record__threads_enabled(rec)) {
3217 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3218 		return 0;
3219 	}
3220 
3221 	if (!strcmp(s->str, "signal")) {
3222 do_signal:
3223 		s->signal = true;
3224 		pr_debug("switch-output with SIGUSR2 signal\n");
3225 		goto enabled;
3226 	}
3227 
3228 	val = parse_tag_value(s->str, tags_size);
3229 	if (val != (unsigned long) -1) {
3230 		s->size = val;
3231 		pr_debug("switch-output with %s size threshold\n", s->str);
3232 		goto enabled;
3233 	}
3234 
3235 	val = parse_tag_value(s->str, tags_time);
3236 	if (val != (unsigned long) -1) {
3237 		s->time = val;
3238 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3239 			 s->str, s->time);
3240 		goto enabled;
3241 	}
3242 
3243 	return -1;
3244 
3245 enabled:
3246 	rec->timestamp_filename = true;
3247 	s->enabled              = true;
3248 
3249 	if (s->size && !rec->opts.no_buffering)
3250 		switch_output_size_warn(rec);
3251 
3252 	return 0;
3253 }
3254 
3255 static const char * const __record_usage[] = {
3256 	"perf record [<options>] [<command>]",
3257 	"perf record [<options>] -- <command> [<options>]",
3258 	NULL
3259 };
3260 const char * const *record_usage = __record_usage;
3261 
3262 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3263 				  struct perf_sample *sample, struct machine *machine)
3264 {
3265 	/*
3266 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3267 	 * so there is no need to add them twice.
3268 	 */
3269 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3270 		return 0;
3271 	return perf_event__process_mmap(tool, event, sample, machine);
3272 }
3273 
3274 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3275 				   struct perf_sample *sample, struct machine *machine)
3276 {
3277 	/*
3278 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3279 	 * so there is no need to add them twice.
3280 	 */
3281 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3282 		return 0;
3283 
3284 	return perf_event__process_mmap2(tool, event, sample, machine);
3285 }
3286 
3287 static int process_timestamp_boundary(struct perf_tool *tool,
3288 				      union perf_event *event __maybe_unused,
3289 				      struct perf_sample *sample,
3290 				      struct machine *machine __maybe_unused)
3291 {
3292 	struct record *rec = container_of(tool, struct record, tool);
3293 
3294 	set_timestamp_boundary(rec, sample->time);
3295 	return 0;
3296 }
3297 
3298 static int parse_record_synth_option(const struct option *opt,
3299 				     const char *str,
3300 				     int unset __maybe_unused)
3301 {
3302 	struct record_opts *opts = opt->value;
3303 	char *p = strdup(str);
3304 
3305 	if (p == NULL)
3306 		return -1;
3307 
3308 	opts->synth = parse_synth_opt(p);
3309 	free(p);
3310 
3311 	if (opts->synth < 0) {
3312 		pr_err("Invalid synth option: %s\n", str);
3313 		return -1;
3314 	}
3315 	return 0;
3316 }
3317 
3318 /*
3319  * XXX Ideally this would be local to cmd_record() and passed to a record__new()
3320  * because we need to have access to it in record__exit(), which is called
3321  * after cmd_record() exits, but since record_options needs to be accessible to
3322  * builtin-script, leave it here.
3323  *
3324  * At least we don't touch it in all the other functions here directly.
3325  *
3326  * Just say no to tons of global variables, sigh.
3327  */
3328 static struct record record = {
3329 	.opts = {
3330 		.sample_time	     = true,
3331 		.mmap_pages	     = UINT_MAX,
3332 		.user_freq	     = UINT_MAX,
3333 		.user_interval	     = ULLONG_MAX,
3334 		.freq		     = 4000,
3335 		.target		     = {
3336 			.uses_mmap   = true,
3337 			.default_per_cpu = true,
3338 		},
3339 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3340 		.nr_threads_synthesize = 1,
3341 		.ctl_fd              = -1,
3342 		.ctl_fd_ack          = -1,
3343 		.synth               = PERF_SYNTH_ALL,
3344 	},
3345 	.tool = {
3346 		.sample		= process_sample_event,
3347 		.fork		= perf_event__process_fork,
3348 		.exit		= perf_event__process_exit,
3349 		.comm		= perf_event__process_comm,
3350 		.namespaces	= perf_event__process_namespaces,
3351 		.mmap		= build_id__process_mmap,
3352 		.mmap2		= build_id__process_mmap2,
3353 		.itrace_start	= process_timestamp_boundary,
3354 		.aux		= process_timestamp_boundary,
3355 		.ordered_events	= true,
3356 	},
3357 };
3358 
3359 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3360 	"\n\t\t\t\tDefault: fp";
3361 
3362 static bool dry_run;
3363 
3364 static struct parse_events_option_args parse_events_option_args = {
3365 	.evlistp = &record.evlist,
3366 };
3367 
3368 static struct parse_events_option_args switch_output_parse_events_option_args = {
3369 	.evlistp = &record.sb_evlist,
3370 };
3371 
3372 /*
3373  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3374  * with it and switch to using the library functions in perf_evlist that came
3375  * from builtin-record.c, i.e. use record_opts,
3376  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3377  * using pipes, etc.
3378  */
3379 static struct option __record_options[] = {
3380 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3381 		     "event selector. use 'perf list' to list available events",
3382 		     parse_events_option),
3383 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3384 		     "event filter", parse_filter),
3385 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3386 			   NULL, "don't record events from perf itself",
3387 			   exclude_perf),
3388 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3389 		    "record events on existing process id"),
3390 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3391 		    "record events on existing thread id"),
3392 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3393 		    "collect data with this RT SCHED_FIFO priority"),
3394 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3395 		    "collect data without buffering"),
3396 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3397 		    "collect raw sample records from all opened counters"),
3398 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3399 			    "system-wide collection from all CPUs"),
3400 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3401 		    "list of cpus to monitor"),
3402 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3403 	OPT_STRING('o', "output", &record.data.path, "file",
3404 		    "output file name"),
3405 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3406 			&record.opts.no_inherit_set,
3407 			"child tasks do not inherit counters"),
3408 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3409 		    "synthesize non-sample events at the end of output"),
3410 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3411 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3412 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3413 		    "Fail if the specified frequency can't be used"),
3414 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3415 		     "profile at this frequency",
3416 		      record__parse_freq),
3417 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3418 		     "number of mmap data pages and AUX area tracing mmap pages",
3419 		     record__parse_mmap_pages),
3420 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3421 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3422 		     record__mmap_flush_parse),
3423 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3424 			   NULL, "enables call-graph recording" ,
3425 			   &record_callchain_opt),
3426 	OPT_CALLBACK(0, "call-graph", &record.opts,
3427 		     "record_mode[,record_size]", record_callchain_help,
3428 		     &record_parse_callchain_opt),
3429 	OPT_INCR('v', "verbose", &verbose,
3430 		    "be more verbose (show counter open errors, etc)"),
3431 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3432 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3433 		    "per thread counts"),
3434 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3435 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3436 		    "Record the sample physical addresses"),
3437 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3438 		    "Record the sampled data address data page size"),
3439 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3440 		    "Record the sampled code address (ip) page size"),
3441 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3442 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3443 		    "Record the sample identifier"),
3444 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3445 			&record.opts.sample_time_set,
3446 			"Record the sample timestamps"),
3447 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3448 			"Record the sample period"),
3449 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3450 		    "don't sample"),
3451 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3452 			&record.no_buildid_cache_set,
3453 			"do not update the buildid cache"),
3454 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3455 			&record.no_buildid_set,
3456 			"do not collect buildids in perf.data"),
3457 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3458 		     "monitor event in cgroup name only",
3459 		     parse_cgroups),
3460 	OPT_CALLBACK('D', "delay", &record, "ms",
3461 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3462 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3463 		     record__parse_event_enable_time),
3464 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3465 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3466 		   "user to profile"),
3467 
3468 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3469 		     "branch any", "sample any taken branches",
3470 		     parse_branch_stack),
3471 
3472 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3473 		     "branch filter mask", "branch stack filter modes",
3474 		     parse_branch_stack),
3475 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3476 		    "sample by weight (on special events only)"),
3477 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3478 		    "sample transaction flags (special events only)"),
3479 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3480 		    "use per-thread mmaps"),
3481 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3482 		    "sample selected machine registers on interrupt,"
3483 		    " use '-I?' to list register names", parse_intr_regs),
3484 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3485 		    "sample selected machine registers on interrupt,"
3486 		    " use '--user-regs=?' to list register names", parse_user_regs),
3487 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3488 		    "Record running/enabled time of read (:S) events"),
3489 	OPT_CALLBACK('k', "clockid", &record.opts,
3490 	"clockid", "clockid to use for events, see clock_gettime()",
3491 	parse_clockid),
3492 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3493 			  "opts", "AUX area tracing Snapshot Mode", ""),
3494 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3495 			  "opts", "sample AUX area", ""),
3496 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3497 			"per thread proc mmap processing timeout in ms"),
3498 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3499 		    "Record namespaces events"),
3500 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3501 		    "Record cgroup events"),
3502 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3503 			&record.opts.record_switch_events_set,
3504 			"Record context switch events"),
3505 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3506 			 "Configure all used events to run in kernel space.",
3507 			 PARSE_OPT_EXCLUSIVE),
3508 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3509 			 "Configure all used events to run in user space.",
3510 			 PARSE_OPT_EXCLUSIVE),
3511 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3512 		    "collect kernel callchains"),
3513 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3514 		    "collect user callchains"),
3515 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3516 		   "file", "vmlinux pathname"),
3517 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3518 		    "Record build-id of all DSOs regardless of hits"),
3519 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3520 		    "Record build-id in map events"),
3521 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3522 		    "append timestamp to output filename"),
3523 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3524 		    "Record timestamp boundary (time of first/last samples)"),
3525 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3526 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3527 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3528 			  "signal"),
3529 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3530 			 &record.switch_output_event_set, "switch output event",
3531 			 "switch output event selector. use 'perf list' to list available events",
3532 			 parse_events_option_new_evlist),
3533 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3534 		   "Limit number of switch output generated files"),
3535 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3536 		    "Parse options then exit"),
3537 #ifdef HAVE_AIO_SUPPORT
3538 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3539 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3540 		     record__aio_parse),
3541 #endif
3542 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3543 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3544 		     record__parse_affinity),
3545 #ifdef HAVE_ZSTD_SUPPORT
3546 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3547 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3548 			    record__parse_comp_level),
3549 #endif
3550 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3551 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3552 	OPT_UINTEGER(0, "num-thread-synthesize",
3553 		     &record.opts.nr_threads_synthesize,
3554 		     "number of threads to run for event synthesis"),
3555 #ifdef HAVE_LIBPFM
3556 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3557 		"libpfm4 event selector. use 'perf list' to list available events",
3558 		parse_libpfm_events_option),
3559 #endif
3560 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3561 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3562 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3563 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3564 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3565 		      parse_control_option),
3566 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3567 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3568 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3569 			  &record.debuginfod.set, "debuginfod urls",
3570 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3571 			  "system"),
3572 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3573 			    "write collected trace data into several data files using parallel threads",
3574 			    record__parse_threads),
3575 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3576 	OPT_END()
3577 };
3578 
3579 struct option *record_options = __record_options;
3580 
3581 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3582 {
3583 	struct perf_cpu cpu;
3584 	int idx;
3585 
3586 	if (cpu_map__is_dummy(cpus))
3587 		return 0;
3588 
3589 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3590 		/* Return ENODEV if the input cpu is greater than max cpu */
3591 		if ((unsigned long)cpu.cpu > mask->nbits)
3592 			return -ENODEV;
3593 		__set_bit(cpu.cpu, mask->bits);
3594 	}
3595 
3596 	return 0;
3597 }
3598 
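/*
 * Translate a cpu list string, in any form perf_cpu_map__new() accepts
 * (e.g. "0-3" or "0,2,4-7" - illustrative values), into the bitmap
 * representation used for the thread masks.
 */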
3599 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3600 {
3601 	struct perf_cpu_map *cpus;
3602 
3603 	cpus = perf_cpu_map__new(mask_spec);
3604 	if (!cpus)
3605 		return -ENOMEM;
3606 
3607 	bitmap_zero(mask->bits, mask->nbits);
3608 	if (record__mmap_cpu_mask_init(mask, cpus)) {
		/* Also drop the cpu map reference on the error path. */
		perf_cpu_map__put(cpus);
		return -ENODEV;
	}
3610 
3611 	perf_cpu_map__put(cpus);
3612 
3613 	return 0;
3614 }
3615 
3616 static void record__free_thread_masks(struct record *rec, int nr_threads)
3617 {
3618 	int t;
3619 
3620 	if (rec->thread_masks)
3621 		for (t = 0; t < nr_threads; t++)
3622 			record__thread_mask_free(&rec->thread_masks[t]);
3623 
3624 	zfree(&rec->thread_masks);
3625 }
3626 
3627 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3628 {
3629 	int t, ret;
3630 
3631 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3632 	if (!rec->thread_masks) {
3633 		pr_err("Failed to allocate thread masks\n");
3634 		return -ENOMEM;
3635 	}
3636 
3637 	for (t = 0; t < nr_threads; t++) {
3638 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3639 		if (ret) {
3640 			pr_err("Failed to allocate thread masks[%d]\n", t);
3641 			goto out_free;
3642 		}
3643 	}
3644 
3645 	return 0;
3646 
3647 out_free:
3648 	record__free_thread_masks(rec, nr_threads);
3649 
3650 	return ret;
3651 }
3652 
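/*
 * --threads=cpu: one streaming thread per monitored CPU; thread t services
 * the mmap of perf_cpu_map__cpu(cpus, t) and is affine to that same CPU.
 * For example (illustrative), monitoring CPUs 0-3 yields four threads with
 * maps/affinity masks {0}, {1}, {2} and {3}.
 */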
3653 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3654 {
3655 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3656 
3657 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3658 	if (ret)
3659 		return ret;
3660 
3661 	rec->nr_threads = nr_cpus;
3662 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3663 
3664 	for (t = 0; t < rec->nr_threads; t++) {
3665 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3666 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3667 		if (verbose > 0) {
3668 			pr_debug("thread_masks[%d]: ", t);
3669 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3670 			pr_debug("thread_masks[%d]: ", t);
3671 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3672 		}
3673 	}
3674 
3675 	return 0;
3676 }
3677 
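/*
 * Build one thread mask per spec entry.  Each entry pairs a cpu list for the
 * mmaps the thread services (maps_spec) with a cpu list the thread may run
 * on (affinity_spec); e.g. (illustrative) maps_spec[0] = "0-3" with
 * affinity_spec[0] = "0" gives a thread reading the buffers of CPUs 0-3
 * while pinned to CPU 0.  Bits outside the monitored CPUs are dropped, and
 * empty or mutually overlapping masks are rejected.
 */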
3678 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3679 					  const char **maps_spec, const char **affinity_spec,
3680 					  u32 nr_spec)
3681 {
3682 	u32 s;
3683 	int ret = 0, t = 0;
3684 	struct mmap_cpu_mask cpus_mask;
3685 	struct thread_mask thread_mask, full_mask, *thread_masks;
3686 
3687 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3688 	if (ret) {
3689 		pr_err("Failed to allocate CPUs mask\n");
3690 		return ret;
3691 	}
3692 
3693 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3694 	if (ret) {
3695 		pr_err("Failed to init cpu mask\n");
3696 		goto out_free_cpu_mask;
3697 	}
3698 
3699 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3700 	if (ret) {
3701 		pr_err("Failed to allocate full mask\n");
3702 		goto out_free_cpu_mask;
3703 	}
3704 
3705 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3706 	if (ret) {
3707 		pr_err("Failed to allocate thread mask\n");
3708 		goto out_free_full_and_cpu_masks;
3709 	}
3710 
3711 	for (s = 0; s < nr_spec; s++) {
3712 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3713 		if (ret) {
3714 			pr_err("Failed to initialize maps thread mask\n");
3715 			goto out_free;
3716 		}
3717 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3718 		if (ret) {
3719 			pr_err("Failed to initialize affinity thread mask\n");
3720 			goto out_free;
3721 		}
3722 
3723 		/* ignore invalid CPUs but do not allow empty masks */
3724 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3725 				cpus_mask.bits, thread_mask.maps.nbits)) {
3726 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3727 			ret = -EINVAL;
3728 			goto out_free;
3729 		}
3730 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3731 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3732 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3733 			ret = -EINVAL;
3734 			goto out_free;
3735 		}
3736 
3737 		/* do not allow intersection with other masks (full_mask) */
3738 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3739 				      thread_mask.maps.nbits)) {
3740 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3741 			ret = -EINVAL;
3742 			goto out_free;
3743 		}
3744 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3745 				      thread_mask.affinity.nbits)) {
3746 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3747 			ret = -EINVAL;
3748 			goto out_free;
3749 		}
3750 
3751 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3752 			  thread_mask.maps.bits, full_mask.maps.nbits);
3753 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3754 			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3755 
3756 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3757 		if (!thread_masks) {
3758 			pr_err("Failed to reallocate thread masks\n");
3759 			ret = -ENOMEM;
3760 			goto out_free;
3761 		}
3762 		rec->thread_masks = thread_masks;
3763 		rec->thread_masks[t] = thread_mask;
3764 		if (verbose > 0) {
3765 			pr_debug("thread_masks[%d]: ", t);
3766 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3767 			pr_debug("thread_masks[%d]: ", t);
3768 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3769 		}
3770 		t++;
3771 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3772 		if (ret) {
3773 			pr_err("Failed to allocate thread mask\n");
3774 			goto out_free_full_and_cpu_masks;
3775 		}
3776 	}
3777 	rec->nr_threads = t;
3778 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3779 	if (!rec->nr_threads)
3780 		ret = -EINVAL;
3781 
3782 out_free:
3783 	record__thread_mask_free(&thread_mask);
3784 out_free_full_and_cpu_masks:
3785 	record__thread_mask_free(&full_mask);
3786 out_free_cpu_mask:
3787 	record__mmap_cpu_mask_free(&cpus_mask);
3788 
3789 	return ret;
3790 }
3791 
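/*
 * Topology based layouts: --threads=core groups the monitored CPUs by core
 * (SMT siblings share a thread), --threads=package groups them by
 * package/socket.  The topology cpu list strings (for instance a
 * hypothetical "0,4" core or "0-15" package) are passed unchanged to
 * record__init_thread_masks_spec() as both the maps and affinity specs.
 */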
3792 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3793 {
3794 	int ret;
3795 	struct cpu_topology *topo;
3796 
3797 	topo = cpu_topology__new();
3798 	if (!topo) {
3799 		pr_err("Failed to allocate CPU topology\n");
3800 		return -ENOMEM;
3801 	}
3802 
3803 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3804 					     topo->core_cpus_list, topo->core_cpus_lists);
3805 	cpu_topology__delete(topo);
3806 
3807 	return ret;
3808 }
3809 
3810 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3811 {
3812 	int ret;
3813 	struct cpu_topology *topo;
3814 
3815 	topo = cpu_topology__new();
3816 	if (!topo) {
3817 		pr_err("Failed to allocate CPU topology\n");
3818 		return -ENOMEM;
3819 	}
3820 
3821 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3822 					     topo->package_cpus_list, topo->package_cpus_lists);
3823 	cpu_topology__delete(topo);
3824 
3825 	return ret;
3826 }
3827 
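/*
 * --threads=numa: one thread per NUMA node, using the node's cpu list
 * (e.g. "0-7" on a hypothetical two node machine) as both the maps and the
 * affinity spec.
 */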
3828 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3829 {
3830 	u32 s;
3831 	int ret;
3832 	const char **spec;
3833 	struct numa_topology *topo;
3834 
3835 	topo = numa_topology__new();
3836 	if (!topo) {
3837 		pr_err("Failed to allocate NUMA topology\n");
3838 		return -ENOMEM;
3839 	}
3840 
3841 	spec = zalloc(topo->nr * sizeof(char *));
3842 	if (!spec) {
3843 		pr_err("Failed to allocate NUMA spec\n");
3844 		ret = -ENOMEM;
3845 		goto out_delete_topo;
3846 	}
3847 	for (s = 0; s < topo->nr; s++)
3848 		spec[s] = topo->nodes[s].cpus;
3849 
3850 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3851 
3852 	zfree(&spec);
3853 
3854 out_delete_topo:
3855 	numa_topology__delete(topo);
3856 
3857 	return ret;
3858 }
3859 
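/*
 * --threads=<maps>/<affinity>[:<maps>/<affinity>...]: user defined layout.
 * Entries are split on ':' and each entry on '/'.  For example (values
 * illustrative):
 *
 *   perf record --threads=0-3/3:4-7/4 -a
 *
 * creates two threads, one reading the buffers of CPUs 0-3 while pinned to
 * CPU 3 and one reading CPUs 4-7 while pinned to CPU 4.
 */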
3860 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3861 {
3862 	int t, ret;
3863 	u32 s, nr_spec = 0;
3864 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3865 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3866 
3867 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3868 		spec = strtok_r(user_spec, ":", &spec_ptr);
3869 		if (spec == NULL)
3870 			break;
3871 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3872 		mask = strtok_r(spec, "/", &mask_ptr);
3873 		if (mask == NULL)
3874 			break;
3875 		pr_debug2("  maps mask: %s\n", mask);
3876 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3877 		if (!tmp_spec) {
3878 			pr_err("Failed to reallocate maps spec\n");
3879 			ret = -ENOMEM;
3880 			goto out_free;
3881 		}
3882 		maps_spec = tmp_spec;
3883 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3884 		if (!maps_spec[nr_spec]) {
3885 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3886 			ret = -ENOMEM;
3887 			goto out_free;
3888 		}
3889 		mask = strtok_r(NULL, "/", &mask_ptr);
3890 		if (mask == NULL) {
3891 			pr_err("Invalid thread maps or affinity specs\n");
3892 			ret = -EINVAL;
3893 			goto out_free;
3894 		}
3895 		pr_debug2("  affinity mask: %s\n", mask);
3896 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3897 		if (!tmp_spec) {
3898 			pr_err("Failed to reallocate affinity spec\n");
3899 			ret = -ENOMEM;
3900 			goto out_free;
3901 		}
3902 		affinity_spec = tmp_spec;
3903 		affinity_spec[nr_spec] = strdup(mask);
3904 		if (!affinity_spec[nr_spec]) {
3905 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3906 			ret = -ENOMEM;
3907 			goto out_free;
3908 		}
3909 		dup_mask = NULL;
3910 		nr_spec++;
3911 	}
3912 
3913 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3914 					     (const char **)affinity_spec, nr_spec);
3915 
3916 out_free:
3917 	free(dup_mask);
3918 	for (s = 0; s < nr_spec; s++) {
3919 		if (maps_spec)
3920 			free(maps_spec[s]);
3921 		if (affinity_spec)
3922 			free(affinity_spec[s]);
3923 	}
3924 	free(affinity_spec);
3925 	free(maps_spec);
3926 
3927 	return ret;
3928 }
3929 
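/*
 * Non-parallel case: a single mask spanning all monitored CPUs, matching the
 * traditional single threaded reader.
 */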
3930 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3931 {
3932 	int ret;
3933 
3934 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3935 	if (ret)
3936 		return ret;
3937 
3938 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3939 		return -ENODEV;
3940 
3941 	rec->nr_threads = 1;
3942 
3943 	return 0;
3944 }
3945 
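/*
 * Pick the thread mask layout for parallel data streaming; without --threads
 * a single default mask covering every monitored CPU is used.  The explicit
 * specs map to the initializers above:
 *
 *   --threads=cpu              record__init_thread_cpu_masks()
 *   --threads=core             record__init_thread_core_masks()
 *   --threads=package          record__init_thread_package_masks()
 *   --threads=numa             record__init_thread_numa_masks()
 *   --threads=<maps>/<aff>...  record__init_thread_user_masks()
 */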
3946 static int record__init_thread_masks(struct record *rec)
3947 {
3948 	int ret = 0;
3949 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3950 
3951 	if (!record__threads_enabled(rec))
3952 		return record__init_thread_default_masks(rec, cpus);
3953 
3954 	if (evlist__per_thread(rec->evlist)) {
3955 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3956 		return -EINVAL;
3957 	}
3958 
3959 	switch (rec->opts.threads_spec) {
3960 	case THREAD_SPEC__CPU:
3961 		ret = record__init_thread_cpu_masks(rec, cpus);
3962 		break;
3963 	case THREAD_SPEC__CORE:
3964 		ret = record__init_thread_core_masks(rec, cpus);
3965 		break;
3966 	case THREAD_SPEC__PACKAGE:
3967 		ret = record__init_thread_package_masks(rec, cpus);
3968 		break;
3969 	case THREAD_SPEC__NUMA:
3970 		ret = record__init_thread_numa_masks(rec, cpus);
3971 		break;
3972 	case THREAD_SPEC__USER:
3973 		ret = record__init_thread_user_masks(rec, cpus);
3974 		break;
3975 	default:
3976 		break;
3977 	}
3978 
3979 	return ret;
3980 }
3981 
3982 int cmd_record(int argc, const char **argv)
3983 {
3984 	int err;
3985 	struct record *rec = &record;
3986 	char errbuf[BUFSIZ];
3987 
3988 	setlocale(LC_ALL, "");
3989 
3990 #ifndef HAVE_BPF_SKEL
3991 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3992 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3993 # undef set_nobuild
3994 #endif
3995 
3996 	/* Disable eager loading of kernel symbols, which adds overhead to perf record. */
3997 	symbol_conf.lazy_load_kernel_maps = true;
3998 	rec->opts.affinity = PERF_AFFINITY_SYS;
3999 
4000 	rec->evlist = evlist__new();
4001 	if (rec->evlist == NULL)
4002 		return -ENOMEM;
4003 
4004 	err = perf_config(perf_record_config, rec);
4005 	if (err)
4006 		return err;
4007 
4008 	argc = parse_options(argc, argv, record_options, record_usage,
4009 			    PARSE_OPT_STOP_AT_NON_OPTION);
4010 	if (quiet)
4011 		perf_quiet_option();
4012 
4013 	err = symbol__validate_sym_arguments();
4014 	if (err)
4015 		return err;
4016 
4017 	perf_debuginfod_setup(&record.debuginfod);
4018 
4019 	/* Make system wide (-a) the default target. */
4020 	if (!argc && target__none(&rec->opts.target))
4021 		rec->opts.target.system_wide = true;
4022 
4023 	if (nr_cgroups && !rec->opts.target.system_wide) {
4024 		usage_with_options_msg(record_usage, record_options,
4025 			"cgroup monitoring only available in system-wide mode");
4026 
4027 	}
4028 
4029 	if (rec->buildid_mmap) {
4030 		if (!perf_can_record_build_id()) {
4031 			pr_err("Failed: no support for recording build id in mmap events, update your kernel.\n");
4032 			err = -EINVAL;
4033 			goto out_opts;
4034 		}
4035 		pr_debug("Enabling build id in mmap2 events.\n");
4036 		/* Enable mmap build id synthesizing. */
4037 		symbol_conf.buildid_mmap2 = true;
4038 		/* Enable perf_event_attr::build_id bit. */
4039 		rec->opts.build_id = true;
4040 		/* Disable build id cache. */
4041 		rec->no_buildid = true;
4042 	}
4043 
4044 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4045 		pr_err("Kernel has no cgroup sampling support.\n");
4046 		err = -EINVAL;
4047 		goto out_opts;
4048 	}
4049 
4050 	if (rec->opts.kcore)
4051 		rec->opts.text_poke = true;
4052 
4053 	if (rec->opts.kcore || record__threads_enabled(rec))
4054 		rec->data.is_dir = true;
4055 
4056 	if (record__threads_enabled(rec)) {
4057 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4058 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4059 			goto out_opts;
4060 		}
4061 		if (record__aio_enabled(rec)) {
4062 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4063 			goto out_opts;
4064 		}
4065 	}
4066 
4067 	if (rec->opts.comp_level != 0) {
4068 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4069 		rec->no_buildid = true;
4070 	}
4071 
4072 	if (rec->opts.record_switch_events &&
4073 	    !perf_can_record_switch_events()) {
4074 		ui__error("kernel does not support recording context switch events\n");
4075 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4076 		err = -EINVAL;
4077 		goto out_opts;
4078 	}
4079 
4080 	if (switch_output_setup(rec)) {
4081 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4082 		err = -EINVAL;
4083 		goto out_opts;
4084 	}
4085 
4086 	if (rec->switch_output.time) {
4087 		signal(SIGALRM, alarm_sig_handler);
4088 		alarm(rec->switch_output.time);
4089 	}
4090 
4091 	if (rec->switch_output.num_files) {
4092 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4093 						      sizeof(char *));
4094 		if (!rec->switch_output.filenames) {
4095 			err = -ENOMEM;
4096 			goto out_opts;
4097 		}
4098 	}
4099 
4100 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4101 		rec->timestamp_filename = false;
4102 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4103 	}
4104 
4105 	/*
4106 	 * Allow aliases to facilitate the lookup of symbols for address
4107 	 * filters. Refer to auxtrace_parse_filters().
4108 	 */
4109 	symbol_conf.allow_aliases = true;
4110 
4111 	symbol__init(NULL);
4112 
4113 	err = record__auxtrace_init(rec);
4114 	if (err)
4115 		goto out;
4116 
4117 	if (dry_run)
4118 		goto out;
4119 
4120 	err = -ENOMEM;
4121 
4122 	if (rec->no_buildid_cache || rec->no_buildid) {
4123 		disable_buildid_cache();
4124 	} else if (rec->switch_output.enabled) {
4125 		/*
4126 		 * In 'perf record --switch-output', disable buildid
4127 		 * generation by default to reduce data file switching
4128 		 * overhead. Still generate buildids if they are explicitly
4129 		 * required using
4130 		 *
4131 		 *  perf record --switch-output --no-no-buildid \
4132 		 *              --no-no-buildid-cache
4133 		 *
4134 		 * The following code is equivalent to:
4135 		 *
4136 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4137 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4138 		 *         disable_buildid_cache();
4139 		 */
4140 		bool disable = true;
4141 
4142 		if (rec->no_buildid_set && !rec->no_buildid)
4143 			disable = false;
4144 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4145 			disable = false;
4146 		if (disable) {
4147 			rec->no_buildid = true;
4148 			rec->no_buildid_cache = true;
4149 			disable_buildid_cache();
4150 		}
4151 	}
4152 
4153 	if (rec->opts.overwrite)
4154 		rec->opts.tail_synthesize = true;
4155 
4156 	if (rec->evlist->core.nr_entries == 0) {
4157 		bool can_profile_kernel = perf_event_paranoid_check(1);
4158 
4159 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4160 		if (err)
4161 			goto out;
4162 	}
4163 
4164 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4165 		rec->opts.no_inherit = true;
4166 
4167 	err = target__validate(&rec->opts.target);
4168 	if (err) {
4169 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4170 		ui__warning("%s\n", errbuf);
4171 	}
4172 
4173 	err = target__parse_uid(&rec->opts.target);
4174 	if (err) {
4175 		int saved_errno = errno;
4176 
4177 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4178 		ui__error("%s", errbuf);
4179 
4180 		err = -saved_errno;
4181 		goto out;
4182 	}
4183 
4184 	/* Enable ignoring missing threads when -u/-p option is defined. */
4185 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4186 
4187 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4188 
4189 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4190 		arch__add_leaf_frame_record_opts(&rec->opts);
4191 
4192 	err = -ENOMEM;
4193 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4194 		if (rec->opts.target.pid != NULL) {
4195 			pr_err("Couldn't create thread/CPU maps: %s\n",
4196 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4197 			goto out;
4198 		}
4199 		else
4200 			usage_with_options(record_usage, record_options);
4201 	}
4202 
4203 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4204 	if (err)
4205 		goto out;
4206 
4207 	/*
4208 	 * We take all buildids when the file contains
4209 	 * AUX area tracing data because we do not decode the
4210 	 * trace, as decoding it would take too long.
4211 	 */
4212 	if (rec->opts.full_auxtrace)
4213 		rec->buildid_all = true;
4214 
4215 	if (rec->opts.text_poke) {
4216 		err = record__config_text_poke(rec->evlist);
4217 		if (err) {
4218 			pr_err("record__config_text_poke failed, error %d\n", err);
4219 			goto out;
4220 		}
4221 	}
4222 
4223 	if (rec->off_cpu) {
4224 		err = record__config_off_cpu(rec);
4225 		if (err) {
4226 			pr_err("record__config_off_cpu failed, error %d\n", err);
4227 			goto out;
4228 		}
4229 	}
4230 
4231 	if (record_opts__config(&rec->opts)) {
4232 		err = -EINVAL;
4233 		goto out;
4234 	}
4235 
4236 	err = record__config_tracking_events(rec);
4237 	if (err) {
4238 		pr_err("record__config_tracking_events failed, error %d\n", err);
4239 		goto out;
4240 	}
4241 
4242 	err = record__init_thread_masks(rec);
4243 	if (err) {
4244 		pr_err("Failed to initialize parallel data streaming masks\n");
4245 		goto out;
4246 	}
4247 
4248 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4249 		rec->opts.nr_cblocks = nr_cblocks_max;
4250 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4251 
4252 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4253 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4254 
4255 	if (rec->opts.comp_level > comp_level_max)
4256 		rec->opts.comp_level = comp_level_max;
4257 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4258 
4259 	err = __cmd_record(&record, argc, argv);
4260 out:
4261 	evlist__delete(rec->evlist);
4262 	symbol__exit();
4263 	auxtrace_record__free(rec->itr);
4264 out_opts:
4265 	record__free_thread_masks(rec, rec->nr_threads);
4266 	rec->nr_threads = 0;
4267 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4268 	return err;
4269 }
4270 
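/*
 * SIGUSR2 ends up here once __cmd_record() installs the handler: it arms the
 * AUX area snapshot trigger and, with --switch-output=signal, the output
 * rotation trigger.  Rough illustration (example only):
 *
 *   perf record --switch-output=signal -a &
 *   kill -USR2 %1    # close perf.data.<timestamp> and start a new file
 *
 * alarm_sig_handler() below serves --switch-output=<time>, driven by the
 * alarm() armed in cmd_record().
 */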
4271 static void snapshot_sig_handler(int sig __maybe_unused)
4272 {
4273 	struct record *rec = &record;
4274 
4275 	hit_auxtrace_snapshot_trigger(rec);
4276 
4277 	if (switch_output_signal(rec))
4278 		trigger_hit(&switch_output_trigger);
4279 }
4280 
4281 static void alarm_sig_handler(int sig __maybe_unused)
4282 {
4283 	struct record *rec = &record;
4284 
4285 	if (switch_output_time(rec))
4286 		trigger_hit(&switch_output_trigger);
4287 }
4288