xref: /linux/tools/perf/builtin-record.c (revision ee057c8c194b9283f4137b253b70e292693a39f0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/*
 * Settings and bookkeeping for switching the output file while recording.
 *
 * Within this file, @signal, @size and @time each gate one of the
 * switch_output_*() predicates; @size is additionally the threshold that
 * rec->bytes_written is compared against.
 */
struct switch_output {
	bool enabled;
	bool signal;		/* non-zero: switch_output_signal() may fire */
	unsigned long size;	/* non-zero: byte threshold for switch_output_size() */
	unsigned long time;	/* non-zero: switch_output_time() may fire */
	const char *str;
	bool set;
	char **filenames;	/* presumably the ring of generated file names - TODO confirm */
	int num_files;
	int cur_file;
};
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
/* Thread-local pointer to the record_thread state of the calling thread. */
static __thread struct record_thread *thread;
122 
/* Message identifiers; THREAD_MSG__MAX is the number of valid values. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name for every enum thread_msg value, indexed by the value. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
132 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of valid values. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Textual tag for every enum thread_spec value, indexed by the value. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
146 
/*
 * One association between a slot in the evlist pollfd array and the
 * matching slot in a thread's pollfd array.
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
#ifndef HAVE_GETTID
/* Fallback for C libraries without gettid(): issue the raw system call. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
/* Forward declarations: these are used before their definitions in this file. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
			     void *dst, size_t dst_size,
			     void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off. Queuing is retried for as long as
 * aio_write() fails with EAGAIN.
 *
 * Returns the last aio_write() result: 0 once queued, non-zero on any other
 * failure, in which case cblock->aio_fildes is reset to -1.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * remainder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
/* Argument bundle passed through perf_mmap__push() to record__aio_pushfn(). */
struct record_aio {
	struct record *rec;	/* owning record session */
	void *data;		/* destination buffer for this push */
	size_t size;		/* bytes placed in @data so far */
};
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the remainder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 						   mmap__mmap_len(map) - aio->size,
410 						   buf, size);
411 		if (compressed < 0)
412 			return (int)compressed;
413 
414 		size = compressed;
415 	} else {
416 		memcpy(aio->data + aio->size, buf, size);
417 	}
418 
419 	if (!aio->size) {
420 		/*
421 		 * Increment map->refcount to guard map->aio.data[] buffer
422 		 * from premature deallocation because map object can be
423 		 * released earlier than aio write request started on
424 		 * map->aio.data[] buffer is complete.
425 		 *
426 		 * perf_mmap__put() is done at record__aio_complete()
427 		 * after started aio request completion or at record__aio_push()
428 		 * if the request failed to start.
429 		 */
430 		perf_mmap__get(&map->core);
431 	}
432 
433 	aio->size += size;
434 
435 	return size;
436 }
437 
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440 	int ret, idx;
441 	int trace_fd = rec->session->data->file.fd;
442 	struct record_aio aio = { .rec = rec, .size = 0 };
443 
444 	/*
445 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
446 	 * becomes available after previous aio write operation.
447 	 */
448 
449 	idx = record__aio_sync(map, false);
450 	aio.data = map->aio.data[idx];
451 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 		return ret;
454 
455 	rec->samples++;
456 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457 	if (!ret) {
458 		*off += aio.size;
459 		rec->bytes_written += aio.size;
460 		if (switch_output_size(rec))
461 			trigger_hit(&switch_output_trigger);
462 	} else {
463 		/*
464 		 * Decrement map->refcount incremented in record__aio_pushfn()
465 		 * back if record__aio_write() operation failed to start, otherwise
466 		 * map->refcount is decremented in record__aio_complete() after
467 		 * aio write operation finishes successfully.
468 		 */
469 		perf_mmap__put(&map->core);
470 	}
471 
472 	return ret;
473 }
474 
/* Returns the current file offset of @trace_fd, or -1 if lseek() fails. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
479 
/* Moves the file offset of @trace_fd to @pos; a failing lseek() is ignored. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487 	int i;
488 	struct evlist *evlist = rec->evlist;
489 	struct mmap *maps = evlist->mmap;
490 
491 	if (!record__aio_enabled(rec))
492 		return;
493 
494 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
495 		struct mmap *map = &maps[i];
496 
497 		if (map->core.base)
498 			record__aio_sync(map, true);
499 	}
500 }
501 
/* Control block count used when the aio option is given without a value. */
static int nr_cblocks_default = 1;
/* Upper limit on the control block count; not enforced in this part of the file. */
static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506 			     const char *str,
507 			     int unset)
508 {
509 	struct record_opts *opts = (struct record_opts *)opt->value;
510 
511 	if (unset) {
512 		opts->nr_cblocks = 0;
513 	} else {
514 		if (str)
515 			opts->nr_cblocks = strtol(str, NULL, 0);
516 		if (!opts->nr_cblocks)
517 			opts->nr_cblocks = nr_cblocks_default;
518 	}
519 
520 	return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
/* Built without aio support: no control blocks are available. */
static int nr_cblocks_max = 0;
524 
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526 			    off_t *off __maybe_unused)
527 {
528 	return -1;
529 }
530 
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533 	return -1;
534 }
535 
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539 
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547 	return rec->opts.nr_cblocks > 0;
548 }
549 
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552 				    const char *str,
553 				    int unset)
554 {
555 	int flush_max;
556 	struct record_opts *opts = (struct record_opts *)opt->value;
557 	static struct parse_tag tags[] = {
558 			{ .tag  = 'B', .mult = 1       },
559 			{ .tag  = 'K', .mult = 1 << 10 },
560 			{ .tag  = 'M', .mult = 1 << 20 },
561 			{ .tag  = 'G', .mult = 1 << 30 },
562 			{ .tag  = 0 },
563 	};
564 
565 	if (unset)
566 		return 0;
567 
568 	if (str) {
569 		opts->mmap_flush = parse_tag_value(str, tags);
570 		if (opts->mmap_flush == (int)-1)
571 			opts->mmap_flush = strtol(str, NULL, 0);
572 	}
573 
574 	if (!opts->mmap_flush)
575 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577 	flush_max = evlist__mmap_size(opts->mmap_pages);
578 	flush_max /= 4;
579 	if (opts->mmap_flush > flush_max)
580 		opts->mmap_flush = flush_max;
581 
582 	return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590 	struct record_opts *opts = opt->value;
591 
592 	if (unset) {
593 		opts->comp_level = 0;
594 	} else {
595 		if (str)
596 			opts->comp_level = strtol(str, NULL, 0);
597 		if (!opts->comp_level)
598 			opts->comp_level = comp_level_default;
599 	}
600 
601 	return 0;
602 }
603 #endif
/* Highest compression level; not enforced in this part of the file. */
static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608 	return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	struct record *rec = container_of(tool, struct record, tool);
617 	return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623 				     union perf_event *event,
624 				     struct perf_sample *sample __maybe_unused,
625 				     struct machine *machine __maybe_unused)
626 {
627 	int ret;
628 
629 	mutex_lock(&synth_lock);
630 	ret = process_synthesized_event(tool, event, sample, machine);
631 	mutex_unlock(&synth_lock);
632 	return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637 	struct record *rec = to;
638 
639 	if (record__comp_enabled(rec)) {
640 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
641 						   mmap__mmap_len(map), bf, size);
642 
643 		if (compressed < 0)
644 			return (int)compressed;
645 
646 		size = compressed;
647 		bf   = map->data;
648 	}
649 
650 	thread->samples++;
651 	return record__write(rec, map, bf, size);
652 }
653 
/* Last non-SIGCHLD signal seen by sig_handler(); -1 while none was seen. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* eventfd written by sig_handler() to wake up poll(); -1 while unset. */
static volatile sig_atomic_t done_fd = -1;
#endif
659 
660 static void sig_handler(int sig)
661 {
662 	if (sig == SIGCHLD)
663 		child_finished = 1;
664 	else
665 		signr = sig;
666 
667 	done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 	if (done_fd >= 0) {
670 		u64 tmp = 1;
671 		int orig_errno = errno;
672 
673 		/*
674 		 * It is possible for this signal handler to run after done is
675 		 * checked in the main loop, but before the perf counter fds are
676 		 * polled. If this happens, the poll() will continue to wait
677 		 * even though done is set, and will only break out if either
678 		 * another signal is received, or the counters are ready for
679 		 * read. To ensure the poll() doesn't sleep when done is set,
680 		 * use an eventfd (done_fd) to wake up the poll().
681 		 */
682 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683 			pr_err("failed to signal wakeup fd, error: %m\n");
684 
685 		errno = orig_errno;
686 	}
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
/* SIGSEGV handler: run the perf-hooks recovery, then dump the stack. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
695 
696 static void record__sig_exit(void)
697 {
698 	if (signr == -1)
699 		return;
700 
701 	signal(signr, SIG_DFL);
702 	raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708 				    struct mmap *map,
709 				    union perf_event *event, void *data1,
710 				    size_t len1, void *data2, size_t len2)
711 {
712 	struct record *rec = container_of(tool, struct record, tool);
713 	struct perf_data *data = &rec->data;
714 	size_t padding;
715 	u8 pad[8] = {0};
716 
717 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718 		off_t file_offset;
719 		int fd = perf_data__fd(data);
720 		int err;
721 
722 		file_offset = lseek(fd, 0, SEEK_CUR);
723 		if (file_offset == -1)
724 			return -1;
725 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 						     event, file_offset);
727 		if (err)
728 			return err;
729 	}
730 
731 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732 	padding = (len1 + len2) & 7;
733 	if (padding)
734 		padding = 8 - padding;
735 
736 	record__write(rec, map, event, event->header.size);
737 	record__write(rec, map, data1, len1);
738 	if (len2)
739 		record__write(rec, map, data2, len2);
740 	record__write(rec, map, &pad, padding);
741 
742 	return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746 				      struct mmap *map)
747 {
748 	int ret;
749 
750 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751 				  record__process_auxtrace);
752 	if (ret < 0)
753 		return ret;
754 
755 	if (ret)
756 		rec->samples++;
757 
758 	return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762 					       struct mmap *map)
763 {
764 	int ret;
765 
766 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767 					   record__process_auxtrace,
768 					   rec->opts.auxtrace_snapshot_size);
769 	if (ret < 0)
770 		return ret;
771 
772 	if (ret)
773 		rec->samples++;
774 
775 	return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780 	int i;
781 	int rc = 0;
782 
783 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784 		struct mmap *map = &rec->evlist->mmap[i];
785 
786 		if (!map->auxtrace_mmap.base)
787 			continue;
788 
789 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790 			rc = -1;
791 			goto out;
792 		}
793 	}
794 out:
795 	return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800 	pr_debug("Recording AUX area tracing snapshot\n");
801 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
802 		trigger_error(&auxtrace_snapshot_trigger);
803 	} else {
804 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805 			trigger_error(&auxtrace_snapshot_trigger);
806 		else
807 			trigger_ready(&auxtrace_snapshot_trigger);
808 	}
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return 0;
815 
816 	if (!auxtrace_record__snapshot_started &&
817 	    auxtrace_record__snapshot_start(rec->itr))
818 		return -1;
819 
820 	record__read_auxtrace_snapshot(rec, true);
821 	if (trigger_is_error(&auxtrace_snapshot_trigger))
822 		return -1;
823 
824 	return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829 	int err;
830 
831 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832 	    && record__threads_enabled(rec)) {
833 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834 		return -EINVAL;
835 	}
836 
837 	if (!rec->itr) {
838 		rec->itr = auxtrace_record__init(rec->evlist, &err);
839 		if (err)
840 			return err;
841 	}
842 
843 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844 					      rec->opts.auxtrace_snapshot_opts);
845 	if (err)
846 		return err;
847 
848 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849 					    rec->opts.auxtrace_sample_opts);
850 	if (err)
851 		return err;
852 
853 	auxtrace_regroup_aux_output(rec->evlist);
854 
855 	return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862 			       struct mmap *map __maybe_unused)
863 {
864 	return 0;
865 }
866 
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869 				    bool on_exit __maybe_unused)
870 {
871 }
872 
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876 	return 0;
877 }
878 
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887 	return 0;
888 }
889 
890 #endif
891 
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If non-dummy evsel exists, system_wide sideband is need to
926 	 * help parse sample information.
927 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
928 	 * and PERF_EVENT_COMM event to help parse task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
948 	 * delay of waiting or event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 
953 		/*
954 		 * User space tasks can migrate between CPUs, so when tracing
955 		 * selected CPUs, sideband for all CPUs is still needed.
956 		 */
957 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958 			system_wide = true;
959 
960 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
961 		if (!evsel)
962 			return -ENOMEM;
963 
964 		/*
965 		 * Enable the tracking event when the process is forked for
966 		 * initial_delay, immediately for system wide.
967 		 */
968 		if (opts->target.initial_delay && !evsel->immediate &&
969 		    !target__has_cpu(&opts->target))
970 			evsel->core.attr.enable_on_exec = 1;
971 		else
972 			evsel->immediate = 1;
973 	}
974 
975 	return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980 	char kcore[PATH_MAX];
981 	int fd;
982 
983 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985 	fd = open(kcore, O_RDONLY);
986 	if (fd < 0)
987 		return false;
988 
989 	close(fd);
990 
991 	return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996 	char from_dir[PATH_MAX];
997 	char kcore_dir[PATH_MAX];
998 	int ret;
999 
1000 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003 	if (ret)
1004 		return ret;
1005 
1006 	return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011 	thread_data->pipes.msg[0] = -1;
1012 	thread_data->pipes.msg[1] = -1;
1013 	thread_data->pipes.ack[0] = -1;
1014 	thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019 	if (pipe(thread_data->pipes.msg))
1020 		return -EINVAL;
1021 
1022 	if (pipe(thread_data->pipes.ack)) {
1023 		close(thread_data->pipes.msg[0]);
1024 		thread_data->pipes.msg[0] = -1;
1025 		close(thread_data->pipes.msg[1]);
1026 		thread_data->pipes.msg[1] = -1;
1027 		return -EINVAL;
1028 	}
1029 
1030 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034 	return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039 	if (thread_data->pipes.msg[0] != -1) {
1040 		close(thread_data->pipes.msg[0]);
1041 		thread_data->pipes.msg[0] = -1;
1042 	}
1043 	if (thread_data->pipes.msg[1] != -1) {
1044 		close(thread_data->pipes.msg[1]);
1045 		thread_data->pipes.msg[1] = -1;
1046 	}
1047 	if (thread_data->pipes.ack[0] != -1) {
1048 		close(thread_data->pipes.ack[0]);
1049 		thread_data->pipes.ack[0] = -1;
1050 	}
1051 	if (thread_data->pipes.ack[1] != -1) {
1052 		close(thread_data->pipes.ack[1]);
1053 		thread_data->pipes.ack[1] = -1;
1054 	}
1055 }
1056 
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065 	struct mmap *mmap = evlist->mmap;
1066 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068 	bool per_thread = evlist__per_thread(evlist);
1069 
1070 	if (per_thread)
1071 		thread_data->nr_mmaps = nr_mmaps;
1072 	else
1073 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074 						      thread_data->mask->maps.nbits);
1075 	if (mmap) {
1076 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077 		if (!thread_data->maps)
1078 			return -ENOMEM;
1079 	}
1080 	if (overwrite_mmap) {
1081 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082 		if (!thread_data->overwrite_maps) {
1083 			zfree(&thread_data->maps);
1084 			return -ENOMEM;
1085 		}
1086 	}
1087 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091 		if (per_thread ||
1092 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093 			if (thread_data->maps) {
1094 				thread_data->maps[tm] = &mmap[m];
1095 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097 			}
1098 			if (thread_data->overwrite_maps) {
1099 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102 			}
1103 			tm++;
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112 	int f, tm, pos;
1113 	struct mmap *map, *overwrite_map;
1114 
1115 	fdarray__init(&thread_data->pollfd, 64);
1116 
1117 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119 		overwrite_map = thread_data->overwrite_maps ?
1120 				thread_data->overwrite_maps[tm] : NULL;
1121 
1122 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127 							      &evlist->core.pollfd);
1128 				if (pos < 0)
1129 					return pos;
1130 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132 			}
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141 	int t;
1142 	struct record_thread *thread_data = rec->thread_data;
1143 
1144 	if (thread_data == NULL)
1145 		return;
1146 
1147 	for (t = 0; t < rec->nr_threads; t++) {
1148 		record__thread_data_close_pipes(&thread_data[t]);
1149 		zfree(&thread_data[t].maps);
1150 		zfree(&thread_data[t].overwrite_maps);
1151 		fdarray__exit(&thread_data[t].pollfd);
1152 	}
1153 
1154 	zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158 						    int evlist_pollfd_index,
1159 						    int thread_pollfd_index)
1160 {
1161 	size_t x = rec->index_map_cnt;
1162 
1163 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164 		return -ENOMEM;
1165 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167 	rec->index_map_cnt += 1;
1168 	return 0;
1169 }
1170 
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172 						    struct evlist *evlist,
1173 						    struct record_thread *thread_data)
1174 {
1175 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1176 	struct pollfd *t_entries = thread_data->pollfd.entries;
1177 	int err = 0;
1178 	size_t i;
1179 
1180 	for (i = 0; i < rec->index_map_cnt; i++) {
1181 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1182 		int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1186 			pr_err("Thread and evlist pollfd index mismatch\n");
1187 			err = -EINVAL;
1188 			continue;
1189 		}
1190 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1191 	}
1192 	return err;
1193 }
1194 
1195 static int record__dup_non_perf_events(struct record *rec,
1196 				       struct evlist *evlist,
1197 				       struct record_thread *thread_data)
1198 {
1199 	struct fdarray *fda = &evlist->core.pollfd;
1200 	int i, ret;
1201 
1202 	for (i = 0; i < fda->nr; i++) {
1203 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204 			continue;
1205 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206 		if (ret < 0) {
1207 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208 			return ret;
1209 		}
1210 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211 			  thread_data, ret, fda->entries[i].fd);
1212 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213 		if (ret < 0) {
1214 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1215 			return ret;
1216 		}
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223 	int t, ret;
1224 	struct record_thread *thread_data;
1225 
1226 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227 	if (!rec->thread_data) {
1228 		pr_err("Failed to allocate thread data\n");
1229 		return -ENOMEM;
1230 	}
1231 	thread_data = rec->thread_data;
1232 
1233 	for (t = 0; t < rec->nr_threads; t++)
1234 		record__thread_data_init_pipes(&thread_data[t]);
1235 
1236 	for (t = 0; t < rec->nr_threads; t++) {
1237 		thread_data[t].rec = rec;
1238 		thread_data[t].mask = &rec->thread_masks[t];
1239 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240 		if (ret) {
1241 			pr_err("Failed to initialize thread[%d] maps\n", t);
1242 			goto out_free;
1243 		}
1244 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245 		if (ret) {
1246 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247 			goto out_free;
1248 		}
1249 		if (t) {
1250 			thread_data[t].tid = -1;
1251 			ret = record__thread_data_open_pipes(&thread_data[t]);
1252 			if (ret) {
1253 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1254 				goto out_free;
1255 			}
1256 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258 			if (ret < 0) {
1259 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260 				goto out_free;
1261 			}
1262 			thread_data[t].ctlfd_pos = ret;
1263 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264 				 thread_data, thread_data[t].ctlfd_pos,
1265 				 thread_data[t].pipes.msg[0]);
1266 		} else {
1267 			thread_data[t].tid = gettid();
1268 
1269 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270 			if (ret < 0)
1271 				goto out_free;
1272 
1273 			thread_data[t].ctlfd_pos = -1; /* Not used */
1274 		}
1275 	}
1276 
1277 	return 0;
1278 
1279 out_free:
1280 	record__free_thread_data(rec);
1281 
1282 	return ret;
1283 }
1284 
1285 static int record__mmap_evlist(struct record *rec,
1286 			       struct evlist *evlist)
1287 {
1288 	int i, ret;
1289 	struct record_opts *opts = &rec->opts;
1290 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291 				  opts->auxtrace_sample_mode;
1292 	char msg[512];
1293 
1294 	if (opts->affinity != PERF_AFFINITY_SYS)
1295 		cpu__setup_cpunode_map();
1296 
1297 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298 				 opts->auxtrace_mmap_pages,
1299 				 auxtrace_overwrite,
1300 				 opts->nr_cblocks, opts->affinity,
1301 				 opts->mmap_flush, opts->comp_level) < 0) {
1302 		if (errno == EPERM) {
1303 			pr_err("Permission error mapping pages.\n"
1304 			       "Consider increasing "
1305 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1307 			       "(current value: %u,%u)\n",
1308 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1309 			return -errno;
1310 		} else {
1311 			pr_err("failed to mmap with %d (%s)\n", errno,
1312 				str_error_r(errno, msg, sizeof(msg)));
1313 			if (errno)
1314 				return -errno;
1315 			else
1316 				return -EINVAL;
1317 		}
1318 	}
1319 
1320 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321 		return -1;
1322 
1323 	ret = record__alloc_thread_data(rec, evlist);
1324 	if (ret)
1325 		return ret;
1326 
1327 	if (record__threads_enabled(rec)) {
1328 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329 		if (ret) {
1330 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331 			return ret;
1332 		}
1333 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334 			if (evlist->mmap)
1335 				evlist->mmap[i].file = &rec->data.dir.files[i];
1336 			if (evlist->overwrite_mmap)
1337 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338 		}
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346 	return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
1349 static int record__open(struct record *rec)
1350 {
1351 	char msg[BUFSIZ];
1352 	struct evsel *pos;
1353 	struct evlist *evlist = rec->evlist;
1354 	struct perf_session *session = rec->session;
1355 	struct record_opts *opts = &rec->opts;
1356 	int rc = 0;
1357 
1358 	evlist__for_each_entry(evlist, pos) {
1359 try_again:
1360 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1361 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1362 				if (verbose > 0)
1363 					ui__warning("%s\n", msg);
1364 				goto try_again;
1365 			}
1366 			if ((errno == EINVAL || errno == EBADF) &&
1367 			    pos->core.leader != &pos->core &&
1368 			    pos->weak_group) {
1369 			        pos = evlist__reset_weak_group(evlist, pos, true);
1370 				goto try_again;
1371 			}
1372 			rc = -errno;
1373 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1374 			ui__error("%s\n", msg);
1375 			goto out;
1376 		}
1377 
1378 		pos->supported = true;
1379 	}
1380 
1381 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1382 		pr_warning(
1383 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1384 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1385 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1386 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1387 "Samples in kernel modules won't be resolved at all.\n\n"
1388 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1389 "even with a suitable vmlinux or kallsyms file.\n\n");
1390 	}
1391 
1392 	if (evlist__apply_filters(evlist, &pos)) {
1393 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1394 			pos->filter ?: "BPF", evsel__name(pos), errno,
1395 			str_error_r(errno, msg, sizeof(msg)));
1396 		rc = -1;
1397 		goto out;
1398 	}
1399 
1400 	rc = record__mmap(rec);
1401 	if (rc)
1402 		goto out;
1403 
1404 	session->evlist = evlist;
1405 	perf_session__set_id_hdr_size(session);
1406 out:
1407 	return rc;
1408 }
1409 
1410 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1411 {
1412 	if (rec->evlist->first_sample_time == 0)
1413 		rec->evlist->first_sample_time = sample_time;
1414 
1415 	if (sample_time)
1416 		rec->evlist->last_sample_time = sample_time;
1417 }
1418 
1419 static int process_sample_event(struct perf_tool *tool,
1420 				union perf_event *event,
1421 				struct perf_sample *sample,
1422 				struct evsel *evsel,
1423 				struct machine *machine)
1424 {
1425 	struct record *rec = container_of(tool, struct record, tool);
1426 
1427 	set_timestamp_boundary(rec, sample->time);
1428 
1429 	if (rec->buildid_all)
1430 		return 0;
1431 
1432 	rec->samples++;
1433 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1434 }
1435 
1436 static int process_buildids(struct record *rec)
1437 {
1438 	struct perf_session *session = rec->session;
1439 
1440 	if (perf_data__size(&rec->data) == 0)
1441 		return 0;
1442 
1443 	/*
1444 	 * During this process, it'll load kernel map and replace the
1445 	 * dso->long_name to a real pathname it found.  In this case
1446 	 * we prefer the vmlinux path like
1447 	 *   /lib/modules/3.16.4/build/vmlinux
1448 	 *
1449 	 * rather than build-id path (in debug directory).
1450 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1451 	 */
1452 	symbol_conf.ignore_vmlinux_buildid = true;
1453 
1454 	/*
1455 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1456 	 * so no need to process samples. But if timestamp_boundary is enabled,
1457 	 * it still needs to walk on all samples to get the timestamps of
1458 	 * first/last samples.
1459 	 */
1460 	if (rec->buildid_all && !rec->timestamp_boundary)
1461 		rec->tool.sample = NULL;
1462 
1463 	return perf_session__process_events(session);
1464 }
1465 
1466 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1467 {
1468 	int err;
1469 	struct perf_tool *tool = data;
1470 	/*
1471 	 *As for guest kernel when processing subcommand record&report,
1472 	 *we arrange module mmap prior to guest kernel mmap and trigger
1473 	 *a preload dso because default guest module symbols are loaded
1474 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1475 	 *method is used to avoid symbol missing when the first addr is
1476 	 *in module instead of in guest kernel.
1477 	 */
1478 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1479 					     machine);
1480 	if (err < 0)
1481 		pr_err("Couldn't record guest kernel [%d]'s reference"
1482 		       " relocation symbol.\n", machine->pid);
1483 
1484 	/*
1485 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1486 	 * have no _text sometimes.
1487 	 */
1488 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1489 						 machine);
1490 	if (err < 0)
1491 		pr_err("Couldn't record guest kernel [%d]'s reference"
1492 		       " relocation symbol.\n", machine->pid);
1493 }
1494 
1495 static struct perf_event_header finished_round_event = {
1496 	.size = sizeof(struct perf_event_header),
1497 	.type = PERF_RECORD_FINISHED_ROUND,
1498 };
1499 
1500 static struct perf_event_header finished_init_event = {
1501 	.size = sizeof(struct perf_event_header),
1502 	.type = PERF_RECORD_FINISHED_INIT,
1503 };
1504 
1505 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1506 {
1507 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1508 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1509 			  thread->mask->affinity.nbits)) {
1510 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1511 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1512 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1513 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1514 					(cpu_set_t *)thread->mask->affinity.bits);
1515 		if (verbose == 2) {
1516 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1517 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1518 		}
1519 	}
1520 }
1521 
1522 static size_t process_comp_header(void *record, size_t increment)
1523 {
1524 	struct perf_record_compressed *event = record;
1525 	size_t size = sizeof(*event);
1526 
1527 	if (increment) {
1528 		event->header.size += increment;
1529 		return increment;
1530 	}
1531 
1532 	event->header.type = PERF_RECORD_COMPRESSED;
1533 	event->header.size = size;
1534 
1535 	return size;
1536 }
1537 
1538 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1539 			    void *dst, size_t dst_size, void *src, size_t src_size)
1540 {
1541 	ssize_t compressed;
1542 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1543 	struct zstd_data *zstd_data = &session->zstd_data;
1544 
1545 	if (map && map->file)
1546 		zstd_data = &map->zstd_data;
1547 
1548 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1549 						     max_record_size, process_comp_header);
1550 	if (compressed < 0)
1551 		return compressed;
1552 
1553 	if (map && map->file) {
1554 		thread->bytes_transferred += src_size;
1555 		thread->bytes_compressed  += compressed;
1556 	} else {
1557 		session->bytes_transferred += src_size;
1558 		session->bytes_compressed  += compressed;
1559 	}
1560 
1561 	return compressed;
1562 }
1563 
1564 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1565 				    bool overwrite, bool synch)
1566 {
1567 	u64 bytes_written = rec->bytes_written;
1568 	int i;
1569 	int rc = 0;
1570 	int nr_mmaps;
1571 	struct mmap **maps;
1572 	int trace_fd = rec->data.file.fd;
1573 	off_t off = 0;
1574 
1575 	if (!evlist)
1576 		return 0;
1577 
1578 	nr_mmaps = thread->nr_mmaps;
1579 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1580 
1581 	if (!maps)
1582 		return 0;
1583 
1584 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1585 		return 0;
1586 
1587 	if (record__aio_enabled(rec))
1588 		off = record__aio_get_pos(trace_fd);
1589 
1590 	for (i = 0; i < nr_mmaps; i++) {
1591 		u64 flush = 0;
1592 		struct mmap *map = maps[i];
1593 
1594 		if (map->core.base) {
1595 			record__adjust_affinity(rec, map);
1596 			if (synch) {
1597 				flush = map->core.flush;
1598 				map->core.flush = 1;
1599 			}
1600 			if (!record__aio_enabled(rec)) {
1601 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1602 					if (synch)
1603 						map->core.flush = flush;
1604 					rc = -1;
1605 					goto out;
1606 				}
1607 			} else {
1608 				if (record__aio_push(rec, map, &off) < 0) {
1609 					record__aio_set_pos(trace_fd, off);
1610 					if (synch)
1611 						map->core.flush = flush;
1612 					rc = -1;
1613 					goto out;
1614 				}
1615 			}
1616 			if (synch)
1617 				map->core.flush = flush;
1618 		}
1619 
1620 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1621 		    !rec->opts.auxtrace_sample_mode &&
1622 		    record__auxtrace_mmap_read(rec, map) != 0) {
1623 			rc = -1;
1624 			goto out;
1625 		}
1626 	}
1627 
1628 	if (record__aio_enabled(rec))
1629 		record__aio_set_pos(trace_fd, off);
1630 
1631 	/*
1632 	 * Mark the round finished in case we wrote
1633 	 * at least one event.
1634 	 *
1635 	 * No need for round events in directory mode,
1636 	 * because per-cpu maps and files have data
1637 	 * sorted by kernel.
1638 	 */
1639 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1640 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1641 
1642 	if (overwrite)
1643 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1644 out:
1645 	return rc;
1646 }
1647 
1648 static int record__mmap_read_all(struct record *rec, bool synch)
1649 {
1650 	int err;
1651 
1652 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1653 	if (err)
1654 		return err;
1655 
1656 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1657 }
1658 
1659 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1660 					   void *arg __maybe_unused)
1661 {
1662 	struct perf_mmap *map = fda->priv[fd].ptr;
1663 
1664 	if (map)
1665 		perf_mmap__put(map);
1666 }
1667 
1668 static void *record__thread(void *arg)
1669 {
1670 	enum thread_msg msg = THREAD_MSG__READY;
1671 	bool terminate = false;
1672 	struct fdarray *pollfd;
1673 	int err, ctlfd_pos;
1674 
1675 	thread = arg;
1676 	thread->tid = gettid();
1677 
1678 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1679 	if (err == -1)
1680 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1681 			   thread->tid, strerror(errno));
1682 
1683 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1684 
1685 	pollfd = &thread->pollfd;
1686 	ctlfd_pos = thread->ctlfd_pos;
1687 
1688 	for (;;) {
1689 		unsigned long long hits = thread->samples;
1690 
1691 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1692 			break;
1693 
1694 		if (hits == thread->samples) {
1695 
1696 			err = fdarray__poll(pollfd, -1);
1697 			/*
1698 			 * Propagate error, only if there's any. Ignore positive
1699 			 * number of returned events and interrupt error.
1700 			 */
1701 			if (err > 0 || (err < 0 && errno == EINTR))
1702 				err = 0;
1703 			thread->waking++;
1704 
1705 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1706 					    record__thread_munmap_filtered, NULL) == 0)
1707 				break;
1708 		}
1709 
1710 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1711 			terminate = true;
1712 			close(thread->pipes.msg[0]);
1713 			thread->pipes.msg[0] = -1;
1714 			pollfd->entries[ctlfd_pos].fd = -1;
1715 			pollfd->entries[ctlfd_pos].events = 0;
1716 		}
1717 
1718 		pollfd->entries[ctlfd_pos].revents = 0;
1719 	}
1720 	record__mmap_read_all(thread->rec, true);
1721 
1722 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1723 	if (err == -1)
1724 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1725 			   thread->tid, strerror(errno));
1726 
1727 	return NULL;
1728 }
1729 
1730 static void record__init_features(struct record *rec)
1731 {
1732 	struct perf_session *session = rec->session;
1733 	int feat;
1734 
1735 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1736 		perf_header__set_feat(&session->header, feat);
1737 
1738 	if (rec->no_buildid)
1739 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1740 
1741 #ifdef HAVE_LIBTRACEEVENT
1742 	if (!have_tracepoints(&rec->evlist->core.entries))
1743 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1744 #endif
1745 
1746 	if (!rec->opts.branch_stack)
1747 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1748 
1749 	if (!rec->opts.full_auxtrace)
1750 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1751 
1752 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1753 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1754 
1755 	if (!rec->opts.use_clockid)
1756 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1757 
1758 	if (!record__threads_enabled(rec))
1759 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1760 
1761 	if (!record__comp_enabled(rec))
1762 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1763 
1764 	perf_header__clear_feat(&session->header, HEADER_STAT);
1765 }
1766 
1767 static void
1768 record__finish_output(struct record *rec)
1769 {
1770 	int i;
1771 	struct perf_data *data = &rec->data;
1772 	int fd = perf_data__fd(data);
1773 
1774 	if (data->is_pipe) {
1775 		/* Just to display approx. size */
1776 		data->file.size = rec->bytes_written;
1777 		return;
1778 	}
1779 
1780 	rec->session->header.data_size += rec->bytes_written;
1781 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1782 	if (record__threads_enabled(rec)) {
1783 		for (i = 0; i < data->dir.nr; i++)
1784 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1785 	}
1786 
1787 	if (!rec->no_buildid) {
1788 		process_buildids(rec);
1789 
1790 		if (rec->buildid_all)
1791 			perf_session__dsos_hit_all(rec->session);
1792 	}
1793 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1794 
1795 	return;
1796 }
1797 
1798 static int record__synthesize_workload(struct record *rec, bool tail)
1799 {
1800 	int err;
1801 	struct perf_thread_map *thread_map;
1802 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1803 
1804 	if (rec->opts.tail_synthesize != tail)
1805 		return 0;
1806 
1807 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1808 	if (thread_map == NULL)
1809 		return -1;
1810 
1811 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1812 						 process_synthesized_event,
1813 						 &rec->session->machines.host,
1814 						 needs_mmap,
1815 						 rec->opts.sample_address);
1816 	perf_thread_map__put(thread_map);
1817 	return err;
1818 }
1819 
1820 static int write_finished_init(struct record *rec, bool tail)
1821 {
1822 	if (rec->opts.tail_synthesize != tail)
1823 		return 0;
1824 
1825 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1826 }
1827 
1828 static int record__synthesize(struct record *rec, bool tail);
1829 
1830 static int
1831 record__switch_output(struct record *rec, bool at_exit)
1832 {
1833 	struct perf_data *data = &rec->data;
1834 	char *new_filename = NULL;
1835 	int fd, err;
1836 
1837 	/* Same Size:      "2015122520103046"*/
1838 	char timestamp[] = "InvalidTimestamp";
1839 
1840 	record__aio_mmap_read_sync(rec);
1841 
1842 	write_finished_init(rec, true);
1843 
1844 	record__synthesize(rec, true);
1845 	if (target__none(&rec->opts.target))
1846 		record__synthesize_workload(rec, true);
1847 
1848 	rec->samples = 0;
1849 	record__finish_output(rec);
1850 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1851 	if (err) {
1852 		pr_err("Failed to get current timestamp\n");
1853 		return -EINVAL;
1854 	}
1855 
1856 	fd = perf_data__switch(data, timestamp,
1857 			       rec->session->header.data_offset,
1858 			       at_exit, &new_filename);
1859 	if (fd >= 0 && !at_exit) {
1860 		rec->bytes_written = 0;
1861 		rec->session->header.data_size = 0;
1862 	}
1863 
1864 	if (!quiet) {
1865 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1866 			data->path, timestamp);
1867 	}
1868 
1869 	if (rec->switch_output.num_files) {
1870 		int n = rec->switch_output.cur_file + 1;
1871 
1872 		if (n >= rec->switch_output.num_files)
1873 			n = 0;
1874 		rec->switch_output.cur_file = n;
1875 		if (rec->switch_output.filenames[n]) {
1876 			remove(rec->switch_output.filenames[n]);
1877 			zfree(&rec->switch_output.filenames[n]);
1878 		}
1879 		rec->switch_output.filenames[n] = new_filename;
1880 	} else {
1881 		free(new_filename);
1882 	}
1883 
1884 	/* Output tracking events */
1885 	if (!at_exit) {
1886 		record__synthesize(rec, false);
1887 
1888 		/*
1889 		 * In 'perf record --switch-output' without -a,
1890 		 * record__synthesize() in record__switch_output() won't
1891 		 * generate tracking events because there's no thread_map
1892 		 * in evlist. Which causes newly created perf.data doesn't
1893 		 * contain map and comm information.
1894 		 * Create a fake thread_map and directly call
1895 		 * perf_event__synthesize_thread_map() for those events.
1896 		 */
1897 		if (target__none(&rec->opts.target))
1898 			record__synthesize_workload(rec, false);
1899 		write_finished_init(rec, false);
1900 	}
1901 	return fd;
1902 }
1903 
1904 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1905 					struct perf_record_lost_samples *lost,
1906 					int cpu_idx, int thread_idx, u64 lost_count,
1907 					u16 misc_flag)
1908 {
1909 	struct perf_sample_id *sid;
1910 	struct perf_sample sample = {};
1911 	int id_hdr_size;
1912 
1913 	lost->lost = lost_count;
1914 	if (evsel->core.ids) {
1915 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1916 		sample.id = sid->id;
1917 	}
1918 
1919 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1920 						       evsel->core.attr.sample_type, &sample);
1921 	lost->header.size = sizeof(*lost) + id_hdr_size;
1922 	lost->header.misc = misc_flag;
1923 	record__write(rec, NULL, lost, lost->header.size);
1924 }
1925 
1926 static void record__read_lost_samples(struct record *rec)
1927 {
1928 	struct perf_session *session = rec->session;
1929 	struct perf_record_lost_samples_and_ids lost;
1930 	struct evsel *evsel;
1931 
1932 	/* there was an error during record__open */
1933 	if (session->evlist == NULL)
1934 		return;
1935 
1936 	evlist__for_each_entry(session->evlist, evsel) {
1937 		struct xyarray *xy = evsel->core.sample_id;
1938 		u64 lost_count;
1939 
1940 		if (xy == NULL || evsel->core.fd == NULL)
1941 			continue;
1942 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1943 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1944 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1945 			continue;
1946 		}
1947 
1948 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1949 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1950 				struct perf_counts_values count;
1951 
1952 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1953 					pr_debug("read LOST count failed\n");
1954 					return;
1955 				}
1956 
1957 				if (count.lost) {
1958 					memset(&lost, 0, sizeof(lost));
1959 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1960 					__record__save_lost_samples(rec, evsel, &lost.lost,
1961 								    x, y, count.lost, 0);
1962 				}
1963 			}
1964 		}
1965 
1966 		lost_count = perf_bpf_filter__lost_count(evsel);
1967 		if (lost_count) {
1968 			memset(&lost, 0, sizeof(lost));
1969 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1970 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
1971 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1972 		}
1973 	}
1974 }
1975 
1976 static volatile sig_atomic_t workload_exec_errno;
1977 
1978 /*
1979  * evlist__prepare_workload will send a SIGUSR1
1980  * if the fork fails, since we asked by setting its
1981  * want_signal to true.
1982  */
1983 static void workload_exec_failed_signal(int signo __maybe_unused,
1984 					siginfo_t *info,
1985 					void *ucontext __maybe_unused)
1986 {
1987 	workload_exec_errno = info->si_value.sival_int;
1988 	done = 1;
1989 	child_finished = 1;
1990 }
1991 
1992 static void snapshot_sig_handler(int sig);
1993 static void alarm_sig_handler(int sig);
1994 
1995 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1996 {
1997 	if (evlist) {
1998 		if (evlist->mmap && evlist->mmap[0].core.base)
1999 			return evlist->mmap[0].core.base;
2000 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2001 			return evlist->overwrite_mmap[0].core.base;
2002 	}
2003 	return NULL;
2004 }
2005 
2006 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2007 {
2008 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2009 	if (pc)
2010 		return pc;
2011 	return NULL;
2012 }
2013 
2014 static int record__synthesize(struct record *rec, bool tail)
2015 {
2016 	struct perf_session *session = rec->session;
2017 	struct machine *machine = &session->machines.host;
2018 	struct perf_data *data = &rec->data;
2019 	struct record_opts *opts = &rec->opts;
2020 	struct perf_tool *tool = &rec->tool;
2021 	int err = 0;
2022 	event_op f = process_synthesized_event;
2023 
2024 	if (rec->opts.tail_synthesize != tail)
2025 		return 0;
2026 
2027 	if (data->is_pipe) {
2028 		err = perf_event__synthesize_for_pipe(tool, session, data,
2029 						      process_synthesized_event);
2030 		if (err < 0)
2031 			goto out;
2032 
2033 		rec->bytes_written += err;
2034 	}
2035 
2036 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2037 					  process_synthesized_event, machine);
2038 	if (err)
2039 		goto out;
2040 
2041 	/* Synthesize id_index before auxtrace_info */
2042 	err = perf_event__synthesize_id_index(tool,
2043 					      process_synthesized_event,
2044 					      session->evlist, machine);
2045 	if (err)
2046 		goto out;
2047 
2048 	if (rec->opts.full_auxtrace) {
2049 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2050 					session, process_synthesized_event);
2051 		if (err)
2052 			goto out;
2053 	}
2054 
2055 	if (!evlist__exclude_kernel(rec->evlist)) {
2056 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2057 							 machine);
2058 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2059 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2060 				   "Check /proc/kallsyms permission or run as root.\n");
2061 
2062 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2063 						     machine);
2064 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2065 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2066 				   "Check /proc/modules permission or run as root.\n");
2067 	}
2068 
2069 	if (perf_guest) {
2070 		machines__process_guests(&session->machines,
2071 					 perf_event__synthesize_guest_os, tool);
2072 	}
2073 
2074 	err = perf_event__synthesize_extra_attr(&rec->tool,
2075 						rec->evlist,
2076 						process_synthesized_event,
2077 						data->is_pipe);
2078 	if (err)
2079 		goto out;
2080 
2081 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2082 						 process_synthesized_event,
2083 						NULL);
2084 	if (err < 0) {
2085 		pr_err("Couldn't synthesize thread map.\n");
2086 		return err;
2087 	}
2088 
2089 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2090 					     process_synthesized_event, NULL);
2091 	if (err < 0) {
2092 		pr_err("Couldn't synthesize cpu map.\n");
2093 		return err;
2094 	}
2095 
2096 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2097 						machine, opts);
2098 	if (err < 0) {
2099 		pr_warning("Couldn't synthesize bpf events.\n");
2100 		err = 0;
2101 	}
2102 
2103 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2104 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2105 						     machine);
2106 		if (err < 0) {
2107 			pr_warning("Couldn't synthesize cgroup events.\n");
2108 			err = 0;
2109 		}
2110 	}
2111 
2112 	if (rec->opts.nr_threads_synthesize > 1) {
2113 		mutex_init(&synth_lock);
2114 		perf_set_multithreaded();
2115 		f = process_locked_synthesized_event;
2116 	}
2117 
2118 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2119 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2120 
2121 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2122 						    rec->evlist->core.threads,
2123 						    f, needs_mmap, opts->sample_address,
2124 						    rec->opts.nr_threads_synthesize);
2125 	}
2126 
2127 	if (rec->opts.nr_threads_synthesize > 1) {
2128 		perf_set_singlethreaded();
2129 		mutex_destroy(&synth_lock);
2130 	}
2131 
2132 out:
2133 	return err;
2134 }
2135 
2136 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2137 {
2138 	struct record *rec = data;
2139 	pthread_kill(rec->thread_id, SIGUSR2);
2140 	return 0;
2141 }
2142 
2143 static int record__setup_sb_evlist(struct record *rec)
2144 {
2145 	struct record_opts *opts = &rec->opts;
2146 
2147 	if (rec->sb_evlist != NULL) {
2148 		/*
2149 		 * We get here if --switch-output-event populated the
2150 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2151 		 * to the main thread.
2152 		 */
2153 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2154 		rec->thread_id = pthread_self();
2155 	}
2156 #ifdef HAVE_LIBBPF_SUPPORT
2157 	if (!opts->no_bpf_event) {
2158 		if (rec->sb_evlist == NULL) {
2159 			rec->sb_evlist = evlist__new();
2160 
2161 			if (rec->sb_evlist == NULL) {
2162 				pr_err("Couldn't create side band evlist.\n.");
2163 				return -1;
2164 			}
2165 		}
2166 
2167 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2168 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2169 			return -1;
2170 		}
2171 	}
2172 #endif
2173 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2174 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2175 		opts->no_bpf_event = true;
2176 	}
2177 
2178 	return 0;
2179 }
2180 
2181 static int record__init_clock(struct record *rec)
2182 {
2183 	struct perf_session *session = rec->session;
2184 	struct timespec ref_clockid;
2185 	struct timeval ref_tod;
2186 	u64 ref;
2187 
2188 	if (!rec->opts.use_clockid)
2189 		return 0;
2190 
2191 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2192 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2193 
2194 	session->header.env.clock.clockid = rec->opts.clockid;
2195 
2196 	if (gettimeofday(&ref_tod, NULL) != 0) {
2197 		pr_err("gettimeofday failed, cannot set reference time.\n");
2198 		return -1;
2199 	}
2200 
2201 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2202 		pr_err("clock_gettime failed, cannot set reference time.\n");
2203 		return -1;
2204 	}
2205 
2206 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2207 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2208 
2209 	session->header.env.clock.tod_ns = ref;
2210 
2211 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2212 	      (u64) ref_clockid.tv_nsec;
2213 
2214 	session->header.env.clock.clockid_ns = ref;
2215 	return 0;
2216 }
2217 
2218 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2219 {
2220 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2221 		trigger_hit(&auxtrace_snapshot_trigger);
2222 		auxtrace_record__snapshot_started = 1;
2223 		if (auxtrace_record__snapshot_start(rec->itr))
2224 			trigger_error(&auxtrace_snapshot_trigger);
2225 	}
2226 }
2227 
2228 static int record__terminate_thread(struct record_thread *thread_data)
2229 {
2230 	int err;
2231 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2232 	pid_t tid = thread_data->tid;
2233 
2234 	close(thread_data->pipes.msg[1]);
2235 	thread_data->pipes.msg[1] = -1;
2236 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2237 	if (err > 0)
2238 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2239 	else
2240 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2241 			   thread->tid, tid);
2242 
2243 	return 0;
2244 }
2245 
2246 static int record__start_threads(struct record *rec)
2247 {
2248 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2249 	struct record_thread *thread_data = rec->thread_data;
2250 	sigset_t full, mask;
2251 	pthread_t handle;
2252 	pthread_attr_t attrs;
2253 
2254 	thread = &thread_data[0];
2255 
2256 	if (!record__threads_enabled(rec))
2257 		return 0;
2258 
2259 	sigfillset(&full);
2260 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2261 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2262 		return -1;
2263 	}
2264 
2265 	pthread_attr_init(&attrs);
2266 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2267 
2268 	for (t = 1; t < nr_threads; t++) {
2269 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2270 
2271 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2272 		pthread_attr_setaffinity_np(&attrs,
2273 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2274 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2275 #endif
2276 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2277 			for (tt = 1; tt < t; tt++)
2278 				record__terminate_thread(&thread_data[t]);
2279 			pr_err("Failed to start threads: %s\n", strerror(errno));
2280 			ret = -1;
2281 			goto out_err;
2282 		}
2283 
2284 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2285 		if (err > 0)
2286 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2287 				  thread_msg_tags[msg]);
2288 		else
2289 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2290 				   thread->tid, rec->thread_data[t].tid);
2291 	}
2292 
2293 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2294 			(cpu_set_t *)thread->mask->affinity.bits);
2295 
2296 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2297 
2298 out_err:
2299 	pthread_attr_destroy(&attrs);
2300 
2301 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2302 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2303 		ret = -1;
2304 	}
2305 
2306 	return ret;
2307 }
2308 
2309 static int record__stop_threads(struct record *rec)
2310 {
2311 	int t;
2312 	struct record_thread *thread_data = rec->thread_data;
2313 
2314 	for (t = 1; t < rec->nr_threads; t++)
2315 		record__terminate_thread(&thread_data[t]);
2316 
2317 	for (t = 0; t < rec->nr_threads; t++) {
2318 		rec->samples += thread_data[t].samples;
2319 		if (!record__threads_enabled(rec))
2320 			continue;
2321 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2322 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2323 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2324 			 thread_data[t].samples, thread_data[t].waking);
2325 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2326 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2327 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2328 		else
2329 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2330 	}
2331 
2332 	return 0;
2333 }
2334 
2335 static unsigned long record__waking(struct record *rec)
2336 {
2337 	int t;
2338 	unsigned long waking = 0;
2339 	struct record_thread *thread_data = rec->thread_data;
2340 
2341 	for (t = 0; t < rec->nr_threads; t++)
2342 		waking += thread_data[t].waking;
2343 
2344 	return waking;
2345 }
2346 
2347 static int __cmd_record(struct record *rec, int argc, const char **argv)
2348 {
2349 	int err;
2350 	int status = 0;
2351 	const bool forks = argc > 0;
2352 	struct perf_tool *tool = &rec->tool;
2353 	struct record_opts *opts = &rec->opts;
2354 	struct perf_data *data = &rec->data;
2355 	struct perf_session *session;
2356 	bool disabled = false, draining = false;
2357 	int fd;
2358 	float ratio = 0;
2359 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2360 
2361 	atexit(record__sig_exit);
2362 	signal(SIGCHLD, sig_handler);
2363 	signal(SIGINT, sig_handler);
2364 	signal(SIGTERM, sig_handler);
2365 	signal(SIGSEGV, sigsegv_handler);
2366 
2367 	if (rec->opts.record_namespaces)
2368 		tool->namespace_events = true;
2369 
2370 	if (rec->opts.record_cgroup) {
2371 #ifdef HAVE_FILE_HANDLE
2372 		tool->cgroup_events = true;
2373 #else
2374 		pr_err("cgroup tracking is not supported\n");
2375 		return -1;
2376 #endif
2377 	}
2378 
2379 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2380 		signal(SIGUSR2, snapshot_sig_handler);
2381 		if (rec->opts.auxtrace_snapshot_mode)
2382 			trigger_on(&auxtrace_snapshot_trigger);
2383 		if (rec->switch_output.enabled)
2384 			trigger_on(&switch_output_trigger);
2385 	} else {
2386 		signal(SIGUSR2, SIG_IGN);
2387 	}
2388 
2389 	session = perf_session__new(data, tool);
2390 	if (IS_ERR(session)) {
2391 		pr_err("Perf session creation failed.\n");
2392 		return PTR_ERR(session);
2393 	}
2394 
2395 	if (record__threads_enabled(rec)) {
2396 		if (perf_data__is_pipe(&rec->data)) {
2397 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2398 			return -1;
2399 		}
2400 		if (rec->opts.full_auxtrace) {
2401 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2402 			return -1;
2403 		}
2404 	}
2405 
2406 	fd = perf_data__fd(data);
2407 	rec->session = session;
2408 
2409 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2410 		pr_err("Compression initialization failed.\n");
2411 		return -1;
2412 	}
2413 #ifdef HAVE_EVENTFD_SUPPORT
2414 	done_fd = eventfd(0, EFD_NONBLOCK);
2415 	if (done_fd < 0) {
2416 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2417 		status = -1;
2418 		goto out_delete_session;
2419 	}
2420 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2421 	if (err < 0) {
2422 		pr_err("Failed to add wakeup eventfd to poll list\n");
2423 		status = err;
2424 		goto out_delete_session;
2425 	}
2426 #endif // HAVE_EVENTFD_SUPPORT
2427 
2428 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2429 	session->header.env.comp_level = rec->opts.comp_level;
2430 
2431 	if (rec->opts.kcore &&
2432 	    !record__kcore_readable(&session->machines.host)) {
2433 		pr_err("ERROR: kcore is not readable.\n");
2434 		return -1;
2435 	}
2436 
2437 	if (record__init_clock(rec))
2438 		return -1;
2439 
2440 	record__init_features(rec);
2441 
2442 	if (forks) {
2443 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2444 					       workload_exec_failed_signal);
2445 		if (err < 0) {
2446 			pr_err("Couldn't run the workload!\n");
2447 			status = err;
2448 			goto out_delete_session;
2449 		}
2450 	}
2451 
2452 	/*
2453 	 * If we have just single event and are sending data
2454 	 * through pipe, we need to force the ids allocation,
2455 	 * because we synthesize event name through the pipe
2456 	 * and need the id for that.
2457 	 */
2458 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2459 		rec->opts.sample_id = true;
2460 
2461 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2462 		rec->timestamp_filename = false;
2463 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2464 	}
2465 
2466 	evlist__uniquify_name(rec->evlist);
2467 
2468 	evlist__config(rec->evlist, opts, &callchain_param);
2469 
2470 	/* Debug message used by test scripts */
2471 	pr_debug3("perf record opening and mmapping events\n");
2472 	if (record__open(rec) != 0) {
2473 		err = -1;
2474 		goto out_free_threads;
2475 	}
2476 	/* Debug message used by test scripts */
2477 	pr_debug3("perf record done opening and mmapping events\n");
2478 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2479 
2480 	if (rec->opts.kcore) {
2481 		err = record__kcore_copy(&session->machines.host, data);
2482 		if (err) {
2483 			pr_err("ERROR: Failed to copy kcore\n");
2484 			goto out_free_threads;
2485 		}
2486 	}
2487 
2488 	/*
2489 	 * Normally perf_session__new would do this, but it doesn't have the
2490 	 * evlist.
2491 	 */
2492 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2493 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2494 		rec->tool.ordered_events = false;
2495 	}
2496 
2497 	if (evlist__nr_groups(rec->evlist) == 0)
2498 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2499 
2500 	if (data->is_pipe) {
2501 		err = perf_header__write_pipe(fd);
2502 		if (err < 0)
2503 			goto out_free_threads;
2504 	} else {
2505 		err = perf_session__write_header(session, rec->evlist, fd, false);
2506 		if (err < 0)
2507 			goto out_free_threads;
2508 	}
2509 
2510 	err = -1;
2511 	if (!rec->no_buildid
2512 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2513 		pr_err("Couldn't generate buildids. "
2514 		       "Use --no-buildid to profile anyway.\n");
2515 		goto out_free_threads;
2516 	}
2517 
2518 	err = record__setup_sb_evlist(rec);
2519 	if (err)
2520 		goto out_free_threads;
2521 
2522 	err = record__synthesize(rec, false);
2523 	if (err < 0)
2524 		goto out_free_threads;
2525 
2526 	if (rec->realtime_prio) {
2527 		struct sched_param param;
2528 
2529 		param.sched_priority = rec->realtime_prio;
2530 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2531 			pr_err("Could not set realtime priority.\n");
2532 			err = -1;
2533 			goto out_free_threads;
2534 		}
2535 	}
2536 
2537 	if (record__start_threads(rec))
2538 		goto out_free_threads;
2539 
2540 	/*
2541 	 * When perf is starting the traced process, all the events
2542 	 * (apart from group members) have enable_on_exec=1 set,
2543 	 * so don't spoil it by prematurely enabling them.
2544 	 */
2545 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2546 		evlist__enable(rec->evlist);
2547 
2548 	/*
2549 	 * Let the child rip
2550 	 */
2551 	if (forks) {
2552 		struct machine *machine = &session->machines.host;
2553 		union perf_event *event;
2554 		pid_t tgid;
2555 
2556 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2557 		if (event == NULL) {
2558 			err = -ENOMEM;
2559 			goto out_child;
2560 		}
2561 
2562 		/*
2563 		 * Some H/W events are generated before COMM event
2564 		 * which is emitted during exec(), so perf script
2565 		 * cannot see a correct process name for those events.
2566 		 * Synthesize COMM event to prevent it.
2567 		 */
2568 		tgid = perf_event__synthesize_comm(tool, event,
2569 						   rec->evlist->workload.pid,
2570 						   process_synthesized_event,
2571 						   machine);
2572 		free(event);
2573 
2574 		if (tgid == -1)
2575 			goto out_child;
2576 
2577 		event = malloc(sizeof(event->namespaces) +
2578 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2579 			       machine->id_hdr_size);
2580 		if (event == NULL) {
2581 			err = -ENOMEM;
2582 			goto out_child;
2583 		}
2584 
2585 		/*
2586 		 * Synthesize NAMESPACES event for the command specified.
2587 		 */
2588 		perf_event__synthesize_namespaces(tool, event,
2589 						  rec->evlist->workload.pid,
2590 						  tgid, process_synthesized_event,
2591 						  machine);
2592 		free(event);
2593 
2594 		evlist__start_workload(rec->evlist);
2595 	}
2596 
2597 	if (opts->target.initial_delay) {
2598 		pr_info(EVLIST_DISABLED_MSG);
2599 		if (opts->target.initial_delay > 0) {
2600 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2601 			evlist__enable(rec->evlist);
2602 			pr_info(EVLIST_ENABLED_MSG);
2603 		}
2604 	}
2605 
2606 	err = event_enable_timer__start(rec->evlist->eet);
2607 	if (err)
2608 		goto out_child;
2609 
2610 	/* Debug message used by test scripts */
2611 	pr_debug3("perf record has started\n");
2612 	fflush(stderr);
2613 
2614 	trigger_ready(&auxtrace_snapshot_trigger);
2615 	trigger_ready(&switch_output_trigger);
2616 	perf_hooks__invoke_record_start();
2617 
2618 	/*
2619 	 * Must write FINISHED_INIT so it will be seen after all other
2620 	 * synthesized user events, but before any regular events.
2621 	 */
2622 	err = write_finished_init(rec, false);
2623 	if (err < 0)
2624 		goto out_child;
2625 
2626 	for (;;) {
2627 		unsigned long long hits = thread->samples;
2628 
2629 		/*
2630 		 * rec->evlist->bkw_mmap_state is possible to be
2631 		 * BKW_MMAP_EMPTY here: when done == true and
2632 		 * hits != rec->samples in previous round.
2633 		 *
2634 		 * evlist__toggle_bkw_mmap ensure we never
2635 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2636 		 */
2637 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2638 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2639 
2640 		if (record__mmap_read_all(rec, false) < 0) {
2641 			trigger_error(&auxtrace_snapshot_trigger);
2642 			trigger_error(&switch_output_trigger);
2643 			err = -1;
2644 			goto out_child;
2645 		}
2646 
2647 		if (auxtrace_record__snapshot_started) {
2648 			auxtrace_record__snapshot_started = 0;
2649 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2650 				record__read_auxtrace_snapshot(rec, false);
2651 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2652 				pr_err("AUX area tracing snapshot failed\n");
2653 				err = -1;
2654 				goto out_child;
2655 			}
2656 		}
2657 
2658 		if (trigger_is_hit(&switch_output_trigger)) {
2659 			/*
2660 			 * If switch_output_trigger is hit, the data in
2661 			 * overwritable ring buffer should have been collected,
2662 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2663 			 *
2664 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2665 			 * record__mmap_read_all() didn't collect data from
2666 			 * overwritable ring buffer. Read again.
2667 			 */
2668 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2669 				continue;
2670 			trigger_ready(&switch_output_trigger);
2671 
2672 			/*
2673 			 * Reenable events in overwrite ring buffer after
2674 			 * record__mmap_read_all(): we should have collected
2675 			 * data from it.
2676 			 */
2677 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2678 
2679 			if (!quiet)
2680 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2681 					record__waking(rec));
2682 			thread->waking = 0;
2683 			fd = record__switch_output(rec, false);
2684 			if (fd < 0) {
2685 				pr_err("Failed to switch to new file\n");
2686 				trigger_error(&switch_output_trigger);
2687 				err = fd;
2688 				goto out_child;
2689 			}
2690 
2691 			/* re-arm the alarm */
2692 			if (rec->switch_output.time)
2693 				alarm(rec->switch_output.time);
2694 		}
2695 
2696 		if (hits == thread->samples) {
2697 			if (done || draining)
2698 				break;
2699 			err = fdarray__poll(&thread->pollfd, -1);
2700 			/*
2701 			 * Propagate error, only if there's any. Ignore positive
2702 			 * number of returned events and interrupt error.
2703 			 */
2704 			if (err > 0 || (err < 0 && errno == EINTR))
2705 				err = 0;
2706 			thread->waking++;
2707 
2708 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2709 					    record__thread_munmap_filtered, NULL) == 0)
2710 				draining = true;
2711 
2712 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2713 			if (err)
2714 				goto out_child;
2715 		}
2716 
2717 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2718 			switch (cmd) {
2719 			case EVLIST_CTL_CMD_SNAPSHOT:
2720 				hit_auxtrace_snapshot_trigger(rec);
2721 				evlist__ctlfd_ack(rec->evlist);
2722 				break;
2723 			case EVLIST_CTL_CMD_STOP:
2724 				done = 1;
2725 				break;
2726 			case EVLIST_CTL_CMD_ACK:
2727 			case EVLIST_CTL_CMD_UNSUPPORTED:
2728 			case EVLIST_CTL_CMD_ENABLE:
2729 			case EVLIST_CTL_CMD_DISABLE:
2730 			case EVLIST_CTL_CMD_EVLIST:
2731 			case EVLIST_CTL_CMD_PING:
2732 			default:
2733 				break;
2734 			}
2735 		}
2736 
2737 		err = event_enable_timer__process(rec->evlist->eet);
2738 		if (err < 0)
2739 			goto out_child;
2740 		if (err) {
2741 			err = 0;
2742 			done = 1;
2743 		}
2744 
2745 		/*
2746 		 * When perf is starting the traced process, at the end events
2747 		 * die with the process and we wait for that. Thus no need to
2748 		 * disable events in this case.
2749 		 */
2750 		if (done && !disabled && !target__none(&opts->target)) {
2751 			trigger_off(&auxtrace_snapshot_trigger);
2752 			evlist__disable(rec->evlist);
2753 			disabled = true;
2754 		}
2755 	}
2756 
2757 	trigger_off(&auxtrace_snapshot_trigger);
2758 	trigger_off(&switch_output_trigger);
2759 
2760 	if (opts->auxtrace_snapshot_on_exit)
2761 		record__auxtrace_snapshot_exit(rec);
2762 
2763 	if (forks && workload_exec_errno) {
2764 		char msg[STRERR_BUFSIZE], strevsels[2048];
2765 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2766 
2767 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2768 
2769 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2770 			strevsels, argv[0], emsg);
2771 		err = -1;
2772 		goto out_child;
2773 	}
2774 
2775 	if (!quiet)
2776 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2777 			record__waking(rec));
2778 
2779 	write_finished_init(rec, true);
2780 
2781 	if (target__none(&rec->opts.target))
2782 		record__synthesize_workload(rec, true);
2783 
2784 out_child:
2785 	record__stop_threads(rec);
2786 	record__mmap_read_all(rec, true);
2787 out_free_threads:
2788 	record__free_thread_data(rec);
2789 	evlist__finalize_ctlfd(rec->evlist);
2790 	record__aio_mmap_read_sync(rec);
2791 
2792 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2793 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2794 		session->header.env.comp_ratio = ratio + 0.5;
2795 	}
2796 
2797 	if (forks) {
2798 		int exit_status;
2799 
2800 		if (!child_finished)
2801 			kill(rec->evlist->workload.pid, SIGTERM);
2802 
2803 		wait(&exit_status);
2804 
2805 		if (err < 0)
2806 			status = err;
2807 		else if (WIFEXITED(exit_status))
2808 			status = WEXITSTATUS(exit_status);
2809 		else if (WIFSIGNALED(exit_status))
2810 			signr = WTERMSIG(exit_status);
2811 	} else
2812 		status = err;
2813 
2814 	if (rec->off_cpu)
2815 		rec->bytes_written += off_cpu_write(rec->session);
2816 
2817 	record__read_lost_samples(rec);
2818 	record__synthesize(rec, true);
2819 	/* this will be recalculated during process_buildids() */
2820 	rec->samples = 0;
2821 
2822 	if (!err) {
2823 		if (!rec->timestamp_filename) {
2824 			record__finish_output(rec);
2825 		} else {
2826 			fd = record__switch_output(rec, true);
2827 			if (fd < 0) {
2828 				status = fd;
2829 				goto out_delete_session;
2830 			}
2831 		}
2832 	}
2833 
2834 	perf_hooks__invoke_record_end();
2835 
2836 	if (!err && !quiet) {
2837 		char samples[128];
2838 		const char *postfix = rec->timestamp_filename ?
2839 					".<timestamp>" : "";
2840 
2841 		if (rec->samples && !rec->opts.full_auxtrace)
2842 			scnprintf(samples, sizeof(samples),
2843 				  " (%" PRIu64 " samples)", rec->samples);
2844 		else
2845 			samples[0] = '\0';
2846 
2847 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2848 			perf_data__size(data) / 1024.0 / 1024.0,
2849 			data->path, postfix, samples);
2850 		if (ratio) {
2851 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2852 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2853 					ratio);
2854 		}
2855 		fprintf(stderr, " ]\n");
2856 	}
2857 
2858 out_delete_session:
2859 #ifdef HAVE_EVENTFD_SUPPORT
2860 	if (done_fd >= 0) {
2861 		fd = done_fd;
2862 		done_fd = -1;
2863 
2864 		close(fd);
2865 	}
2866 #endif
2867 	zstd_fini(&session->zstd_data);
2868 	if (!opts->no_bpf_event)
2869 		evlist__stop_sb_thread(rec->sb_evlist);
2870 
2871 	perf_session__delete(session);
2872 	return status;
2873 }
2874 
2875 static void callchain_debug(struct callchain_param *callchain)
2876 {
2877 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2878 
2879 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2880 
2881 	if (callchain->record_mode == CALLCHAIN_DWARF)
2882 		pr_debug("callchain: stack dump size %d\n",
2883 			 callchain->dump_size);
2884 }
2885 
2886 int record_opts__parse_callchain(struct record_opts *record,
2887 				 struct callchain_param *callchain,
2888 				 const char *arg, bool unset)
2889 {
2890 	int ret;
2891 	callchain->enabled = !unset;
2892 
2893 	/* --no-call-graph */
2894 	if (unset) {
2895 		callchain->record_mode = CALLCHAIN_NONE;
2896 		pr_debug("callchain: disabled\n");
2897 		return 0;
2898 	}
2899 
2900 	ret = parse_callchain_record_opt(arg, callchain);
2901 	if (!ret) {
2902 		/* Enable data address sampling for DWARF unwind. */
2903 		if (callchain->record_mode == CALLCHAIN_DWARF)
2904 			record->sample_address = true;
2905 		callchain_debug(callchain);
2906 	}
2907 
2908 	return ret;
2909 }
2910 
2911 int record_parse_callchain_opt(const struct option *opt,
2912 			       const char *arg,
2913 			       int unset)
2914 {
2915 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2916 }
2917 
2918 int record_callchain_opt(const struct option *opt,
2919 			 const char *arg __maybe_unused,
2920 			 int unset __maybe_unused)
2921 {
2922 	struct callchain_param *callchain = opt->value;
2923 
2924 	callchain->enabled = true;
2925 
2926 	if (callchain->record_mode == CALLCHAIN_NONE)
2927 		callchain->record_mode = CALLCHAIN_FP;
2928 
2929 	callchain_debug(callchain);
2930 	return 0;
2931 }
2932 
2933 static int perf_record_config(const char *var, const char *value, void *cb)
2934 {
2935 	struct record *rec = cb;
2936 
2937 	if (!strcmp(var, "record.build-id")) {
2938 		if (!strcmp(value, "cache"))
2939 			rec->no_buildid_cache = false;
2940 		else if (!strcmp(value, "no-cache"))
2941 			rec->no_buildid_cache = true;
2942 		else if (!strcmp(value, "skip"))
2943 			rec->no_buildid = true;
2944 		else if (!strcmp(value, "mmap"))
2945 			rec->buildid_mmap = true;
2946 		else
2947 			return -1;
2948 		return 0;
2949 	}
2950 	if (!strcmp(var, "record.call-graph")) {
2951 		var = "call-graph.record-mode";
2952 		return perf_default_config(var, value, cb);
2953 	}
2954 #ifdef HAVE_AIO_SUPPORT
2955 	if (!strcmp(var, "record.aio")) {
2956 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2957 		if (!rec->opts.nr_cblocks)
2958 			rec->opts.nr_cblocks = nr_cblocks_default;
2959 	}
2960 #endif
2961 	if (!strcmp(var, "record.debuginfod")) {
2962 		rec->debuginfod.urls = strdup(value);
2963 		if (!rec->debuginfod.urls)
2964 			return -ENOMEM;
2965 		rec->debuginfod.set = true;
2966 	}
2967 
2968 	return 0;
2969 }
2970 
2971 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2972 {
2973 	struct record *rec = (struct record *)opt->value;
2974 
2975 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2976 }
2977 
2978 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2979 {
2980 	struct record_opts *opts = (struct record_opts *)opt->value;
2981 
2982 	if (unset || !str)
2983 		return 0;
2984 
2985 	if (!strcasecmp(str, "node"))
2986 		opts->affinity = PERF_AFFINITY_NODE;
2987 	else if (!strcasecmp(str, "cpu"))
2988 		opts->affinity = PERF_AFFINITY_CPU;
2989 
2990 	return 0;
2991 }
2992 
2993 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2994 {
2995 	mask->nbits = nr_bits;
2996 	mask->bits = bitmap_zalloc(mask->nbits);
2997 	if (!mask->bits)
2998 		return -ENOMEM;
2999 
3000 	return 0;
3001 }
3002 
3003 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3004 {
3005 	bitmap_free(mask->bits);
3006 	mask->nbits = 0;
3007 }
3008 
3009 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3010 {
3011 	int ret;
3012 
3013 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3014 	if (ret) {
3015 		mask->affinity.bits = NULL;
3016 		return ret;
3017 	}
3018 
3019 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3020 	if (ret) {
3021 		record__mmap_cpu_mask_free(&mask->maps);
3022 		mask->maps.bits = NULL;
3023 	}
3024 
3025 	return ret;
3026 }
3027 
3028 static void record__thread_mask_free(struct thread_mask *mask)
3029 {
3030 	record__mmap_cpu_mask_free(&mask->maps);
3031 	record__mmap_cpu_mask_free(&mask->affinity);
3032 }
3033 
3034 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3035 {
3036 	int s;
3037 	struct record_opts *opts = opt->value;
3038 
3039 	if (unset || !str || !strlen(str)) {
3040 		opts->threads_spec = THREAD_SPEC__CPU;
3041 	} else {
3042 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3043 			if (s == THREAD_SPEC__USER) {
3044 				opts->threads_user_spec = strdup(str);
3045 				if (!opts->threads_user_spec)
3046 					return -ENOMEM;
3047 				opts->threads_spec = THREAD_SPEC__USER;
3048 				break;
3049 			}
3050 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3051 				opts->threads_spec = s;
3052 				break;
3053 			}
3054 		}
3055 	}
3056 
3057 	if (opts->threads_spec == THREAD_SPEC__USER)
3058 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3059 	else
3060 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3061 
3062 	return 0;
3063 }
3064 
3065 static int parse_output_max_size(const struct option *opt,
3066 				 const char *str, int unset)
3067 {
3068 	unsigned long *s = (unsigned long *)opt->value;
3069 	static struct parse_tag tags_size[] = {
3070 		{ .tag  = 'B', .mult = 1       },
3071 		{ .tag  = 'K', .mult = 1 << 10 },
3072 		{ .tag  = 'M', .mult = 1 << 20 },
3073 		{ .tag  = 'G', .mult = 1 << 30 },
3074 		{ .tag  = 0 },
3075 	};
3076 	unsigned long val;
3077 
3078 	if (unset) {
3079 		*s = 0;
3080 		return 0;
3081 	}
3082 
3083 	val = parse_tag_value(str, tags_size);
3084 	if (val != (unsigned long) -1) {
3085 		*s = val;
3086 		return 0;
3087 	}
3088 
3089 	return -1;
3090 }
3091 
3092 static int record__parse_mmap_pages(const struct option *opt,
3093 				    const char *str,
3094 				    int unset __maybe_unused)
3095 {
3096 	struct record_opts *opts = opt->value;
3097 	char *s, *p;
3098 	unsigned int mmap_pages;
3099 	int ret;
3100 
3101 	if (!str)
3102 		return -EINVAL;
3103 
3104 	s = strdup(str);
3105 	if (!s)
3106 		return -ENOMEM;
3107 
3108 	p = strchr(s, ',');
3109 	if (p)
3110 		*p = '\0';
3111 
3112 	if (*s) {
3113 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3114 		if (ret)
3115 			goto out_free;
3116 		opts->mmap_pages = mmap_pages;
3117 	}
3118 
3119 	if (!p) {
3120 		ret = 0;
3121 		goto out_free;
3122 	}
3123 
3124 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3125 	if (ret)
3126 		goto out_free;
3127 
3128 	opts->auxtrace_mmap_pages = mmap_pages;
3129 
3130 out_free:
3131 	free(s);
3132 	return ret;
3133 }
3134 
/*
 * Default implementation that does nothing with @opts. It is declared
 * __weak, so another definition of this symbol takes precedence at link
 * time when one is provided.
 */
void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
{
}
3138 
3139 static int parse_control_option(const struct option *opt,
3140 				const char *str,
3141 				int unset __maybe_unused)
3142 {
3143 	struct record_opts *opts = opt->value;
3144 
3145 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3146 }
3147 
3148 static void switch_output_size_warn(struct record *rec)
3149 {
3150 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3151 	struct switch_output *s = &rec->switch_output;
3152 
3153 	wakeup_size /= 2;
3154 
3155 	if (s->size < wakeup_size) {
3156 		char buf[100];
3157 
3158 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3159 		pr_warning("WARNING: switch-output data size lower than "
3160 			   "wakeup kernel buffer size (%s) "
3161 			   "expect bigger perf.data sizes\n", buf);
3162 	}
3163 }
3164 
3165 static int switch_output_setup(struct record *rec)
3166 {
3167 	struct switch_output *s = &rec->switch_output;
3168 	static struct parse_tag tags_size[] = {
3169 		{ .tag  = 'B', .mult = 1       },
3170 		{ .tag  = 'K', .mult = 1 << 10 },
3171 		{ .tag  = 'M', .mult = 1 << 20 },
3172 		{ .tag  = 'G', .mult = 1 << 30 },
3173 		{ .tag  = 0 },
3174 	};
3175 	static struct parse_tag tags_time[] = {
3176 		{ .tag  = 's', .mult = 1        },
3177 		{ .tag  = 'm', .mult = 60       },
3178 		{ .tag  = 'h', .mult = 60*60    },
3179 		{ .tag  = 'd', .mult = 60*60*24 },
3180 		{ .tag  = 0 },
3181 	};
3182 	unsigned long val;
3183 
3184 	/*
3185 	 * If we're using --switch-output-events, then we imply its
3186 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3187 	 *  thread to its parent.
3188 	 */
3189 	if (rec->switch_output_event_set) {
3190 		if (record__threads_enabled(rec)) {
3191 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3192 			return 0;
3193 		}
3194 		goto do_signal;
3195 	}
3196 
3197 	if (!s->set)
3198 		return 0;
3199 
3200 	if (record__threads_enabled(rec)) {
3201 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3202 		return 0;
3203 	}
3204 
3205 	if (!strcmp(s->str, "signal")) {
3206 do_signal:
3207 		s->signal = true;
3208 		pr_debug("switch-output with SIGUSR2 signal\n");
3209 		goto enabled;
3210 	}
3211 
3212 	val = parse_tag_value(s->str, tags_size);
3213 	if (val != (unsigned long) -1) {
3214 		s->size = val;
3215 		pr_debug("switch-output with %s size threshold\n", s->str);
3216 		goto enabled;
3217 	}
3218 
3219 	val = parse_tag_value(s->str, tags_time);
3220 	if (val != (unsigned long) -1) {
3221 		s->time = val;
3222 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3223 			 s->str, s->time);
3224 		goto enabled;
3225 	}
3226 
3227 	return -1;
3228 
3229 enabled:
3230 	rec->timestamp_filename = true;
3231 	s->enabled              = true;
3232 
3233 	if (s->size && !rec->opts.no_buffering)
3234 		switch_output_size_warn(rec);
3235 
3236 	return 0;
3237 }
3238 
/* Usage synopsis lines for 'perf record'; the array is NULL-terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Externally visible pointer to the usage array above. */
const char * const *record_usage = __record_usage;
3245 
3246 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3247 				  struct perf_sample *sample, struct machine *machine)
3248 {
3249 	/*
3250 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3251 	 * no need to add them twice.
3252 	 */
3253 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3254 		return 0;
3255 	return perf_event__process_mmap(tool, event, sample, machine);
3256 }
3257 
3258 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3259 				   struct perf_sample *sample, struct machine *machine)
3260 {
3261 	/*
3262 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3263 	 * no need to add them twice.
3264 	 */
3265 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3266 		return 0;
3267 
3268 	return perf_event__process_mmap2(tool, event, sample, machine);
3269 }
3270 
3271 static int process_timestamp_boundary(struct perf_tool *tool,
3272 				      union perf_event *event __maybe_unused,
3273 				      struct perf_sample *sample,
3274 				      struct machine *machine __maybe_unused)
3275 {
3276 	struct record *rec = container_of(tool, struct record, tool);
3277 
3278 	set_timestamp_boundary(rec, sample->time);
3279 	return 0;
3280 }
3281 
3282 static int parse_record_synth_option(const struct option *opt,
3283 				     const char *str,
3284 				     int unset __maybe_unused)
3285 {
3286 	struct record_opts *opts = opt->value;
3287 	char *p = strdup(str);
3288 
3289 	if (p == NULL)
3290 		return -1;
3291 
3292 	opts->synth = parse_synth_opt(p);
3293 	free(p);
3294 
3295 	if (opts->synth < 0) {
3296 		pr_err("Invalid synth option: %s\n", str);
3297 		return -1;
3298 	}
3299 	return 0;
3300 }
3301 
/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	/* Initial option values, in effect before config and command line parsing. */
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
	},
	/* Event handlers; mmap/mmap2 go through the build_id__ wrappers in this file. */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= build_id__process_mmap,
		.mmap2		= build_id__process_mmap2,
		.itrace_start	= process_timestamp_boundary,
		.aux		= process_timestamp_boundary,
		.ordered_events	= true,
	},
};
3342 
/* Help text for --call-graph: CALLCHAIN_RECORD_HELP plus the default mode line. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Backing storage for the --dry-run option ("Parse options then exit"). */
static bool dry_run;

/* Callback argument of -e/--event: refers to record.evlist. */
static struct parse_events_option_args parse_events_option_args = {
	.evlistp = &record.evlist,
};

/* Callback argument of --switch-output-event: refers to record.sb_evlist. */
static struct parse_events_option_args switch_output_parse_events_option_args = {
	.evlistp = &record.sb_evlist,
};
3355 
3356 /*
3357  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3358  * with it and switch to use the library functions in perf_evlist that came
3359  * from builtin-record.c, i.e. use record_opts,
3360  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3361  * using pipes, etc.
3362  */
3363 static struct option __record_options[] = {
3364 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3365 		     "event selector. use 'perf list' to list available events",
3366 		     parse_events_option),
3367 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3368 		     "event filter", parse_filter),
3369 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3370 			   NULL, "don't record events from perf itself",
3371 			   exclude_perf),
3372 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3373 		    "record events on existing process id"),
3374 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3375 		    "record events on existing thread id"),
3376 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3377 		    "collect data with this RT SCHED_FIFO priority"),
3378 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3379 		    "collect data without buffering"),
3380 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3381 		    "collect raw sample records from all opened counters"),
3382 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3383 			    "system-wide collection from all CPUs"),
3384 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3385 		    "list of cpus to monitor"),
3386 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3387 	OPT_STRING('o', "output", &record.data.path, "file",
3388 		    "output file name"),
3389 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3390 			&record.opts.no_inherit_set,
3391 			"child tasks do not inherit counters"),
3392 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3393 		    "synthesize non-sample events at the end of output"),
3394 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3395 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3396 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3397 		    "Fail if the specified frequency can't be used"),
3398 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3399 		     "profile at this frequency",
3400 		      record__parse_freq),
3401 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3402 		     "number of mmap data pages and AUX area tracing mmap pages",
3403 		     record__parse_mmap_pages),
3404 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3405 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3406 		     record__mmap_flush_parse),
3407 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3408 			   NULL, "enables call-graph recording" ,
3409 			   &record_callchain_opt),
3410 	OPT_CALLBACK(0, "call-graph", &record.opts,
3411 		     "record_mode[,record_size]", record_callchain_help,
3412 		     &record_parse_callchain_opt),
3413 	OPT_INCR('v', "verbose", &verbose,
3414 		    "be more verbose (show counter open errors, etc)"),
3415 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3416 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3417 		    "per thread counts"),
3418 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3419 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3420 		    "Record the sample physical addresses"),
3421 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3422 		    "Record the sampled data address data page size"),
3423 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3424 		    "Record the sampled code address (ip) page size"),
3425 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3426 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3427 		    "Record the sample identifier"),
3428 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3429 			&record.opts.sample_time_set,
3430 			"Record the sample timestamps"),
3431 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3432 			"Record the sample period"),
3433 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3434 		    "don't sample"),
3435 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3436 			&record.no_buildid_cache_set,
3437 			"do not update the buildid cache"),
3438 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3439 			&record.no_buildid_set,
3440 			"do not collect buildids in perf.data"),
3441 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3442 		     "monitor event in cgroup name only",
3443 		     parse_cgroups),
3444 	OPT_CALLBACK('D', "delay", &record, "ms",
3445 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3446 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3447 		     record__parse_event_enable_time),
3448 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3449 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3450 		   "user to profile"),
3451 
3452 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3453 		     "branch any", "sample any taken branches",
3454 		     parse_branch_stack),
3455 
3456 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3457 		     "branch filter mask", "branch stack filter modes",
3458 		     parse_branch_stack),
3459 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3460 		    "sample by weight (on special events only)"),
3461 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3462 		    "sample transaction flags (special events only)"),
3463 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3464 		    "use per-thread mmaps"),
3465 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3466 		    "sample selected machine registers on interrupt,"
3467 		    " use '-I?' to list register names", parse_intr_regs),
3468 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3469 		    "sample selected machine registers on interrupt,"
3470 		    " use '--user-regs=?' to list register names", parse_user_regs),
3471 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3472 		    "Record running/enabled time of read (:S) events"),
3473 	OPT_CALLBACK('k', "clockid", &record.opts,
3474 	"clockid", "clockid to use for events, see clock_gettime()",
3475 	parse_clockid),
3476 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3477 			  "opts", "AUX area tracing Snapshot Mode", ""),
3478 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3479 			  "opts", "sample AUX area", ""),
3480 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3481 			"per thread proc mmap processing timeout in ms"),
3482 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3483 		    "Record namespaces events"),
3484 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3485 		    "Record cgroup events"),
3486 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3487 			&record.opts.record_switch_events_set,
3488 			"Record context switch events"),
3489 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3490 			 "Configure all used events to run in kernel space.",
3491 			 PARSE_OPT_EXCLUSIVE),
3492 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3493 			 "Configure all used events to run in user space.",
3494 			 PARSE_OPT_EXCLUSIVE),
3495 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3496 		    "collect kernel callchains"),
3497 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3498 		    "collect user callchains"),
3499 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3500 		   "file", "vmlinux pathname"),
3501 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3502 		    "Record build-id of all DSOs regardless of hits"),
3503 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3504 		    "Record build-id in map events"),
3505 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3506 		    "append timestamp to output filename"),
3507 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3508 		    "Record timestamp boundary (time of first/last samples)"),
3509 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3510 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3511 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3512 			  "signal"),
3513 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3514 			 &record.switch_output_event_set, "switch output event",
3515 			 "switch output event selector. use 'perf list' to list available events",
3516 			 parse_events_option_new_evlist),
3517 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3518 		   "Limit number of switch output generated files"),
3519 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3520 		    "Parse options then exit"),
3521 #ifdef HAVE_AIO_SUPPORT
3522 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3523 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3524 		     record__aio_parse),
3525 #endif
3526 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3527 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3528 		     record__parse_affinity),
3529 #ifdef HAVE_ZSTD_SUPPORT
3530 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3531 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3532 			    record__parse_comp_level),
3533 #endif
3534 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3535 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3536 	OPT_UINTEGER(0, "num-thread-synthesize",
3537 		     &record.opts.nr_threads_synthesize,
3538 		     "number of threads to run for event synthesis"),
3539 #ifdef HAVE_LIBPFM
3540 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3541 		"libpfm4 event selector. use 'perf list' to list available events",
3542 		parse_libpfm_events_option),
3543 #endif
3544 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3545 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3546 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3547 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3548 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3549 		      parse_control_option),
3550 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3551 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3552 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3553 			  &record.debuginfod.set, "debuginfod urls",
3554 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3555 			  "system"),
3556 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3557 			    "write collected trace data into several data files using parallel threads",
3558 			    record__parse_threads),
3559 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3560 	OPT_END()
3561 };
3562 
3563 struct option *record_options = __record_options;
3564 
3565 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3566 {
3567 	struct perf_cpu cpu;
3568 	int idx;
3569 
3570 	if (cpu_map__is_dummy(cpus))
3571 		return 0;
3572 
3573 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3574 		/* Return ENODEV is input cpu is greater than max cpu */
3575 		if ((unsigned long)cpu.cpu > mask->nbits)
3576 			return -ENODEV;
3577 		__set_bit(cpu.cpu, mask->bits);
3578 	}
3579 
3580 	return 0;
3581 }
3582 
3583 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3584 {
3585 	struct perf_cpu_map *cpus;
3586 
3587 	cpus = perf_cpu_map__new(mask_spec);
3588 	if (!cpus)
3589 		return -ENOMEM;
3590 
3591 	bitmap_zero(mask->bits, mask->nbits);
3592 	if (record__mmap_cpu_mask_init(mask, cpus))
3593 		return -ENODEV;
3594 
3595 	perf_cpu_map__put(cpus);
3596 
3597 	return 0;
3598 }
3599 
3600 static void record__free_thread_masks(struct record *rec, int nr_threads)
3601 {
3602 	int t;
3603 
3604 	if (rec->thread_masks)
3605 		for (t = 0; t < nr_threads; t++)
3606 			record__thread_mask_free(&rec->thread_masks[t]);
3607 
3608 	zfree(&rec->thread_masks);
3609 }
3610 
3611 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3612 {
3613 	int t, ret;
3614 
3615 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3616 	if (!rec->thread_masks) {
3617 		pr_err("Failed to allocate thread masks\n");
3618 		return -ENOMEM;
3619 	}
3620 
3621 	for (t = 0; t < nr_threads; t++) {
3622 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3623 		if (ret) {
3624 			pr_err("Failed to allocate thread masks[%d]\n", t);
3625 			goto out_free;
3626 		}
3627 	}
3628 
3629 	return 0;
3630 
3631 out_free:
3632 	record__free_thread_masks(rec, nr_threads);
3633 
3634 	return ret;
3635 }
3636 
3637 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3638 {
3639 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3640 
3641 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3642 	if (ret)
3643 		return ret;
3644 
3645 	rec->nr_threads = nr_cpus;
3646 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3647 
3648 	for (t = 0; t < rec->nr_threads; t++) {
3649 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3650 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3651 		if (verbose > 0) {
3652 			pr_debug("thread_masks[%d]: ", t);
3653 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3654 			pr_debug("thread_masks[%d]: ", t);
3655 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3656 		}
3657 	}
3658 
3659 	return 0;
3660 }
3661 
3662 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3663 					  const char **maps_spec, const char **affinity_spec,
3664 					  u32 nr_spec)
3665 {
3666 	u32 s;
3667 	int ret = 0, t = 0;
3668 	struct mmap_cpu_mask cpus_mask;
3669 	struct thread_mask thread_mask, full_mask, *thread_masks;
3670 
3671 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3672 	if (ret) {
3673 		pr_err("Failed to allocate CPUs mask\n");
3674 		return ret;
3675 	}
3676 
3677 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3678 	if (ret) {
3679 		pr_err("Failed to init cpu mask\n");
3680 		goto out_free_cpu_mask;
3681 	}
3682 
3683 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3684 	if (ret) {
3685 		pr_err("Failed to allocate full mask\n");
3686 		goto out_free_cpu_mask;
3687 	}
3688 
3689 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3690 	if (ret) {
3691 		pr_err("Failed to allocate thread mask\n");
3692 		goto out_free_full_and_cpu_masks;
3693 	}
3694 
3695 	for (s = 0; s < nr_spec; s++) {
3696 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3697 		if (ret) {
3698 			pr_err("Failed to initialize maps thread mask\n");
3699 			goto out_free;
3700 		}
3701 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3702 		if (ret) {
3703 			pr_err("Failed to initialize affinity thread mask\n");
3704 			goto out_free;
3705 		}
3706 
3707 		/* ignore invalid CPUs but do not allow empty masks */
3708 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3709 				cpus_mask.bits, thread_mask.maps.nbits)) {
3710 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3711 			ret = -EINVAL;
3712 			goto out_free;
3713 		}
3714 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3715 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3716 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3717 			ret = -EINVAL;
3718 			goto out_free;
3719 		}
3720 
3721 		/* do not allow intersection with other masks (full_mask) */
3722 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3723 				      thread_mask.maps.nbits)) {
3724 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3725 			ret = -EINVAL;
3726 			goto out_free;
3727 		}
3728 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3729 				      thread_mask.affinity.nbits)) {
3730 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3731 			ret = -EINVAL;
3732 			goto out_free;
3733 		}
3734 
3735 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3736 			  thread_mask.maps.bits, full_mask.maps.nbits);
3737 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3738 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3739 
3740 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3741 		if (!thread_masks) {
3742 			pr_err("Failed to reallocate thread masks\n");
3743 			ret = -ENOMEM;
3744 			goto out_free;
3745 		}
3746 		rec->thread_masks = thread_masks;
3747 		rec->thread_masks[t] = thread_mask;
3748 		if (verbose > 0) {
3749 			pr_debug("thread_masks[%d]: ", t);
3750 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3751 			pr_debug("thread_masks[%d]: ", t);
3752 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3753 		}
3754 		t++;
3755 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3756 		if (ret) {
3757 			pr_err("Failed to allocate thread mask\n");
3758 			goto out_free_full_and_cpu_masks;
3759 		}
3760 	}
3761 	rec->nr_threads = t;
3762 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3763 	if (!rec->nr_threads)
3764 		ret = -EINVAL;
3765 
3766 out_free:
3767 	record__thread_mask_free(&thread_mask);
3768 out_free_full_and_cpu_masks:
3769 	record__thread_mask_free(&full_mask);
3770 out_free_cpu_mask:
3771 	record__mmap_cpu_mask_free(&cpus_mask);
3772 
3773 	return ret;
3774 }
3775 
3776 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3777 {
3778 	int ret;
3779 	struct cpu_topology *topo;
3780 
3781 	topo = cpu_topology__new();
3782 	if (!topo) {
3783 		pr_err("Failed to allocate CPU topology\n");
3784 		return -ENOMEM;
3785 	}
3786 
3787 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3788 					     topo->core_cpus_list, topo->core_cpus_lists);
3789 	cpu_topology__delete(topo);
3790 
3791 	return ret;
3792 }
3793 
3794 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3795 {
3796 	int ret;
3797 	struct cpu_topology *topo;
3798 
3799 	topo = cpu_topology__new();
3800 	if (!topo) {
3801 		pr_err("Failed to allocate CPU topology\n");
3802 		return -ENOMEM;
3803 	}
3804 
3805 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3806 					     topo->package_cpus_list, topo->package_cpus_lists);
3807 	cpu_topology__delete(topo);
3808 
3809 	return ret;
3810 }
3811 
3812 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3813 {
3814 	u32 s;
3815 	int ret;
3816 	const char **spec;
3817 	struct numa_topology *topo;
3818 
3819 	topo = numa_topology__new();
3820 	if (!topo) {
3821 		pr_err("Failed to allocate NUMA topology\n");
3822 		return -ENOMEM;
3823 	}
3824 
3825 	spec = zalloc(topo->nr * sizeof(char *));
3826 	if (!spec) {
3827 		pr_err("Failed to allocate NUMA spec\n");
3828 		ret = -ENOMEM;
3829 		goto out_delete_topo;
3830 	}
3831 	for (s = 0; s < topo->nr; s++)
3832 		spec[s] = topo->nodes[s].cpus;
3833 
3834 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3835 
3836 	zfree(&spec);
3837 
3838 out_delete_topo:
3839 	numa_topology__delete(topo);
3840 
3841 	return ret;
3842 }
3843 
3844 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3845 {
3846 	int t, ret;
3847 	u32 s, nr_spec = 0;
3848 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3849 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3850 
3851 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3852 		spec = strtok_r(user_spec, ":", &spec_ptr);
3853 		if (spec == NULL)
3854 			break;
3855 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3856 		mask = strtok_r(spec, "/", &mask_ptr);
3857 		if (mask == NULL)
3858 			break;
3859 		pr_debug2("  maps mask: %s\n", mask);
3860 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3861 		if (!tmp_spec) {
3862 			pr_err("Failed to reallocate maps spec\n");
3863 			ret = -ENOMEM;
3864 			goto out_free;
3865 		}
3866 		maps_spec = tmp_spec;
3867 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3868 		if (!maps_spec[nr_spec]) {
3869 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3870 			ret = -ENOMEM;
3871 			goto out_free;
3872 		}
3873 		mask = strtok_r(NULL, "/", &mask_ptr);
3874 		if (mask == NULL) {
3875 			pr_err("Invalid thread maps or affinity specs\n");
3876 			ret = -EINVAL;
3877 			goto out_free;
3878 		}
3879 		pr_debug2("  affinity mask: %s\n", mask);
3880 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3881 		if (!tmp_spec) {
3882 			pr_err("Failed to reallocate affinity spec\n");
3883 			ret = -ENOMEM;
3884 			goto out_free;
3885 		}
3886 		affinity_spec = tmp_spec;
3887 		affinity_spec[nr_spec] = strdup(mask);
3888 		if (!affinity_spec[nr_spec]) {
3889 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3890 			ret = -ENOMEM;
3891 			goto out_free;
3892 		}
3893 		dup_mask = NULL;
3894 		nr_spec++;
3895 	}
3896 
3897 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3898 					     (const char **)affinity_spec, nr_spec);
3899 
3900 out_free:
3901 	free(dup_mask);
3902 	for (s = 0; s < nr_spec; s++) {
3903 		if (maps_spec)
3904 			free(maps_spec[s]);
3905 		if (affinity_spec)
3906 			free(affinity_spec[s]);
3907 	}
3908 	free(affinity_spec);
3909 	free(maps_spec);
3910 
3911 	return ret;
3912 }
3913 
3914 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3915 {
3916 	int ret;
3917 
3918 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3919 	if (ret)
3920 		return ret;
3921 
3922 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3923 		return -ENODEV;
3924 
3925 	rec->nr_threads = 1;
3926 
3927 	return 0;
3928 }
3929 
3930 static int record__init_thread_masks(struct record *rec)
3931 {
3932 	int ret = 0;
3933 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3934 
3935 	if (!record__threads_enabled(rec))
3936 		return record__init_thread_default_masks(rec, cpus);
3937 
3938 	if (evlist__per_thread(rec->evlist)) {
3939 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3940 		return -EINVAL;
3941 	}
3942 
3943 	switch (rec->opts.threads_spec) {
3944 	case THREAD_SPEC__CPU:
3945 		ret = record__init_thread_cpu_masks(rec, cpus);
3946 		break;
3947 	case THREAD_SPEC__CORE:
3948 		ret = record__init_thread_core_masks(rec, cpus);
3949 		break;
3950 	case THREAD_SPEC__PACKAGE:
3951 		ret = record__init_thread_package_masks(rec, cpus);
3952 		break;
3953 	case THREAD_SPEC__NUMA:
3954 		ret = record__init_thread_numa_masks(rec, cpus);
3955 		break;
3956 	case THREAD_SPEC__USER:
3957 		ret = record__init_thread_user_masks(rec, cpus);
3958 		break;
3959 	default:
3960 		break;
3961 	}
3962 
3963 	return ret;
3964 }
3965 
3966 int cmd_record(int argc, const char **argv)
3967 {
3968 	int err;
3969 	struct record *rec = &record;
3970 	char errbuf[BUFSIZ];
3971 
3972 	setlocale(LC_ALL, "");
3973 
3974 #ifndef HAVE_BPF_SKEL
3975 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3976 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3977 # undef set_nobuild
3978 #endif
3979 
3980 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3981 	symbol_conf.lazy_load_kernel_maps = true;
3982 	rec->opts.affinity = PERF_AFFINITY_SYS;
3983 
3984 	rec->evlist = evlist__new();
3985 	if (rec->evlist == NULL)
3986 		return -ENOMEM;
3987 
3988 	err = perf_config(perf_record_config, rec);
3989 	if (err)
3990 		return err;
3991 
3992 	argc = parse_options(argc, argv, record_options, record_usage,
3993 			    PARSE_OPT_STOP_AT_NON_OPTION);
3994 	if (quiet)
3995 		perf_quiet_option();
3996 
3997 	err = symbol__validate_sym_arguments();
3998 	if (err)
3999 		return err;
4000 
4001 	perf_debuginfod_setup(&record.debuginfod);
4002 
4003 	/* Make system wide (-a) the default target. */
4004 	if (!argc && target__none(&rec->opts.target))
4005 		rec->opts.target.system_wide = true;
4006 
4007 	if (nr_cgroups && !rec->opts.target.system_wide) {
4008 		usage_with_options_msg(record_usage, record_options,
4009 			"cgroup monitoring only available in system-wide mode");
4010 
4011 	}
4012 
4013 	if (rec->buildid_mmap) {
4014 		if (!perf_can_record_build_id()) {
4015 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4016 			err = -EINVAL;
4017 			goto out_opts;
4018 		}
4019 		pr_debug("Enabling build id in mmap2 events.\n");
4020 		/* Enable mmap build id synthesizing. */
4021 		symbol_conf.buildid_mmap2 = true;
4022 		/* Enable perf_event_attr::build_id bit. */
4023 		rec->opts.build_id = true;
4024 		/* Disable build id cache. */
4025 		rec->no_buildid = true;
4026 	}
4027 
4028 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4029 		pr_err("Kernel has no cgroup sampling support.\n");
4030 		err = -EINVAL;
4031 		goto out_opts;
4032 	}
4033 
4034 	if (rec->opts.kcore)
4035 		rec->opts.text_poke = true;
4036 
4037 	if (rec->opts.kcore || record__threads_enabled(rec))
4038 		rec->data.is_dir = true;
4039 
4040 	if (record__threads_enabled(rec)) {
4041 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4042 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4043 			goto out_opts;
4044 		}
4045 		if (record__aio_enabled(rec)) {
4046 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4047 			goto out_opts;
4048 		}
4049 	}
4050 
4051 	if (rec->opts.comp_level != 0) {
4052 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4053 		rec->no_buildid = true;
4054 	}
4055 
4056 	if (rec->opts.record_switch_events &&
4057 	    !perf_can_record_switch_events()) {
4058 		ui__error("kernel does not support recording context switch events\n");
4059 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4060 		err = -EINVAL;
4061 		goto out_opts;
4062 	}
4063 
4064 	if (switch_output_setup(rec)) {
4065 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4066 		err = -EINVAL;
4067 		goto out_opts;
4068 	}
4069 
4070 	if (rec->switch_output.time) {
4071 		signal(SIGALRM, alarm_sig_handler);
4072 		alarm(rec->switch_output.time);
4073 	}
4074 
4075 	if (rec->switch_output.num_files) {
4076 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4077 						      sizeof(char *));
4078 		if (!rec->switch_output.filenames) {
4079 			err = -EINVAL;
4080 			goto out_opts;
4081 		}
4082 	}
4083 
4084 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4085 		rec->timestamp_filename = false;
4086 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4087 	}
4088 
4089 	/*
4090 	 * Allow aliases to facilitate the lookup of symbols for address
4091 	 * filters. Refer to auxtrace_parse_filters().
4092 	 */
4093 	symbol_conf.allow_aliases = true;
4094 
4095 	symbol__init(NULL);
4096 
4097 	err = record__auxtrace_init(rec);
4098 	if (err)
4099 		goto out;
4100 
4101 	if (dry_run)
4102 		goto out;
4103 
4104 	err = -ENOMEM;
4105 
4106 	if (rec->no_buildid_cache || rec->no_buildid) {
4107 		disable_buildid_cache();
4108 	} else if (rec->switch_output.enabled) {
4109 		/*
4110 		 * In 'perf record --switch-output', disable buildid
4111 		 * generation by default to reduce data file switching
4112 		 * overhead. Still generate buildid if they are required
4113 		 * explicitly using
4114 		 *
4115 		 *  perf record --switch-output --no-no-buildid \
4116 		 *              --no-no-buildid-cache
4117 		 *
4118 		 * Following code equals to:
4119 		 *
4120 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4121 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4122 		 *         disable_buildid_cache();
4123 		 */
4124 		bool disable = true;
4125 
4126 		if (rec->no_buildid_set && !rec->no_buildid)
4127 			disable = false;
4128 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4129 			disable = false;
4130 		if (disable) {
4131 			rec->no_buildid = true;
4132 			rec->no_buildid_cache = true;
4133 			disable_buildid_cache();
4134 		}
4135 	}
4136 
4137 	if (record.opts.overwrite)
4138 		record.opts.tail_synthesize = true;
4139 
4140 	if (rec->evlist->core.nr_entries == 0) {
4141 		bool can_profile_kernel = perf_event_paranoid_check(1);
4142 
4143 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4144 		if (err)
4145 			goto out;
4146 	}
4147 
4148 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4149 		rec->opts.no_inherit = true;
4150 
4151 	err = target__validate(&rec->opts.target);
4152 	if (err) {
4153 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4154 		ui__warning("%s\n", errbuf);
4155 	}
4156 
4157 	err = target__parse_uid(&rec->opts.target);
4158 	if (err) {
4159 		int saved_errno = errno;
4160 
4161 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4162 		ui__error("%s", errbuf);
4163 
4164 		err = -saved_errno;
4165 		goto out;
4166 	}
4167 
4168 	/* Enable ignoring missing threads when -u/-p option is defined. */
4169 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4170 
4171 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4172 
4173 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4174 		arch__add_leaf_frame_record_opts(&rec->opts);
4175 
4176 	err = -ENOMEM;
4177 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4178 		if (rec->opts.target.pid != NULL) {
4179 			pr_err("Couldn't create thread/CPU maps: %s\n",
4180 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4181 			goto out;
4182 		}
4183 		else
4184 			usage_with_options(record_usage, record_options);
4185 	}
4186 
4187 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4188 	if (err)
4189 		goto out;
4190 
4191 	/*
4192 	 * We take all buildids when the file contains
4193 	 * AUX area tracing data because we do not decode the
4194 	 * trace because it would take too long.
4195 	 */
4196 	if (rec->opts.full_auxtrace)
4197 		rec->buildid_all = true;
4198 
4199 	if (rec->opts.text_poke) {
4200 		err = record__config_text_poke(rec->evlist);
4201 		if (err) {
4202 			pr_err("record__config_text_poke failed, error %d\n", err);
4203 			goto out;
4204 		}
4205 	}
4206 
4207 	if (rec->off_cpu) {
4208 		err = record__config_off_cpu(rec);
4209 		if (err) {
4210 			pr_err("record__config_off_cpu failed, error %d\n", err);
4211 			goto out;
4212 		}
4213 	}
4214 
4215 	if (record_opts__config(&rec->opts)) {
4216 		err = -EINVAL;
4217 		goto out;
4218 	}
4219 
4220 	err = record__config_tracking_events(rec);
4221 	if (err) {
4222 		pr_err("record__config_tracking_events failed, error %d\n", err);
4223 		goto out;
4224 	}
4225 
4226 	err = record__init_thread_masks(rec);
4227 	if (err) {
4228 		pr_err("Failed to initialize parallel data streaming masks\n");
4229 		goto out;
4230 	}
4231 
4232 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4233 		rec->opts.nr_cblocks = nr_cblocks_max;
4234 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4235 
4236 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4237 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4238 
4239 	if (rec->opts.comp_level > comp_level_max)
4240 		rec->opts.comp_level = comp_level_max;
4241 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4242 
4243 	err = __cmd_record(&record, argc, argv);
4244 out:
4245 	evlist__delete(rec->evlist);
4246 	symbol__exit();
4247 	auxtrace_record__free(rec->itr);
4248 out_opts:
4249 	record__free_thread_masks(rec, rec->nr_threads);
4250 	rec->nr_threads = 0;
4251 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4252 	return err;
4253 }
4254 
4255 static void snapshot_sig_handler(int sig __maybe_unused)
4256 {
4257 	struct record *rec = &record;
4258 
4259 	hit_auxtrace_snapshot_trigger(rec);
4260 
4261 	if (switch_output_signal(rec))
4262 		trigger_hit(&switch_output_trigger);
4263 }
4264 
4265 static void alarm_sig_handler(int sig __maybe_unused)
4266 {
4267 	struct record *rec = &record;
4268 
4269 	if (switch_output_time(rec))
4270 		trigger_hit(&switch_output_trigger);
4271 }
4272