xref: /linux/tools/perf/builtin-record.c (revision 96f30c8f0aa9923aa39b30bcaefeacf88b490231)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
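/*
 * State for rotating the perf.data output. A switch can be triggered by
 * SIGUSR2 (signal), by the amount of data written (size) or by elapsed
 * time (time), typically requested with --switch-output[=<mode>].
 * filenames/num_files/cur_file keep a ring of the most recent output
 * files; see record__switch_output() below.
 */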
84 struct switch_output {
85 	bool		 enabled;
86 	bool		 signal;
87 	unsigned long	 size;
88 	unsigned long	 time;
89 	const char	*str;
90 	bool		 set;
91 	char		 **filenames;
92 	int		 num_files;
93 	int		 cur_file;
94 };
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124 	THREAD_MSG__UNDEFINED = 0,
125 	THREAD_MSG__READY,
126 	THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130 	"UNDEFINED", "READY"
131 };
132 
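/*
 * Specification of how recording is parallelized when --threads is used:
 * one writer thread per CPU, per core, per package, per NUMA node, or a
 * user-provided mapping of CPUs to threads (see thread_spec_tags below).
 */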
133 enum thread_spec {
134 	THREAD_SPEC__UNDEFINED = 0,
135 	THREAD_SPEC__CPU,
136 	THREAD_SPEC__CORE,
137 	THREAD_SPEC__PACKAGE,
138 	THREAD_SPEC__NUMA,
139 	THREAD_SPEC__USER,
140 	THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 	"undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
147 struct pollfd_index_map {
148 	int evlist_pollfd_index;
149 	int thread_pollfd_index;
150 };
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	const char		*filter_action;
175 	struct switch_output	switch_output;
176 	unsigned long long	samples;
177 	unsigned long		output_max_size;	/* = 0: unlimited */
178 	struct perf_debuginfod	debuginfod;
179 	int			nr_threads;
180 	struct thread_mask	*thread_masks;
181 	struct record_thread	*thread_data;
182 	struct pollfd_index_map	*index_map;
183 	size_t			index_map_sz;
184 	size_t			index_map_cnt;
185 };
186 
187 static volatile int done;
188 
189 static volatile int auxtrace_record__snapshot_started;
190 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
191 static DEFINE_TRIGGER(switch_output_trigger);
192 
193 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
194 	"SYS", "NODE", "CPU"
195 };
196 
197 #ifndef HAVE_GETTID
198 static inline pid_t gettid(void)
199 {
200 	return (pid_t)syscall(__NR_gettid);
201 }
202 #endif
203 
204 static int record__threads_enabled(struct record *rec)
205 {
206 	return rec->opts.threads_spec;
207 }
208 
209 static bool switch_output_signal(struct record *rec)
210 {
211 	return rec->switch_output.signal &&
212 	       trigger_is_ready(&switch_output_trigger);
213 }
214 
215 static bool switch_output_size(struct record *rec)
216 {
217 	return rec->switch_output.size &&
218 	       trigger_is_ready(&switch_output_trigger) &&
219 	       (rec->bytes_written >= rec->switch_output.size);
220 }
221 
222 static bool switch_output_time(struct record *rec)
223 {
224 	return rec->switch_output.time &&
225 	       trigger_is_ready(&switch_output_trigger);
226 }
227 
228 static u64 record__bytes_written(struct record *rec)
229 {
230 	return rec->bytes_written + rec->thread_bytes_written;
231 }
232 
233 static bool record__output_max_size_exceeded(struct record *rec)
234 {
235 	return rec->output_max_size &&
236 	       (record__bytes_written(rec) >= rec->output_max_size);
237 }
238 
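/*
 * Write a chunk of trace data. In parallel (--threads) mode each mmap has
 * its own file in the data directory and the bytes are accounted per
 * thread; otherwise everything goes to the single perf.data file. The
 * byte counters feed the output_max_size limit and the size-based
 * switch_output_size() check below.
 */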
239 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
240 			 void *bf, size_t size)
241 {
242 	struct perf_data_file *file = &rec->session->data->file;
243 
244 	if (map && map->file)
245 		file = map->file;
246 
247 	if (perf_data_file__write(file, bf, size) < 0) {
248 		pr_err("failed to write perf data, error: %m\n");
249 		return -1;
250 	}
251 
252 	if (map && map->file) {
253 		thread->bytes_written += size;
254 		rec->thread_bytes_written += size;
255 	} else {
256 		rec->bytes_written += size;
257 	}
258 
259 	if (record__output_max_size_exceeded(rec) && !done) {
260 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
261 				" stopping session ]\n",
262 				record__bytes_written(rec) >> 10);
263 		done = 1;
264 	}
265 
266 	if (switch_output_size(rec))
267 		trigger_hit(&switch_output_trigger);
268 
269 	return 0;
270 }
271 
272 static int record__aio_enabled(struct record *rec);
273 static int record__comp_enabled(struct record *rec);
274 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
275 			    void *dst, size_t dst_size, void *src, size_t src_size);
276 
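/*
 * Asynchronous trace writing (--aio): data from each kernel ring buffer is
 * copied (and optionally compressed) into a per-mmap aio.data[] buffer and
 * queued with POSIX aio_write(), so the ring buffer can be released back to
 * the kernel without waiting for the disk write to finish.
 */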
277 #ifdef HAVE_AIO_SUPPORT
278 static int record__aio_write(struct aiocb *cblock, int trace_fd,
279 		void *buf, size_t size, off_t off)
280 {
281 	int rc;
282 
283 	cblock->aio_fildes = trace_fd;
284 	cblock->aio_buf    = buf;
285 	cblock->aio_nbytes = size;
286 	cblock->aio_offset = off;
287 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
288 
289 	do {
290 		rc = aio_write(cblock);
291 		if (rc == 0) {
292 			break;
293 		} else if (errno != EAGAIN) {
294 			cblock->aio_fildes = -1;
295 			pr_err("failed to queue perf data, error: %m\n");
296 			break;
297 		}
298 	} while (1);
299 
300 	return rc;
301 }
302 
303 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
304 {
305 	void *rem_buf;
306 	off_t rem_off;
307 	size_t rem_size;
308 	int rc, aio_errno;
309 	ssize_t aio_ret, written;
310 
311 	aio_errno = aio_error(cblock);
312 	if (aio_errno == EINPROGRESS)
313 		return 0;
314 
315 	written = aio_ret = aio_return(cblock);
316 	if (aio_ret < 0) {
317 		if (aio_errno != EINTR)
318 			pr_err("failed to write perf data, error: %m\n");
319 		written = 0;
320 	}
321 
322 	rem_size = cblock->aio_nbytes - written;
323 
324 	if (rem_size == 0) {
325 		cblock->aio_fildes = -1;
326 		/*
327 		 * md->refcount is incremented in record__aio_pushfn() for
328 		 * every aio write request started in record__aio_push(), so
329 		 * decrement it because the request is now complete.
330 		 */
331 		perf_mmap__put(&md->core);
332 		rc = 1;
333 	} else {
334 		/*
335 		 * The aio write request may require a restart with the
336 		 * remainder if the kernel didn't write the whole
337 		 * chunk at once.
338 		 */
339 		rem_off = cblock->aio_offset + written;
340 		rem_buf = (void *)(cblock->aio_buf + written);
341 		record__aio_write(cblock, cblock->aio_fildes,
342 				rem_buf, rem_size, rem_off);
343 		rc = 0;
344 	}
345 
346 	return rc;
347 }
348 
349 static int record__aio_sync(struct mmap *md, bool sync_all)
350 {
351 	struct aiocb **aiocb = md->aio.aiocb;
352 	struct aiocb *cblocks = md->aio.cblocks;
353 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
354 	int i, do_suspend;
355 
356 	do {
357 		do_suspend = 0;
358 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
359 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
360 				if (sync_all)
361 					aiocb[i] = NULL;
362 				else
363 					return i;
364 			} else {
365 				/*
366 				 * The started aio write is not complete yet,
367 				 * so it has to be waited for before the
368 				 * next allocation.
369 				 */
370 				aiocb[i] = &cblocks[i];
371 				do_suspend = 1;
372 			}
373 		}
374 		if (!do_suspend)
375 			return -1;
376 
377 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
378 			if (!(errno == EAGAIN || errno == EINTR))
379 				pr_err("failed to sync perf data, error: %m\n");
380 		}
381 	} while (1);
382 }
383 
384 struct record_aio {
385 	struct record	*rec;
386 	void		*data;
387 	size_t		size;
388 };
389 
390 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
391 {
392 	struct record_aio *aio = to;
393 
394 	/*
395 	 * map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
396 	 * to release space in the kernel buffer as fast as possible, calling
397 	 * perf_mmap__consume() from the perf_mmap__push() function.
398 	 *
399 	 * That lets the kernel proceed with storing more profiling data into
400 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
401 	 *
402 	 * Copying can be done in two steps in case the chunk of profiling data
403 	 * crosses the upper bound of the kernel buffer. In this case we first move
404 	 * the part of the data from map->start to the upper bound and then the remainder
405 	 * from the beginning of the kernel buffer to the end of the data chunk.
406 	 */
407 
408 	if (record__comp_enabled(aio->rec)) {
409 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
410 						   mmap__mmap_len(map) - aio->size,
411 						   buf, size);
412 		if (compressed < 0)
413 			return (int)compressed;
414 
415 		size = compressed;
416 	} else {
417 		memcpy(aio->data + aio->size, buf, size);
418 	}
419 
420 	if (!aio->size) {
421 		/*
422 		 * Increment map->refcount to guard the map->aio.data[] buffer
423 		 * from premature deallocation, because the map object can be
424 		 * released before the aio write request started on the
425 		 * map->aio.data[] buffer is complete.
426 		 *
427 		 * perf_mmap__put() is done at record__aio_complete()
428 		 * after the started aio request completes, or at record__aio_push()
429 		 * if the request failed to start.
430 		 */
431 		perf_mmap__get(&map->core);
432 	}
433 
434 	aio->size += size;
435 
436 	return size;
437 }
438 
439 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
440 {
441 	int ret, idx;
442 	int trace_fd = rec->session->data->file.fd;
443 	struct record_aio aio = { .rec = rec, .size = 0 };
444 
445 	/*
446 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
447 	 * becomes available after the previous aio write operation.
448 	 */
449 
450 	idx = record__aio_sync(map, false);
451 	aio.data = map->aio.data[idx];
452 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
453 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
454 		return ret;
455 
456 	rec->samples++;
457 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
458 	if (!ret) {
459 		*off += aio.size;
460 		rec->bytes_written += aio.size;
461 		if (switch_output_size(rec))
462 			trigger_hit(&switch_output_trigger);
463 	} else {
464 		/*
465 		 * Decrement map->refcount incremented in record__aio_pushfn()
466 		 * back if record__aio_write() operation failed to start, otherwise
467 		 * map->refcount is decremented in record__aio_complete() after
468 		 * aio write operation finishes successfully.
469 		 */
470 		perf_mmap__put(&map->core);
471 	}
472 
473 	return ret;
474 }
475 
476 static off_t record__aio_get_pos(int trace_fd)
477 {
478 	return lseek(trace_fd, 0, SEEK_CUR);
479 }
480 
481 static void record__aio_set_pos(int trace_fd, off_t pos)
482 {
483 	lseek(trace_fd, pos, SEEK_SET);
484 }
485 
486 static void record__aio_mmap_read_sync(struct record *rec)
487 {
488 	int i;
489 	struct evlist *evlist = rec->evlist;
490 	struct mmap *maps = evlist->mmap;
491 
492 	if (!record__aio_enabled(rec))
493 		return;
494 
495 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
496 		struct mmap *map = &maps[i];
497 
498 		if (map->core.base)
499 			record__aio_sync(map, true);
500 	}
501 }
502 
503 static int nr_cblocks_default = 1;
504 static int nr_cblocks_max = 4;
505 
506 static int record__aio_parse(const struct option *opt,
507 			     const char *str,
508 			     int unset)
509 {
510 	struct record_opts *opts = (struct record_opts *)opt->value;
511 
512 	if (unset) {
513 		opts->nr_cblocks = 0;
514 	} else {
515 		if (str)
516 			opts->nr_cblocks = strtol(str, NULL, 0);
517 		if (!opts->nr_cblocks)
518 			opts->nr_cblocks = nr_cblocks_default;
519 	}
520 
521 	return 0;
522 }
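/*
 * A usage sketch, assuming the --aio[=n] option of perf record:
 *
 *   perf record --aio=4 -- ./workload
 *
 * requests four aio control blocks per mmap; plain --aio uses
 * nr_cblocks_default. nr_cblocks_max is the intended upper bound
 * (the clamp is applied when the options are finalized).
 */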
523 #else /* HAVE_AIO_SUPPORT */
524 static int nr_cblocks_max = 0;
525 
526 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
527 			    off_t *off __maybe_unused)
528 {
529 	return -1;
530 }
531 
532 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
533 {
534 	return -1;
535 }
536 
537 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
538 {
539 }
540 
541 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
542 {
543 }
544 #endif
545 
546 static int record__aio_enabled(struct record *rec)
547 {
548 	return rec->opts.nr_cblocks > 0;
549 }
550 
551 #define MMAP_FLUSH_DEFAULT 1
552 static int record__mmap_flush_parse(const struct option *opt,
553 				    const char *str,
554 				    int unset)
555 {
556 	int flush_max;
557 	struct record_opts *opts = (struct record_opts *)opt->value;
558 	static struct parse_tag tags[] = {
559 			{ .tag  = 'B', .mult = 1       },
560 			{ .tag  = 'K', .mult = 1 << 10 },
561 			{ .tag  = 'M', .mult = 1 << 20 },
562 			{ .tag  = 'G', .mult = 1 << 30 },
563 			{ .tag  = 0 },
564 	};
565 
566 	if (unset)
567 		return 0;
568 
569 	if (str) {
570 		opts->mmap_flush = parse_tag_value(str, tags);
571 		if (opts->mmap_flush == (int)-1)
572 			opts->mmap_flush = strtol(str, NULL, 0);
573 	}
574 
575 	if (!opts->mmap_flush)
576 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
577 
578 	flush_max = evlist__mmap_size(opts->mmap_pages);
579 	flush_max /= 4;
580 	if (opts->mmap_flush > flush_max)
581 		opts->mmap_flush = flush_max;
582 
583 	return 0;
584 }
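/*
 * A usage sketch, assuming the --mmap-flush option of perf record:
 *
 *   perf record --mmap-flush=2M -- ./workload
 *
 * sets the minimum amount of mmap data that gets flushed to the output in
 * one go; values accept B/K/M/G suffixes and are capped at a quarter of
 * the mmap size (flush_max above).
 */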
585 
586 #ifdef HAVE_ZSTD_SUPPORT
587 static unsigned int comp_level_default = 1;
588 
589 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
590 {
591 	struct record_opts *opts = opt->value;
592 
593 	if (unset) {
594 		opts->comp_level = 0;
595 	} else {
596 		if (str)
597 			opts->comp_level = strtol(str, NULL, 0);
598 		if (!opts->comp_level)
599 			opts->comp_level = comp_level_default;
600 	}
601 
602 	return 0;
603 }
604 #endif
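/*
 * Zstd compression of the trace stream, assuming the -z/--compression-level
 * option of perf record: level 1 (comp_level_default) is used when no level
 * is given, and comp_level_max below is treated as the upper bound.
 */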
605 static unsigned int comp_level_max = 22;
606 
607 static int record__comp_enabled(struct record *rec)
608 {
609 	return rec->opts.comp_level > 0;
610 }
611 
612 static int process_synthesized_event(struct perf_tool *tool,
613 				     union perf_event *event,
614 				     struct perf_sample *sample __maybe_unused,
615 				     struct machine *machine __maybe_unused)
616 {
617 	struct record *rec = container_of(tool, struct record, tool);
618 	return record__write(rec, NULL, event, event->header.size);
619 }
620 
621 static struct mutex synth_lock;
622 
623 static int process_locked_synthesized_event(struct perf_tool *tool,
624 				     union perf_event *event,
625 				     struct perf_sample *sample __maybe_unused,
626 				     struct machine *machine __maybe_unused)
627 {
628 	int ret;
629 
630 	mutex_lock(&synth_lock);
631 	ret = process_synthesized_event(tool, event, sample, machine);
632 	mutex_unlock(&synth_lock);
633 	return ret;
634 }
635 
636 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
637 {
638 	struct record *rec = to;
639 
640 	if (record__comp_enabled(rec)) {
641 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
642 						   mmap__mmap_len(map), bf, size);
643 
644 		if (compressed < 0)
645 			return (int)compressed;
646 
647 		size = compressed;
648 		bf   = map->data;
649 	}
650 
651 	thread->samples++;
652 	return record__write(rec, map, bf, size);
653 }
654 
655 static volatile sig_atomic_t signr = -1;
656 static volatile sig_atomic_t child_finished;
657 #ifdef HAVE_EVENTFD_SUPPORT
658 static volatile sig_atomic_t done_fd = -1;
659 #endif
660 
661 static void sig_handler(int sig)
662 {
663 	if (sig == SIGCHLD)
664 		child_finished = 1;
665 	else
666 		signr = sig;
667 
668 	done = 1;
669 #ifdef HAVE_EVENTFD_SUPPORT
670 	if (done_fd >= 0) {
671 		u64 tmp = 1;
672 		int orig_errno = errno;
673 
674 		/*
675 		 * It is possible for this signal handler to run after done is
676 		 * checked in the main loop, but before the perf counter fds are
677 		 * polled. If this happens, the poll() will continue to wait
678 		 * even though done is set, and will only break out if either
679 		 * another signal is received, or the counters are ready for
680 		 * read. To ensure the poll() doesn't sleep when done is set,
681 		 * use an eventfd (done_fd) to wake up the poll().
682 		 */
683 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
684 			pr_err("failed to signal wakeup fd, error: %m\n");
685 
686 		errno = orig_errno;
687 	}
688 #endif // HAVE_EVENTFD_SUPPORT
689 }
690 
691 static void sigsegv_handler(int sig)
692 {
693 	perf_hooks__recover();
694 	sighandler_dump_stack(sig);
695 }
696 
697 static void record__sig_exit(void)
698 {
699 	if (signr == -1)
700 		return;
701 
702 	signal(signr, SIG_DFL);
703 	raise(signr);
704 }
705 
706 #ifdef HAVE_AUXTRACE_SUPPORT
707 
708 static int record__process_auxtrace(struct perf_tool *tool,
709 				    struct mmap *map,
710 				    union perf_event *event, void *data1,
711 				    size_t len1, void *data2, size_t len2)
712 {
713 	struct record *rec = container_of(tool, struct record, tool);
714 	struct perf_data *data = &rec->data;
715 	size_t padding;
716 	u8 pad[8] = {0};
717 
718 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
719 		off_t file_offset;
720 		int fd = perf_data__fd(data);
721 		int err;
722 
723 		file_offset = lseek(fd, 0, SEEK_CUR);
724 		if (file_offset == -1)
725 			return -1;
726 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
727 						     event, file_offset);
728 		if (err)
729 			return err;
730 	}
731 
732 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
733 	padding = (len1 + len2) & 7;
734 	if (padding)
735 		padding = 8 - padding;
736 
737 	record__write(rec, map, event, event->header.size);
738 	record__write(rec, map, data1, len1);
739 	if (len2)
740 		record__write(rec, map, data2, len2);
741 	record__write(rec, map, &pad, padding);
742 
743 	return 0;
744 }
745 
746 static int record__auxtrace_mmap_read(struct record *rec,
747 				      struct mmap *map)
748 {
749 	int ret;
750 
751 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
752 				  record__process_auxtrace);
753 	if (ret < 0)
754 		return ret;
755 
756 	if (ret)
757 		rec->samples++;
758 
759 	return 0;
760 }
761 
762 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
763 					       struct mmap *map)
764 {
765 	int ret;
766 
767 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
768 					   record__process_auxtrace,
769 					   rec->opts.auxtrace_snapshot_size);
770 	if (ret < 0)
771 		return ret;
772 
773 	if (ret)
774 		rec->samples++;
775 
776 	return 0;
777 }
778 
779 static int record__auxtrace_read_snapshot_all(struct record *rec)
780 {
781 	int i;
782 	int rc = 0;
783 
784 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
785 		struct mmap *map = &rec->evlist->mmap[i];
786 
787 		if (!map->auxtrace_mmap.base)
788 			continue;
789 
790 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
791 			rc = -1;
792 			goto out;
793 		}
794 	}
795 out:
796 	return rc;
797 }
798 
799 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
800 {
801 	pr_debug("Recording AUX area tracing snapshot\n");
802 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
803 		trigger_error(&auxtrace_snapshot_trigger);
804 	} else {
805 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
806 			trigger_error(&auxtrace_snapshot_trigger);
807 		else
808 			trigger_ready(&auxtrace_snapshot_trigger);
809 	}
810 }
811 
812 static int record__auxtrace_snapshot_exit(struct record *rec)
813 {
814 	if (trigger_is_error(&auxtrace_snapshot_trigger))
815 		return 0;
816 
817 	if (!auxtrace_record__snapshot_started &&
818 	    auxtrace_record__snapshot_start(rec->itr))
819 		return -1;
820 
821 	record__read_auxtrace_snapshot(rec, true);
822 	if (trigger_is_error(&auxtrace_snapshot_trigger))
823 		return -1;
824 
825 	return 0;
826 }
827 
828 static int record__auxtrace_init(struct record *rec)
829 {
830 	int err;
831 
832 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
833 	    && record__threads_enabled(rec)) {
834 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
835 		return -EINVAL;
836 	}
837 
838 	if (!rec->itr) {
839 		rec->itr = auxtrace_record__init(rec->evlist, &err);
840 		if (err)
841 			return err;
842 	}
843 
844 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
845 					      rec->opts.auxtrace_snapshot_opts);
846 	if (err)
847 		return err;
848 
849 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
850 					    rec->opts.auxtrace_sample_opts);
851 	if (err)
852 		return err;
853 
854 	auxtrace_regroup_aux_output(rec->evlist);
855 
856 	return auxtrace_parse_filters(rec->evlist);
857 }
858 
859 #else
860 
861 static inline
862 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
863 			       struct mmap *map __maybe_unused)
864 {
865 	return 0;
866 }
867 
868 static inline
869 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
870 				    bool on_exit __maybe_unused)
871 {
872 }
873 
874 static inline
875 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
876 {
877 	return 0;
878 }
879 
880 static inline
881 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
882 {
883 	return 0;
884 }
885 
886 static int record__auxtrace_init(struct record *rec __maybe_unused)
887 {
888 	return 0;
889 }
890 
891 #endif
892 
893 static int record__config_text_poke(struct evlist *evlist)
894 {
895 	struct evsel *evsel;
896 
897 	/* Nothing to do if text poke is already configured */
898 	evlist__for_each_entry(evlist, evsel) {
899 		if (evsel->core.attr.text_poke)
900 			return 0;
901 	}
902 
903 	evsel = evlist__add_dummy_on_all_cpus(evlist);
904 	if (!evsel)
905 		return -ENOMEM;
906 
907 	evsel->core.attr.text_poke = 1;
908 	evsel->core.attr.ksymbol = 1;
909 	evsel->immediate = true;
910 	evsel__set_sample_bit(evsel, TIME);
911 
912 	return 0;
913 }
914 
915 static int record__config_off_cpu(struct record *rec)
916 {
917 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
918 }
919 
920 static bool record__tracking_system_wide(struct record *rec)
921 {
922 	struct evlist *evlist = rec->evlist;
923 	struct evsel *evsel;
924 
925 	/*
926 	 * If a non-dummy evsel exists, system-wide sideband is needed to
927 	 * help parse sample information.
928 	 * For example, PERF_RECORD_MMAP events help resolve symbols,
929 	 * and PERF_RECORD_COMM events provide the task executable name.
930 	 */
931 	evlist__for_each_entry(evlist, evsel) {
932 		if (!evsel__is_dummy_event(evsel))
933 			return true;
934 	}
935 
936 	return false;
937 }
938 
939 static int record__config_tracking_events(struct record *rec)
940 {
941 	struct record_opts *opts = &rec->opts;
942 	struct evlist *evlist = rec->evlist;
943 	bool system_wide = false;
944 	struct evsel *evsel;
945 
946 	/*
947 	 * For initial_delay, system wide or a hybrid system, we need to add
948 	 * a tracking event so that we can track PERF_RECORD_MMAP to cover the
949 	 * delay of waiting or of event synthesis.
950 	 */
951 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
952 	    perf_pmus__num_core_pmus() > 1) {
953 
954 		/*
955 		 * User space tasks can migrate between CPUs, so when tracing
956 		 * selected CPUs, sideband for all CPUs is still needed.
957 		 */
958 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
959 			system_wide = true;
960 
961 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
962 		if (!evsel)
963 			return -ENOMEM;
964 
965 		/*
966 		 * Enable the tracking event when the process is forked for
967 		 * initial_delay, or immediately for system-wide tracing.
968 		 */
969 		if (opts->target.initial_delay && !evsel->immediate &&
970 		    !target__has_cpu(&opts->target))
971 			evsel->core.attr.enable_on_exec = 1;
972 		else
973 			evsel->immediate = 1;
974 	}
975 
976 	return 0;
977 }
978 
979 static bool record__kcore_readable(struct machine *machine)
980 {
981 	char kcore[PATH_MAX];
982 	int fd;
983 
984 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
985 
986 	fd = open(kcore, O_RDONLY);
987 	if (fd < 0)
988 		return false;
989 
990 	close(fd);
991 
992 	return true;
993 }
994 
995 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
996 {
997 	char from_dir[PATH_MAX];
998 	char kcore_dir[PATH_MAX];
999 	int ret;
1000 
1001 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1002 
1003 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1004 	if (ret)
1005 		return ret;
1006 
1007 	return kcore_copy(from_dir, kcore_dir);
1008 }
1009 
1010 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1011 {
1012 	thread_data->pipes.msg[0] = -1;
1013 	thread_data->pipes.msg[1] = -1;
1014 	thread_data->pipes.ack[0] = -1;
1015 	thread_data->pipes.ack[1] = -1;
1016 }
1017 
1018 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1019 {
1020 	if (pipe(thread_data->pipes.msg))
1021 		return -EINVAL;
1022 
1023 	if (pipe(thread_data->pipes.ack)) {
1024 		close(thread_data->pipes.msg[0]);
1025 		thread_data->pipes.msg[0] = -1;
1026 		close(thread_data->pipes.msg[1]);
1027 		thread_data->pipes.msg[1] = -1;
1028 		return -EINVAL;
1029 	}
1030 
1031 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1032 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1033 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1034 
1035 	return 0;
1036 }
1037 
1038 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1039 {
1040 	if (thread_data->pipes.msg[0] != -1) {
1041 		close(thread_data->pipes.msg[0]);
1042 		thread_data->pipes.msg[0] = -1;
1043 	}
1044 	if (thread_data->pipes.msg[1] != -1) {
1045 		close(thread_data->pipes.msg[1]);
1046 		thread_data->pipes.msg[1] = -1;
1047 	}
1048 	if (thread_data->pipes.ack[0] != -1) {
1049 		close(thread_data->pipes.ack[0]);
1050 		thread_data->pipes.ack[0] = -1;
1051 	}
1052 	if (thread_data->pipes.ack[1] != -1) {
1053 		close(thread_data->pipes.ack[1]);
1054 		thread_data->pipes.ack[1] = -1;
1055 	}
1056 }
1057 
1058 static bool evlist__per_thread(struct evlist *evlist)
1059 {
1060 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1061 }
1062 
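/*
 * Assign a subset of the evlist mmaps to one writer thread. In per-thread
 * mode (dummy CPU map) the single thread gets all mmaps; otherwise the
 * thread gets exactly the mmaps whose CPUs are set in its mask->maps bitmap.
 */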
1063 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1064 {
1065 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1066 	struct mmap *mmap = evlist->mmap;
1067 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1068 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1069 	bool per_thread = evlist__per_thread(evlist);
1070 
1071 	if (per_thread)
1072 		thread_data->nr_mmaps = nr_mmaps;
1073 	else
1074 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1075 						      thread_data->mask->maps.nbits);
1076 	if (mmap) {
1077 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1078 		if (!thread_data->maps)
1079 			return -ENOMEM;
1080 	}
1081 	if (overwrite_mmap) {
1082 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1083 		if (!thread_data->overwrite_maps) {
1084 			zfree(&thread_data->maps);
1085 			return -ENOMEM;
1086 		}
1087 	}
1088 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1089 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1090 
1091 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1092 		if (per_thread ||
1093 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1094 			if (thread_data->maps) {
1095 				thread_data->maps[tm] = &mmap[m];
1096 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1097 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1098 			}
1099 			if (thread_data->overwrite_maps) {
1100 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1101 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1102 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1103 			}
1104 			tm++;
1105 		}
1106 	}
1107 
1108 	return 0;
1109 }
1110 
1111 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1112 {
1113 	int f, tm, pos;
1114 	struct mmap *map, *overwrite_map;
1115 
1116 	fdarray__init(&thread_data->pollfd, 64);
1117 
1118 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1119 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1120 		overwrite_map = thread_data->overwrite_maps ?
1121 				thread_data->overwrite_maps[tm] : NULL;
1122 
1123 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1124 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1125 
1126 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1127 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1128 							      &evlist->core.pollfd);
1129 				if (pos < 0)
1130 					return pos;
1131 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1132 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1133 			}
1134 		}
1135 	}
1136 
1137 	return 0;
1138 }
1139 
1140 static void record__free_thread_data(struct record *rec)
1141 {
1142 	int t;
1143 	struct record_thread *thread_data = rec->thread_data;
1144 
1145 	if (thread_data == NULL)
1146 		return;
1147 
1148 	for (t = 0; t < rec->nr_threads; t++) {
1149 		record__thread_data_close_pipes(&thread_data[t]);
1150 		zfree(&thread_data[t].maps);
1151 		zfree(&thread_data[t].overwrite_maps);
1152 		fdarray__exit(&thread_data[t].pollfd);
1153 	}
1154 
1155 	zfree(&rec->thread_data);
1156 }
1157 
1158 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1159 						    int evlist_pollfd_index,
1160 						    int thread_pollfd_index)
1161 {
1162 	size_t x = rec->index_map_cnt;
1163 
1164 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1165 		return -ENOMEM;
1166 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1167 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1168 	rec->index_map_cnt += 1;
1169 	return 0;
1170 }
1171 
1172 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1173 						    struct evlist *evlist,
1174 						    struct record_thread *thread_data)
1175 {
1176 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1177 	struct pollfd *t_entries = thread_data->pollfd.entries;
1178 	int err = 0;
1179 	size_t i;
1180 
1181 	for (i = 0; i < rec->index_map_cnt; i++) {
1182 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1183 		int t_pos = rec->index_map[i].thread_pollfd_index;
1184 
1185 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1186 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1187 			pr_err("Thread and evlist pollfd index mismatch\n");
1188 			err = -EINVAL;
1189 			continue;
1190 		}
1191 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1192 	}
1193 	return err;
1194 }
1195 
1196 static int record__dup_non_perf_events(struct record *rec,
1197 				       struct evlist *evlist,
1198 				       struct record_thread *thread_data)
1199 {
1200 	struct fdarray *fda = &evlist->core.pollfd;
1201 	int i, ret;
1202 
1203 	for (i = 0; i < fda->nr; i++) {
1204 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1205 			continue;
1206 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1207 		if (ret < 0) {
1208 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1209 			return ret;
1210 		}
1211 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1212 			  thread_data, ret, fda->entries[i].fd);
1213 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1214 		if (ret < 0) {
1215 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1216 			return ret;
1217 		}
1218 	}
1219 	return 0;
1220 }
1221 
1222 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1223 {
1224 	int t, ret;
1225 	struct record_thread *thread_data;
1226 
1227 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1228 	if (!rec->thread_data) {
1229 		pr_err("Failed to allocate thread data\n");
1230 		return -ENOMEM;
1231 	}
1232 	thread_data = rec->thread_data;
1233 
1234 	for (t = 0; t < rec->nr_threads; t++)
1235 		record__thread_data_init_pipes(&thread_data[t]);
1236 
1237 	for (t = 0; t < rec->nr_threads; t++) {
1238 		thread_data[t].rec = rec;
1239 		thread_data[t].mask = &rec->thread_masks[t];
1240 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1241 		if (ret) {
1242 			pr_err("Failed to initialize thread[%d] maps\n", t);
1243 			goto out_free;
1244 		}
1245 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1246 		if (ret) {
1247 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1248 			goto out_free;
1249 		}
1250 		if (t) {
1251 			thread_data[t].tid = -1;
1252 			ret = record__thread_data_open_pipes(&thread_data[t]);
1253 			if (ret) {
1254 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1255 				goto out_free;
1256 			}
1257 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1258 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1259 			if (ret < 0) {
1260 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1261 				goto out_free;
1262 			}
1263 			thread_data[t].ctlfd_pos = ret;
1264 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1265 				 thread_data, thread_data[t].ctlfd_pos,
1266 				 thread_data[t].pipes.msg[0]);
1267 		} else {
1268 			thread_data[t].tid = gettid();
1269 
1270 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1271 			if (ret < 0)
1272 				goto out_free;
1273 
1274 			thread_data[t].ctlfd_pos = -1; /* Not used */
1275 		}
1276 	}
1277 
1278 	return 0;
1279 
1280 out_free:
1281 	record__free_thread_data(rec);
1282 
1283 	return ret;
1284 }
1285 
1286 static int record__mmap_evlist(struct record *rec,
1287 			       struct evlist *evlist)
1288 {
1289 	int i, ret;
1290 	struct record_opts *opts = &rec->opts;
1291 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1292 				  opts->auxtrace_sample_mode;
1293 	char msg[512];
1294 
1295 	if (opts->affinity != PERF_AFFINITY_SYS)
1296 		cpu__setup_cpunode_map();
1297 
1298 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1299 				 opts->auxtrace_mmap_pages,
1300 				 auxtrace_overwrite,
1301 				 opts->nr_cblocks, opts->affinity,
1302 				 opts->mmap_flush, opts->comp_level) < 0) {
1303 		if (errno == EPERM) {
1304 			pr_err("Permission error mapping pages.\n"
1305 			       "Consider increasing "
1306 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1307 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1308 			       "(current value: %u,%u)\n",
1309 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1310 			return -errno;
1311 		} else {
1312 			pr_err("failed to mmap with %d (%s)\n", errno,
1313 				str_error_r(errno, msg, sizeof(msg)));
1314 			if (errno)
1315 				return -errno;
1316 			else
1317 				return -EINVAL;
1318 		}
1319 	}
1320 
1321 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1322 		return -1;
1323 
1324 	ret = record__alloc_thread_data(rec, evlist);
1325 	if (ret)
1326 		return ret;
1327 
1328 	if (record__threads_enabled(rec)) {
1329 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1330 		if (ret) {
1331 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1332 			return ret;
1333 		}
1334 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1335 			if (evlist->mmap)
1336 				evlist->mmap[i].file = &rec->data.dir.files[i];
1337 			if (evlist->overwrite_mmap)
1338 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1339 		}
1340 	}
1341 
1342 	return 0;
1343 }
1344 
1345 static int record__mmap(struct record *rec)
1346 {
1347 	return record__mmap_evlist(rec, rec->evlist);
1348 }
1349 
1350 static int record__open(struct record *rec)
1351 {
1352 	char msg[BUFSIZ];
1353 	struct evsel *pos;
1354 	struct evlist *evlist = rec->evlist;
1355 	struct perf_session *session = rec->session;
1356 	struct record_opts *opts = &rec->opts;
1357 	int rc = 0;
1358 
1359 	evlist__for_each_entry(evlist, pos) {
1360 try_again:
1361 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1362 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1363 				if (verbose > 0)
1364 					ui__warning("%s\n", msg);
1365 				goto try_again;
1366 			}
1367 			if ((errno == EINVAL || errno == EBADF) &&
1368 			    pos->core.leader != &pos->core &&
1369 			    pos->weak_group) {
1370 			        pos = evlist__reset_weak_group(evlist, pos, true);
1371 				goto try_again;
1372 			}
1373 			rc = -errno;
1374 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1375 			ui__error("%s\n", msg);
1376 			goto out;
1377 		}
1378 
1379 		pos->supported = true;
1380 	}
1381 
1382 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1383 		pr_warning(
1384 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1385 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1386 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1387 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1388 "Samples in kernel modules won't be resolved at all.\n\n"
1389 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1390 "even with a suitable vmlinux or kallsyms file.\n\n");
1391 	}
1392 
1393 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1394 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1395 			pos->filter ?: "BPF", evsel__name(pos), errno,
1396 			str_error_r(errno, msg, sizeof(msg)));
1397 		rc = -1;
1398 		goto out;
1399 	}
1400 
1401 	rc = record__mmap(rec);
1402 	if (rc)
1403 		goto out;
1404 
1405 	session->evlist = evlist;
1406 	perf_session__set_id_hdr_size(session);
1407 out:
1408 	return rc;
1409 }
1410 
1411 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1412 {
1413 	if (rec->evlist->first_sample_time == 0)
1414 		rec->evlist->first_sample_time = sample_time;
1415 
1416 	if (sample_time)
1417 		rec->evlist->last_sample_time = sample_time;
1418 }
1419 
1420 static int process_sample_event(struct perf_tool *tool,
1421 				union perf_event *event,
1422 				struct perf_sample *sample,
1423 				struct evsel *evsel,
1424 				struct machine *machine)
1425 {
1426 	struct record *rec = container_of(tool, struct record, tool);
1427 
1428 	set_timestamp_boundary(rec, sample->time);
1429 
1430 	if (rec->buildid_all)
1431 		return 0;
1432 
1433 	rec->samples++;
1434 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1435 }
1436 
1437 static int process_buildids(struct record *rec)
1438 {
1439 	struct perf_session *session = rec->session;
1440 
1441 	if (perf_data__size(&rec->data) == 0)
1442 		return 0;
1443 
1444 	/*
1445 	 * During this process, it'll load the kernel map and replace the
1446 	 * dso->long_name with a real pathname it found.  In this case
1447 	 * we prefer a vmlinux path like
1448 	 *   /lib/modules/3.16.4/build/vmlinux
1449 	 *
1450 	 * rather than a build-id path (in the debug directory), e.g.
1451 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1452 	 */
1453 	symbol_conf.ignore_vmlinux_buildid = true;
1454 
1455 	/*
1456 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1457 	 * so there is no need to process samples. But if timestamp_boundary is
1458 	 * enabled, it still needs to walk all samples to get the timestamps of
1459 	 * the first/last samples.
1460 	 */
1461 	if (rec->buildid_all && !rec->timestamp_boundary)
1462 		rec->tool.sample = NULL;
1463 
1464 	return perf_session__process_events(session);
1465 }
1466 
1467 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1468 {
1469 	int err;
1470 	struct perf_tool *tool = data;
1471 	/*
1472 	 * As for the guest kernel, when processing the record & report
1473 	 * subcommands we arrange the module mmaps prior to the guest kernel
1474 	 * mmap and trigger a dso preload, because default guest module symbols
1475 	 * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX. This
1476 	 * method is used to avoid missing symbols when the first address is
1477 	 * in a module instead of in the guest kernel.
1478 	 */
1479 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1480 					     machine);
1481 	if (err < 0)
1482 		pr_err("Couldn't record guest kernel [%d]'s reference"
1483 		       " relocation symbol.\n", machine->pid);
1484 
1485 	/*
1486 	 * We use _stext for the guest kernel because the guest kernel's
1487 	 * /proc/kallsyms sometimes has no _text.
1488 	 */
1489 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1490 						 machine);
1491 	if (err < 0)
1492 		pr_err("Couldn't record guest kernel [%d]'s reference"
1493 		       " relocation symbol.\n", machine->pid);
1494 }
1495 
1496 static struct perf_event_header finished_round_event = {
1497 	.size = sizeof(struct perf_event_header),
1498 	.type = PERF_RECORD_FINISHED_ROUND,
1499 };
1500 
1501 static struct perf_event_header finished_init_event = {
1502 	.size = sizeof(struct perf_event_header),
1503 	.type = PERF_RECORD_FINISHED_INIT,
1504 };
1505 
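/*
 * With --affinity=node or --affinity=cpu, migrate the tool thread next to
 * the mmap it is about to read so the copy stays close to the buffer; the
 * thread's affinity mask is rewritten to the mmap's affinity mask.
 */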
1506 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1507 {
1508 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1509 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1510 			  thread->mask->affinity.nbits)) {
1511 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1512 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1513 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1514 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1515 					(cpu_set_t *)thread->mask->affinity.bits);
1516 		if (verbose == 2) {
1517 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1518 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1519 		}
1520 	}
1521 }
1522 
1523 static size_t process_comp_header(void *record, size_t increment)
1524 {
1525 	struct perf_record_compressed *event = record;
1526 	size_t size = sizeof(*event);
1527 
1528 	if (increment) {
1529 		event->header.size += increment;
1530 		return increment;
1531 	}
1532 
1533 	event->header.type = PERF_RECORD_COMPRESSED;
1534 	event->header.size = size;
1535 
1536 	return size;
1537 }
1538 
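/*
 * Compress a chunk of mmap data into PERF_RECORD_COMPRESSED records using
 * the session-wide zstd stream, or the per-mmap stream when writing to a
 * per-thread file, and account transferred vs. compressed byte totals.
 */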
1539 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1540 			    void *dst, size_t dst_size, void *src, size_t src_size)
1541 {
1542 	ssize_t compressed;
1543 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1544 	struct zstd_data *zstd_data = &session->zstd_data;
1545 
1546 	if (map && map->file)
1547 		zstd_data = &map->zstd_data;
1548 
1549 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1550 						     max_record_size, process_comp_header);
1551 	if (compressed < 0)
1552 		return compressed;
1553 
1554 	if (map && map->file) {
1555 		thread->bytes_transferred += src_size;
1556 		thread->bytes_compressed  += compressed;
1557 	} else {
1558 		session->bytes_transferred += src_size;
1559 		session->bytes_compressed  += compressed;
1560 	}
1561 
1562 	return compressed;
1563 }
1564 
1565 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1566 				    bool overwrite, bool synch)
1567 {
1568 	u64 bytes_written = rec->bytes_written;
1569 	int i;
1570 	int rc = 0;
1571 	int nr_mmaps;
1572 	struct mmap **maps;
1573 	int trace_fd = rec->data.file.fd;
1574 	off_t off = 0;
1575 
1576 	if (!evlist)
1577 		return 0;
1578 
1579 	nr_mmaps = thread->nr_mmaps;
1580 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1581 
1582 	if (!maps)
1583 		return 0;
1584 
1585 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1586 		return 0;
1587 
1588 	if (record__aio_enabled(rec))
1589 		off = record__aio_get_pos(trace_fd);
1590 
1591 	for (i = 0; i < nr_mmaps; i++) {
1592 		u64 flush = 0;
1593 		struct mmap *map = maps[i];
1594 
1595 		if (map->core.base) {
1596 			record__adjust_affinity(rec, map);
1597 			if (synch) {
1598 				flush = map->core.flush;
1599 				map->core.flush = 1;
1600 			}
1601 			if (!record__aio_enabled(rec)) {
1602 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1603 					if (synch)
1604 						map->core.flush = flush;
1605 					rc = -1;
1606 					goto out;
1607 				}
1608 			} else {
1609 				if (record__aio_push(rec, map, &off) < 0) {
1610 					record__aio_set_pos(trace_fd, off);
1611 					if (synch)
1612 						map->core.flush = flush;
1613 					rc = -1;
1614 					goto out;
1615 				}
1616 			}
1617 			if (synch)
1618 				map->core.flush = flush;
1619 		}
1620 
1621 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1622 		    !rec->opts.auxtrace_sample_mode &&
1623 		    record__auxtrace_mmap_read(rec, map) != 0) {
1624 			rc = -1;
1625 			goto out;
1626 		}
1627 	}
1628 
1629 	if (record__aio_enabled(rec))
1630 		record__aio_set_pos(trace_fd, off);
1631 
1632 	/*
1633 	 * Mark the round finished in case we wrote
1634 	 * at least one event.
1635 	 *
1636 	 * No need for round events in directory mode,
1637 	 * because per-cpu maps and files have data
1638 	 * sorted by the kernel.
1639 	 */
1640 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1641 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1642 
1643 	if (overwrite)
1644 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1645 out:
1646 	return rc;
1647 }
1648 
1649 static int record__mmap_read_all(struct record *rec, bool synch)
1650 {
1651 	int err;
1652 
1653 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1654 	if (err)
1655 		return err;
1656 
1657 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1658 }
1659 
1660 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1661 					   void *arg __maybe_unused)
1662 {
1663 	struct perf_mmap *map = fda->priv[fd].ptr;
1664 
1665 	if (map)
1666 		perf_mmap__put(map);
1667 }
1668 
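/*
 * Body of a --threads worker: signal readiness over the ack pipe, then loop
 * reading this thread's mmaps and polling its fds; a POLLHUP on the message
 * pipe from the main thread requests termination, after which the remaining
 * data is flushed and a final READY message is sent back.
 */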
1669 static void *record__thread(void *arg)
1670 {
1671 	enum thread_msg msg = THREAD_MSG__READY;
1672 	bool terminate = false;
1673 	struct fdarray *pollfd;
1674 	int err, ctlfd_pos;
1675 
1676 	thread = arg;
1677 	thread->tid = gettid();
1678 
1679 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1680 	if (err == -1)
1681 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1682 			   thread->tid, strerror(errno));
1683 
1684 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1685 
1686 	pollfd = &thread->pollfd;
1687 	ctlfd_pos = thread->ctlfd_pos;
1688 
1689 	for (;;) {
1690 		unsigned long long hits = thread->samples;
1691 
1692 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1693 			break;
1694 
1695 		if (hits == thread->samples) {
1696 
1697 			err = fdarray__poll(pollfd, -1);
1698 			/*
1699 			 * Propagate the error only if there is one. Ignore a positive
1700 			 * number of returned events and interrupt errors.
1701 			 */
1702 			if (err > 0 || (err < 0 && errno == EINTR))
1703 				err = 0;
1704 			thread->waking++;
1705 
1706 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1707 					    record__thread_munmap_filtered, NULL) == 0)
1708 				break;
1709 		}
1710 
1711 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1712 			terminate = true;
1713 			close(thread->pipes.msg[0]);
1714 			thread->pipes.msg[0] = -1;
1715 			pollfd->entries[ctlfd_pos].fd = -1;
1716 			pollfd->entries[ctlfd_pos].events = 0;
1717 		}
1718 
1719 		pollfd->entries[ctlfd_pos].revents = 0;
1720 	}
1721 	record__mmap_read_all(thread->rec, true);
1722 
1723 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1724 	if (err == -1)
1725 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1726 			   thread->tid, strerror(errno));
1727 
1728 	return NULL;
1729 }
1730 
1731 static void record__init_features(struct record *rec)
1732 {
1733 	struct perf_session *session = rec->session;
1734 	int feat;
1735 
1736 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1737 		perf_header__set_feat(&session->header, feat);
1738 
1739 	if (rec->no_buildid)
1740 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1741 
1742 #ifdef HAVE_LIBTRACEEVENT
1743 	if (!have_tracepoints(&rec->evlist->core.entries))
1744 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1745 #endif
1746 
1747 	if (!rec->opts.branch_stack)
1748 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1749 
1750 	if (!rec->opts.full_auxtrace)
1751 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1752 
1753 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1754 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1755 
1756 	if (!rec->opts.use_clockid)
1757 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1758 
1759 	if (!record__threads_enabled(rec))
1760 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1761 
1762 	if (!record__comp_enabled(rec))
1763 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1764 
1765 	perf_header__clear_feat(&session->header, HEADER_STAT);
1766 }
1767 
1768 static void
1769 record__finish_output(struct record *rec)
1770 {
1771 	int i;
1772 	struct perf_data *data = &rec->data;
1773 	int fd = perf_data__fd(data);
1774 
1775 	if (data->is_pipe) {
1776 		/* Just to display approx. size */
1777 		data->file.size = rec->bytes_written;
1778 		return;
1779 	}
1780 
1781 	rec->session->header.data_size += rec->bytes_written;
1782 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1783 	if (record__threads_enabled(rec)) {
1784 		for (i = 0; i < data->dir.nr; i++)
1785 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1786 	}
1787 
1788 	if (!rec->no_buildid) {
1789 		process_buildids(rec);
1790 
1791 		if (rec->buildid_all)
1792 			perf_session__dsos_hit_all(rec->session);
1793 	}
1794 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1795 
1796 	return;
1797 }
1798 
1799 static int record__synthesize_workload(struct record *rec, bool tail)
1800 {
1801 	int err;
1802 	struct perf_thread_map *thread_map;
1803 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1804 
1805 	if (rec->opts.tail_synthesize != tail)
1806 		return 0;
1807 
1808 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1809 	if (thread_map == NULL)
1810 		return -1;
1811 
1812 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1813 						 process_synthesized_event,
1814 						 &rec->session->machines.host,
1815 						 needs_mmap,
1816 						 rec->opts.sample_address);
1817 	perf_thread_map__put(thread_map);
1818 	return err;
1819 }
1820 
1821 static int write_finished_init(struct record *rec, bool tail)
1822 {
1823 	if (rec->opts.tail_synthesize != tail)
1824 		return 0;
1825 
1826 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1827 }
1828 
1829 static int record__synthesize(struct record *rec, bool tail);
1830 
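/*
 * Rotate the output: flush outstanding aio writes, synthesize "tail" events,
 * finalize the current file's header and switch to a new perf.data.<timestamp>
 * file. When a maximum number of files is configured, the oldest entry in the
 * switch_output.filenames ring is removed.
 */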
1831 static int
1832 record__switch_output(struct record *rec, bool at_exit)
1833 {
1834 	struct perf_data *data = &rec->data;
1835 	char *new_filename = NULL;
1836 	int fd, err;
1837 
1838 	/* Same size as a real timestamp, e.g. "2015122520103046" */
1839 	char timestamp[] = "InvalidTimestamp";
1840 
1841 	record__aio_mmap_read_sync(rec);
1842 
1843 	write_finished_init(rec, true);
1844 
1845 	record__synthesize(rec, true);
1846 	if (target__none(&rec->opts.target))
1847 		record__synthesize_workload(rec, true);
1848 
1849 	rec->samples = 0;
1850 	record__finish_output(rec);
1851 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1852 	if (err) {
1853 		pr_err("Failed to get current timestamp\n");
1854 		return -EINVAL;
1855 	}
1856 
1857 	fd = perf_data__switch(data, timestamp,
1858 			       rec->session->header.data_offset,
1859 			       at_exit, &new_filename);
1860 	if (fd >= 0 && !at_exit) {
1861 		rec->bytes_written = 0;
1862 		rec->session->header.data_size = 0;
1863 	}
1864 
1865 	if (!quiet) {
1866 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1867 			data->path, timestamp);
1868 	}
1869 
1870 	if (rec->switch_output.num_files) {
1871 		int n = rec->switch_output.cur_file + 1;
1872 
1873 		if (n >= rec->switch_output.num_files)
1874 			n = 0;
1875 		rec->switch_output.cur_file = n;
1876 		if (rec->switch_output.filenames[n]) {
1877 			remove(rec->switch_output.filenames[n]);
1878 			zfree(&rec->switch_output.filenames[n]);
1879 		}
1880 		rec->switch_output.filenames[n] = new_filename;
1881 	} else {
1882 		free(new_filename);
1883 	}
1884 
1885 	/* Output tracking events */
1886 	if (!at_exit) {
1887 		record__synthesize(rec, false);
1888 
1889 		/*
1890 		 * In 'perf record --switch-output' without -a,
1891 		 * record__synthesize() in record__switch_output() won't
1892 		 * generate tracking events because there's no thread_map
1893 		 * in the evlist. As a result, the newly created perf.data doesn't
1894 		 * contain map and comm information.
1895 		 * Create a fake thread_map and directly call
1896 		 * perf_event__synthesize_thread_map() for those events.
1897 		 */
1898 		if (target__none(&rec->opts.target))
1899 			record__synthesize_workload(rec, false);
1900 		write_finished_init(rec, false);
1901 	}
1902 	return fd;
1903 }
1904 
1905 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1906 					struct perf_record_lost_samples *lost,
1907 					int cpu_idx, int thread_idx, u64 lost_count,
1908 					u16 misc_flag)
1909 {
1910 	struct perf_sample_id *sid;
1911 	struct perf_sample sample = {};
1912 	int id_hdr_size;
1913 
1914 	lost->lost = lost_count;
1915 	if (evsel->core.ids) {
1916 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1917 		sample.id = sid->id;
1918 	}
1919 
1920 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1921 						       evsel->core.attr.sample_type, &sample);
1922 	lost->header.size = sizeof(*lost) + id_hdr_size;
1923 	lost->header.misc = misc_flag;
1924 	record__write(rec, NULL, lost, lost->header.size);
1925 }
1926 
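/*
 * Before finishing the session, read every counter with perf_evsel__read()
 * and emit a PERF_RECORD_LOST_SAMPLES event for each non-zero kernel 'lost'
 * count; samples dropped by a BPF filter are reported the same way, tagged
 * with PERF_RECORD_MISC_LOST_SAMPLES_BPF.
 */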
1927 static void record__read_lost_samples(struct record *rec)
1928 {
1929 	struct perf_session *session = rec->session;
1930 	struct perf_record_lost_samples_and_ids lost;
1931 	struct evsel *evsel;
1932 
1933 	/* there was an error during record__open */
1934 	if (session->evlist == NULL)
1935 		return;
1936 
1937 	evlist__for_each_entry(session->evlist, evsel) {
1938 		struct xyarray *xy = evsel->core.sample_id;
1939 		u64 lost_count;
1940 
1941 		if (xy == NULL || evsel->core.fd == NULL)
1942 			continue;
1943 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1944 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1945 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1946 			continue;
1947 		}
1948 
1949 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1950 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1951 				struct perf_counts_values count;
1952 
1953 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1954 					pr_debug("read LOST count failed\n");
1955 					return;
1956 				}
1957 
1958 				if (count.lost) {
1959 					memset(&lost, 0, sizeof(lost));
1960 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1961 					__record__save_lost_samples(rec, evsel, &lost.lost,
1962 								    x, y, count.lost, 0);
1963 				}
1964 			}
1965 		}
1966 
1967 		lost_count = perf_bpf_filter__lost_count(evsel);
1968 		if (lost_count) {
1969 			memset(&lost, 0, sizeof(lost));
1970 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1971 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
1972 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1973 		}
1974 	}
1975 }
1976 
1977 static volatile sig_atomic_t workload_exec_errno;
1978 
1979 /*
1980  * evlist__prepare_workload will send a SIGUSR1
1981  * if the fork fails, since we asked for it by setting its
1982  * want_signal to true.
1983  */
1984 static void workload_exec_failed_signal(int signo __maybe_unused,
1985 					siginfo_t *info,
1986 					void *ucontext __maybe_unused)
1987 {
1988 	workload_exec_errno = info->si_value.sival_int;
1989 	done = 1;
1990 	child_finished = 1;
1991 }
1992 
1993 static void snapshot_sig_handler(int sig);
1994 static void alarm_sig_handler(int sig);
1995 
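/*
 * Pick the perf_event_mmap_page of the first usable ring buffer;
 * record__synthesize() passes it to perf_event__synth_time_conv() below,
 * since that page carries the kernel's time conversion parameters.
 */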
1996 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1997 {
1998 	if (evlist) {
1999 		if (evlist->mmap && evlist->mmap[0].core.base)
2000 			return evlist->mmap[0].core.base;
2001 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2002 			return evlist->overwrite_mmap[0].core.base;
2003 	}
2004 	return NULL;
2005 }
2006 
2007 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2008 {
2009 	return evlist__pick_pc(rec->evlist);
2013 }
2014 
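/*
 * Synthesize the non-sample metadata for this session. Order as implemented
 * below: pipe-mode header events, time conversion, id_index (which must come
 * before auxtrace_info), kernel and module mmaps, guest machines, extra
 * attrs, thread and cpu maps, BPF and cgroup events, and finally the existing
 * task/mmap records, optionally spread over opts->nr_threads_synthesize
 * threads.
 */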
2015 static int record__synthesize(struct record *rec, bool tail)
2016 {
2017 	struct perf_session *session = rec->session;
2018 	struct machine *machine = &session->machines.host;
2019 	struct perf_data *data = &rec->data;
2020 	struct record_opts *opts = &rec->opts;
2021 	struct perf_tool *tool = &rec->tool;
2022 	int err = 0;
2023 	event_op f = process_synthesized_event;
2024 
2025 	if (rec->opts.tail_synthesize != tail)
2026 		return 0;
2027 
2028 	if (data->is_pipe) {
2029 		err = perf_event__synthesize_for_pipe(tool, session, data,
2030 						      process_synthesized_event);
2031 		if (err < 0)
2032 			goto out;
2033 
2034 		rec->bytes_written += err;
2035 	}
2036 
2037 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2038 					  process_synthesized_event, machine);
2039 	if (err)
2040 		goto out;
2041 
2042 	/* Synthesize id_index before auxtrace_info */
2043 	err = perf_event__synthesize_id_index(tool,
2044 					      process_synthesized_event,
2045 					      session->evlist, machine);
2046 	if (err)
2047 		goto out;
2048 
2049 	if (rec->opts.full_auxtrace) {
2050 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2051 					session, process_synthesized_event);
2052 		if (err)
2053 			goto out;
2054 	}
2055 
2056 	if (!evlist__exclude_kernel(rec->evlist)) {
2057 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2058 							 machine);
2059 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2060 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2061 				   "Check /proc/kallsyms permission or run as root.\n");
2062 
2063 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2064 						     machine);
2065 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2066 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2067 				   "Check /proc/modules permission or run as root.\n");
2068 	}
2069 
2070 	if (perf_guest) {
2071 		machines__process_guests(&session->machines,
2072 					 perf_event__synthesize_guest_os, tool);
2073 	}
2074 
2075 	err = perf_event__synthesize_extra_attr(&rec->tool,
2076 						rec->evlist,
2077 						process_synthesized_event,
2078 						data->is_pipe);
2079 	if (err)
2080 		goto out;
2081 
2082 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2083 						 process_synthesized_event,
2084 						NULL);
2085 	if (err < 0) {
2086 		pr_err("Couldn't synthesize thread map.\n");
2087 		return err;
2088 	}
2089 
2090 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2091 					     process_synthesized_event, NULL);
2092 	if (err < 0) {
2093 		pr_err("Couldn't synthesize cpu map.\n");
2094 		return err;
2095 	}
2096 
2097 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2098 						machine, opts);
2099 	if (err < 0) {
2100 		pr_warning("Couldn't synthesize bpf events.\n");
2101 		err = 0;
2102 	}
2103 
2104 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2105 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2106 						     machine);
2107 		if (err < 0) {
2108 			pr_warning("Couldn't synthesize cgroup events.\n");
2109 			err = 0;
2110 		}
2111 	}
2112 
2113 	if (rec->opts.nr_threads_synthesize > 1) {
2114 		mutex_init(&synth_lock);
2115 		perf_set_multithreaded();
2116 		f = process_locked_synthesized_event;
2117 	}
2118 
2119 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2120 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2121 
2122 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2123 						    rec->evlist->core.threads,
2124 						    f, needs_mmap, opts->sample_address,
2125 						    rec->opts.nr_threads_synthesize);
2126 	}
2127 
2128 	if (rec->opts.nr_threads_synthesize > 1) {
2129 		perf_set_singlethreaded();
2130 		mutex_destroy(&synth_lock);
2131 	}
2132 
2133 out:
2134 	return err;
2135 }
2136 
2137 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2138 {
2139 	struct record *rec = data;
2140 	pthread_kill(rec->thread_id, SIGUSR2);
2141 	return 0;
2142 }
2143 
2144 static int record__setup_sb_evlist(struct record *rec)
2145 {
2146 	struct record_opts *opts = &rec->opts;
2147 
2148 	if (rec->sb_evlist != NULL) {
2149 		/*
2150 		 * We get here if --switch-output-event populated the
2151 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2152 		 * to the main thread.
2153 		 */
2154 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2155 		rec->thread_id = pthread_self();
2156 	}
2157 #ifdef HAVE_LIBBPF_SUPPORT
2158 	if (!opts->no_bpf_event) {
2159 		if (rec->sb_evlist == NULL) {
2160 			rec->sb_evlist = evlist__new();
2161 
2162 			if (rec->sb_evlist == NULL) {
2163 				pr_err("Couldn't create side band evlist.\n");
2164 				return -1;
2165 			}
2166 		}
2167 
2168 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2169 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2170 			return -1;
2171 		}
2172 	}
2173 #endif
2174 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2175 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2176 		opts->no_bpf_event = true;
2177 	}
2178 
2179 	return 0;
2180 }
2181 
2182 static int record__init_clock(struct record *rec)
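/*
 * For -k/--clockid sessions, record the clockid (and its resolution, if
 * probed) in the header env and capture paired gettimeofday()/clock_gettime()
 * reference timestamps in nanoseconds, so the sampling clock can later be
 * correlated with wall-clock time.
 */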
2183 {
2184 	struct perf_session *session = rec->session;
2185 	struct timespec ref_clockid;
2186 	struct timeval ref_tod;
2187 	u64 ref;
2188 
2189 	if (!rec->opts.use_clockid)
2190 		return 0;
2191 
2192 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2193 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2194 
2195 	session->header.env.clock.clockid = rec->opts.clockid;
2196 
2197 	if (gettimeofday(&ref_tod, NULL) != 0) {
2198 		pr_err("gettimeofday failed, cannot set reference time.\n");
2199 		return -1;
2200 	}
2201 
2202 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2203 		pr_err("clock_gettime failed, cannot set reference time.\n");
2204 		return -1;
2205 	}
2206 
2207 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2208 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2209 
2210 	session->header.env.clock.tod_ns = ref;
2211 
2212 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2213 	      (u64) ref_clockid.tv_nsec;
2214 
2215 	session->header.env.clock.clockid_ns = ref;
2216 	return 0;
2217 }
2218 
2219 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2220 {
2221 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2222 		trigger_hit(&auxtrace_snapshot_trigger);
2223 		auxtrace_record__snapshot_started = 1;
2224 		if (auxtrace_record__snapshot_start(rec->itr))
2225 			trigger_error(&auxtrace_snapshot_trigger);
2226 	}
2227 }
2228 
2229 static int record__terminate_thread(struct record_thread *thread_data)
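/*
 * Ask a worker thread to stop by closing the write end of its message pipe,
 * then wait for an acknowledgement on its ack pipe.
 */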
2230 {
2231 	int err;
2232 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2233 	pid_t tid = thread_data->tid;
2234 
2235 	close(thread_data->pipes.msg[1]);
2236 	thread_data->pipes.msg[1] = -1;
2237 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2238 	if (err > 0)
2239 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2240 	else
2241 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2242 			   thread->tid, tid);
2243 
2244 	return 0;
2245 }
2246 
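/*
 * Start the parallel trace streaming workers (thread_data[1..nr_threads-1]) as
 * detached pthreads, pinned to their affinity masks where
 * pthread_attr_setaffinity_np() is available, and wait for each one to report
 * readiness over its ack pipe. All signals are blocked while the workers are
 * created and then restored for the main thread (thread_data[0]).
 */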
2247 static int record__start_threads(struct record *rec)
2248 {
2249 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2250 	struct record_thread *thread_data = rec->thread_data;
2251 	sigset_t full, mask;
2252 	pthread_t handle;
2253 	pthread_attr_t attrs;
2254 
2255 	thread = &thread_data[0];
2256 
2257 	if (!record__threads_enabled(rec))
2258 		return 0;
2259 
2260 	sigfillset(&full);
2261 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2262 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2263 		return -1;
2264 	}
2265 
2266 	pthread_attr_init(&attrs);
2267 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2268 
2269 	for (t = 1; t < nr_threads; t++) {
2270 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2271 
2272 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2273 		pthread_attr_setaffinity_np(&attrs,
2274 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2275 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2276 #endif
2277 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2278 			for (tt = 1; tt < t; tt++)
2279 				record__terminate_thread(&thread_data[tt]);
2280 			pr_err("Failed to start threads: %s\n", strerror(errno));
2281 			ret = -1;
2282 			goto out_err;
2283 		}
2284 
2285 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2286 		if (err > 0)
2287 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2288 				  thread_msg_tags[msg]);
2289 		else
2290 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2291 				   thread->tid, rec->thread_data[t].tid);
2292 	}
2293 
2294 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2295 			(cpu_set_t *)thread->mask->affinity.bits);
2296 
2297 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2298 
2299 out_err:
2300 	pthread_attr_destroy(&attrs);
2301 
2302 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2303 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2304 		ret = -1;
2305 	}
2306 
2307 	return ret;
2308 }
2309 
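/*
 * Terminate the worker threads and fold their per-thread counters (samples,
 * wakes, transferred/compressed or written bytes) into the record/session
 * totals.
 */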
2310 static int record__stop_threads(struct record *rec)
2311 {
2312 	int t;
2313 	struct record_thread *thread_data = rec->thread_data;
2314 
2315 	for (t = 1; t < rec->nr_threads; t++)
2316 		record__terminate_thread(&thread_data[t]);
2317 
2318 	for (t = 0; t < rec->nr_threads; t++) {
2319 		rec->samples += thread_data[t].samples;
2320 		if (!record__threads_enabled(rec))
2321 			continue;
2322 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2323 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2324 		pr_debug("threads[%d]: samples=%llu, wakes=%lu, ", thread_data[t].tid,
2325 			 thread_data[t].samples, thread_data[t].waking);
2326 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2327 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2328 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2329 		else
2330 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2331 	}
2332 
2333 	return 0;
2334 }
2335 
2336 static unsigned long record__waking(struct record *rec)
2337 {
2338 	int t;
2339 	unsigned long waking = 0;
2340 	struct record_thread *thread_data = rec->thread_data;
2341 
2342 	for (t = 0; t < rec->nr_threads; t++)
2343 		waking += thread_data[t].waking;
2344 
2345 	return waking;
2346 }
2347 
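/*
 * Main body of 'perf record': set up signal handling, the session and the
 * output, fork the workload if a command was given, open and mmap the events,
 * then loop reading the ring buffers until done or draining, servicing
 * switch-output requests, auxtrace snapshots and control-fd commands along
 * the way, and finally write the tail synthesized events and the file header.
 */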
2348 static int __cmd_record(struct record *rec, int argc, const char **argv)
2349 {
2350 	int err;
2351 	int status = 0;
2352 	const bool forks = argc > 0;
2353 	struct perf_tool *tool = &rec->tool;
2354 	struct record_opts *opts = &rec->opts;
2355 	struct perf_data *data = &rec->data;
2356 	struct perf_session *session;
2357 	bool disabled = false, draining = false;
2358 	int fd;
2359 	float ratio = 0;
2360 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2361 
2362 	atexit(record__sig_exit);
2363 	signal(SIGCHLD, sig_handler);
2364 	signal(SIGINT, sig_handler);
2365 	signal(SIGTERM, sig_handler);
2366 	signal(SIGSEGV, sigsegv_handler);
2367 
2368 	if (rec->opts.record_namespaces)
2369 		tool->namespace_events = true;
2370 
2371 	if (rec->opts.record_cgroup) {
2372 #ifdef HAVE_FILE_HANDLE
2373 		tool->cgroup_events = true;
2374 #else
2375 		pr_err("cgroup tracking is not supported\n");
2376 		return -1;
2377 #endif
2378 	}
2379 
2380 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2381 		signal(SIGUSR2, snapshot_sig_handler);
2382 		if (rec->opts.auxtrace_snapshot_mode)
2383 			trigger_on(&auxtrace_snapshot_trigger);
2384 		if (rec->switch_output.enabled)
2385 			trigger_on(&switch_output_trigger);
2386 	} else {
2387 		signal(SIGUSR2, SIG_IGN);
2388 	}
2389 
2390 	session = perf_session__new(data, tool);
2391 	if (IS_ERR(session)) {
2392 		pr_err("Perf session creation failed.\n");
2393 		return PTR_ERR(session);
2394 	}
2395 
2396 	if (record__threads_enabled(rec)) {
2397 		if (perf_data__is_pipe(&rec->data)) {
2398 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2399 			return -1;
2400 		}
2401 		if (rec->opts.full_auxtrace) {
2402 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2403 			return -1;
2404 		}
2405 	}
2406 
2407 	fd = perf_data__fd(data);
2408 	rec->session = session;
2409 
2410 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2411 		pr_err("Compression initialization failed.\n");
2412 		return -1;
2413 	}
2414 #ifdef HAVE_EVENTFD_SUPPORT
2415 	done_fd = eventfd(0, EFD_NONBLOCK);
2416 	if (done_fd < 0) {
2417 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2418 		status = -1;
2419 		goto out_delete_session;
2420 	}
2421 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2422 	if (err < 0) {
2423 		pr_err("Failed to add wakeup eventfd to poll list\n");
2424 		status = err;
2425 		goto out_delete_session;
2426 	}
2427 #endif // HAVE_EVENTFD_SUPPORT
2428 
2429 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2430 	session->header.env.comp_level = rec->opts.comp_level;
2431 
2432 	if (rec->opts.kcore &&
2433 	    !record__kcore_readable(&session->machines.host)) {
2434 		pr_err("ERROR: kcore is not readable.\n");
2435 		return -1;
2436 	}
2437 
2438 	if (record__init_clock(rec))
2439 		return -1;
2440 
2441 	record__init_features(rec);
2442 
2443 	if (forks) {
2444 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2445 					       workload_exec_failed_signal);
2446 		if (err < 0) {
2447 			pr_err("Couldn't run the workload!\n");
2448 			status = err;
2449 			goto out_delete_session;
2450 		}
2451 	}
2452 
2453 	/*
2454 	 * If we have just a single event and are sending data
2455 	 * through a pipe, we need to force the ids allocation,
2456 	 * because we synthesize the event name through the pipe
2457 	 * and need the id for that.
2458 	 */
2459 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2460 		rec->opts.sample_id = true;
2461 
2462 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2463 		rec->timestamp_filename = false;
2464 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2465 	}
2466 
2467 	evlist__uniquify_name(rec->evlist);
2468 
2469 	evlist__config(rec->evlist, opts, &callchain_param);
2470 
2471 	/* Debug message used by test scripts */
2472 	pr_debug3("perf record opening and mmapping events\n");
2473 	if (record__open(rec) != 0) {
2474 		err = -1;
2475 		goto out_free_threads;
2476 	}
2477 	/* Debug message used by test scripts */
2478 	pr_debug3("perf record done opening and mmapping events\n");
2479 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2480 
2481 	if (rec->opts.kcore) {
2482 		err = record__kcore_copy(&session->machines.host, data);
2483 		if (err) {
2484 			pr_err("ERROR: Failed to copy kcore\n");
2485 			goto out_free_threads;
2486 		}
2487 	}
2488 
2489 	/*
2490 	 * Normally perf_session__new would do this, but it doesn't have the
2491 	 * evlist.
2492 	 */
2493 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2494 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2495 		rec->tool.ordered_events = false;
2496 	}
2497 
2498 	if (evlist__nr_groups(rec->evlist) == 0)
2499 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2500 
2501 	if (data->is_pipe) {
2502 		err = perf_header__write_pipe(fd);
2503 		if (err < 0)
2504 			goto out_free_threads;
2505 	} else {
2506 		err = perf_session__write_header(session, rec->evlist, fd, false);
2507 		if (err < 0)
2508 			goto out_free_threads;
2509 	}
2510 
2511 	err = -1;
2512 	if (!rec->no_buildid
2513 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2514 		pr_err("Couldn't generate buildids. "
2515 		       "Use --no-buildid to profile anyway.\n");
2516 		goto out_free_threads;
2517 	}
2518 
2519 	err = record__setup_sb_evlist(rec);
2520 	if (err)
2521 		goto out_free_threads;
2522 
2523 	err = record__synthesize(rec, false);
2524 	if (err < 0)
2525 		goto out_free_threads;
2526 
2527 	if (rec->realtime_prio) {
2528 		struct sched_param param;
2529 
2530 		param.sched_priority = rec->realtime_prio;
2531 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2532 			pr_err("Could not set realtime priority.\n");
2533 			err = -1;
2534 			goto out_free_threads;
2535 		}
2536 	}
2537 
2538 	if (record__start_threads(rec))
2539 		goto out_free_threads;
2540 
2541 	/*
2542 	 * When perf is starting the traced process, all the events
2543 	 * (apart from group members) have enable_on_exec=1 set,
2544 	 * so don't spoil it by prematurely enabling them.
2545 	 */
2546 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2547 		evlist__enable(rec->evlist);
2548 
2549 	/*
2550 	 * Let the child rip
2551 	 */
2552 	if (forks) {
2553 		struct machine *machine = &session->machines.host;
2554 		union perf_event *event;
2555 		pid_t tgid;
2556 
2557 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2558 		if (event == NULL) {
2559 			err = -ENOMEM;
2560 			goto out_child;
2561 		}
2562 
2563 		/*
2564 		 * Some H/W events are generated before the COMM event,
2565 		 * which is emitted during exec(), so perf script
2566 		 * cannot see a correct process name for those events.
2567 		 * Synthesize a COMM event up front to prevent that.
2568 		 */
2569 		tgid = perf_event__synthesize_comm(tool, event,
2570 						   rec->evlist->workload.pid,
2571 						   process_synthesized_event,
2572 						   machine);
2573 		free(event);
2574 
2575 		if (tgid == -1)
2576 			goto out_child;
2577 
2578 		event = malloc(sizeof(event->namespaces) +
2579 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2580 			       machine->id_hdr_size);
2581 		if (event == NULL) {
2582 			err = -ENOMEM;
2583 			goto out_child;
2584 		}
2585 
2586 		/*
2587 		 * Synthesize NAMESPACES event for the command specified.
2588 		 */
2589 		perf_event__synthesize_namespaces(tool, event,
2590 						  rec->evlist->workload.pid,
2591 						  tgid, process_synthesized_event,
2592 						  machine);
2593 		free(event);
2594 
2595 		evlist__start_workload(rec->evlist);
2596 	}
2597 
2598 	if (opts->target.initial_delay) {
2599 		pr_info(EVLIST_DISABLED_MSG);
2600 		if (opts->target.initial_delay > 0) {
2601 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2602 			evlist__enable(rec->evlist);
2603 			pr_info(EVLIST_ENABLED_MSG);
2604 		}
2605 	}
2606 
2607 	err = event_enable_timer__start(rec->evlist->eet);
2608 	if (err)
2609 		goto out_child;
2610 
2611 	/* Debug message used by test scripts */
2612 	pr_debug3("perf record has started\n");
2613 	fflush(stderr);
2614 
2615 	trigger_ready(&auxtrace_snapshot_trigger);
2616 	trigger_ready(&switch_output_trigger);
2617 	perf_hooks__invoke_record_start();
2618 
2619 	/*
2620 	 * Must write FINISHED_INIT so it will be seen after all other
2621 	 * synthesized user events, but before any regular events.
2622 	 */
2623 	err = write_finished_init(rec, false);
2624 	if (err < 0)
2625 		goto out_child;
2626 
2627 	for (;;) {
2628 		unsigned long long hits = thread->samples;
2629 
2630 		/*
2631 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2632 		 * here: when done == true and hits != rec->samples in
2633 		 * the previous round.
2634 		 *
2635 		 * evlist__toggle_bkw_mmap() ensures we never
2636 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2637 		 */
2638 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2639 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2640 
2641 		if (record__mmap_read_all(rec, false) < 0) {
2642 			trigger_error(&auxtrace_snapshot_trigger);
2643 			trigger_error(&switch_output_trigger);
2644 			err = -1;
2645 			goto out_child;
2646 		}
2647 
2648 		if (auxtrace_record__snapshot_started) {
2649 			auxtrace_record__snapshot_started = 0;
2650 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2651 				record__read_auxtrace_snapshot(rec, false);
2652 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2653 				pr_err("AUX area tracing snapshot failed\n");
2654 				err = -1;
2655 				goto out_child;
2656 			}
2657 		}
2658 
2659 		if (trigger_is_hit(&switch_output_trigger)) {
2660 			/*
2661 			 * If switch_output_trigger is hit, the data in the
2662 			 * overwritable ring buffer should have been collected,
2663 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2664 			 *
2665 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2666 			 * record__mmap_read_all() didn't collect data from the
2667 			 * overwritable ring buffer. Read again.
2668 			 */
2669 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2670 				continue;
2671 			trigger_ready(&switch_output_trigger);
2672 
2673 			/*
2674 			 * Re-enable events in the overwrite ring buffer after
2675 			 * record__mmap_read_all(): we should have collected
2676 			 * data from it.
2677 			 */
2678 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2679 
2680 			if (!quiet)
2681 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2682 					record__waking(rec));
2683 			thread->waking = 0;
2684 			fd = record__switch_output(rec, false);
2685 			if (fd < 0) {
2686 				pr_err("Failed to switch to new file\n");
2687 				trigger_error(&switch_output_trigger);
2688 				err = fd;
2689 				goto out_child;
2690 			}
2691 
2692 			/* re-arm the alarm */
2693 			if (rec->switch_output.time)
2694 				alarm(rec->switch_output.time);
2695 		}
2696 
2697 		if (hits == thread->samples) {
2698 			if (done || draining)
2699 				break;
2700 			err = fdarray__poll(&thread->pollfd, -1);
2701 			/*
2702 			 * Propagate the error only if there is one. Ignore a
2703 			 * positive number of returned events and interrupt errors.
2704 			 */
2705 			if (err > 0 || (err < 0 && errno == EINTR))
2706 				err = 0;
2707 			thread->waking++;
2708 
2709 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2710 					    record__thread_munmap_filtered, NULL) == 0)
2711 				draining = true;
2712 
2713 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2714 			if (err)
2715 				goto out_child;
2716 		}
2717 
2718 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2719 			switch (cmd) {
2720 			case EVLIST_CTL_CMD_SNAPSHOT:
2721 				hit_auxtrace_snapshot_trigger(rec);
2722 				evlist__ctlfd_ack(rec->evlist);
2723 				break;
2724 			case EVLIST_CTL_CMD_STOP:
2725 				done = 1;
2726 				break;
2727 			case EVLIST_CTL_CMD_ACK:
2728 			case EVLIST_CTL_CMD_UNSUPPORTED:
2729 			case EVLIST_CTL_CMD_ENABLE:
2730 			case EVLIST_CTL_CMD_DISABLE:
2731 			case EVLIST_CTL_CMD_EVLIST:
2732 			case EVLIST_CTL_CMD_PING:
2733 			default:
2734 				break;
2735 			}
2736 		}
2737 
2738 		err = event_enable_timer__process(rec->evlist->eet);
2739 		if (err < 0)
2740 			goto out_child;
2741 		if (err) {
2742 			err = 0;
2743 			done = 1;
2744 		}
2745 
2746 		/*
2747 		 * When perf is starting the traced process, the events die
2748 		 * with the process at the end and we wait for that. Thus there
2749 		 * is no need to disable the events in this case.
2750 		 */
2751 		if (done && !disabled && !target__none(&opts->target)) {
2752 			trigger_off(&auxtrace_snapshot_trigger);
2753 			evlist__disable(rec->evlist);
2754 			disabled = true;
2755 		}
2756 	}
2757 
2758 	trigger_off(&auxtrace_snapshot_trigger);
2759 	trigger_off(&switch_output_trigger);
2760 
2761 	if (opts->auxtrace_snapshot_on_exit)
2762 		record__auxtrace_snapshot_exit(rec);
2763 
2764 	if (forks && workload_exec_errno) {
2765 		char msg[STRERR_BUFSIZE], strevsels[2048];
2766 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2767 
2768 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2769 
2770 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2771 			strevsels, argv[0], emsg);
2772 		err = -1;
2773 		goto out_child;
2774 	}
2775 
2776 	if (!quiet)
2777 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2778 			record__waking(rec));
2779 
2780 	write_finished_init(rec, true);
2781 
2782 	if (target__none(&rec->opts.target))
2783 		record__synthesize_workload(rec, true);
2784 
2785 out_child:
2786 	record__stop_threads(rec);
2787 	record__mmap_read_all(rec, true);
2788 out_free_threads:
2789 	record__free_thread_data(rec);
2790 	evlist__finalize_ctlfd(rec->evlist);
2791 	record__aio_mmap_read_sync(rec);
2792 
2793 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2794 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2795 		session->header.env.comp_ratio = ratio + 0.5;
2796 	}
2797 
2798 	if (forks) {
2799 		int exit_status;
2800 
2801 		if (!child_finished)
2802 			kill(rec->evlist->workload.pid, SIGTERM);
2803 
2804 		wait(&exit_status);
2805 
2806 		if (err < 0)
2807 			status = err;
2808 		else if (WIFEXITED(exit_status))
2809 			status = WEXITSTATUS(exit_status);
2810 		else if (WIFSIGNALED(exit_status))
2811 			signr = WTERMSIG(exit_status);
2812 	} else
2813 		status = err;
2814 
2815 	if (rec->off_cpu)
2816 		rec->bytes_written += off_cpu_write(rec->session);
2817 
2818 	record__read_lost_samples(rec);
2819 	record__synthesize(rec, true);
2820 	/* this will be recalculated during process_buildids() */
2821 	rec->samples = 0;
2822 
2823 	if (!err) {
2824 		if (!rec->timestamp_filename) {
2825 			record__finish_output(rec);
2826 		} else {
2827 			fd = record__switch_output(rec, true);
2828 			if (fd < 0) {
2829 				status = fd;
2830 				goto out_delete_session;
2831 			}
2832 		}
2833 	}
2834 
2835 	perf_hooks__invoke_record_end();
2836 
2837 	if (!err && !quiet) {
2838 		char samples[128];
2839 		const char *postfix = rec->timestamp_filename ?
2840 					".<timestamp>" : "";
2841 
2842 		if (rec->samples && !rec->opts.full_auxtrace)
2843 			scnprintf(samples, sizeof(samples),
2844 				  " (%" PRIu64 " samples)", rec->samples);
2845 		else
2846 			samples[0] = '\0';
2847 
2848 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2849 			perf_data__size(data) / 1024.0 / 1024.0,
2850 			data->path, postfix, samples);
2851 		if (ratio) {
2852 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2853 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2854 					ratio);
2855 		}
2856 		fprintf(stderr, " ]\n");
2857 	}
2858 
2859 out_delete_session:
2860 #ifdef HAVE_EVENTFD_SUPPORT
2861 	if (done_fd >= 0) {
2862 		fd = done_fd;
2863 		done_fd = -1;
2864 
2865 		close(fd);
2866 	}
2867 #endif
2868 	zstd_fini(&session->zstd_data);
2869 	if (!opts->no_bpf_event)
2870 		evlist__stop_sb_thread(rec->sb_evlist);
2871 
2872 	perf_session__delete(session);
2873 	return status;
2874 }
2875 
2876 static void callchain_debug(struct callchain_param *callchain)
2877 {
2878 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2879 
2880 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2881 
2882 	if (callchain->record_mode == CALLCHAIN_DWARF)
2883 		pr_debug("callchain: stack dump size %d\n",
2884 			 callchain->dump_size);
2885 }
2886 
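/*
 * Parse the --call-graph argument (or --no-call-graph when 'unset'). Selecting
 * DWARF unwinding also enables sample_address, as noted below.
 */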
2887 int record_opts__parse_callchain(struct record_opts *record,
2888 				 struct callchain_param *callchain,
2889 				 const char *arg, bool unset)
2890 {
2891 	int ret;
2892 	callchain->enabled = !unset;
2893 
2894 	/* --no-call-graph */
2895 	if (unset) {
2896 		callchain->record_mode = CALLCHAIN_NONE;
2897 		pr_debug("callchain: disabled\n");
2898 		return 0;
2899 	}
2900 
2901 	ret = parse_callchain_record_opt(arg, callchain);
2902 	if (!ret) {
2903 		/* Enable data address sampling for DWARF unwind. */
2904 		if (callchain->record_mode == CALLCHAIN_DWARF)
2905 			record->sample_address = true;
2906 		callchain_debug(callchain);
2907 	}
2908 
2909 	return ret;
2910 }
2911 
2912 int record_parse_callchain_opt(const struct option *opt,
2913 			       const char *arg,
2914 			       int unset)
2915 {
2916 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2917 }
2918 
2919 int record_callchain_opt(const struct option *opt,
2920 			 const char *arg __maybe_unused,
2921 			 int unset __maybe_unused)
2922 {
2923 	struct callchain_param *callchain = opt->value;
2924 
2925 	callchain->enabled = true;
2926 
2927 	if (callchain->record_mode == CALLCHAIN_NONE)
2928 		callchain->record_mode = CALLCHAIN_FP;
2929 
2930 	callchain_debug(callchain);
2931 	return 0;
2932 }
2933 
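/*
 * perf_config() callback for the 'record.*' keys handled here: record.build-id,
 * record.call-graph (forwarded as call-graph.record-mode), record.aio and
 * record.debuginfod. An illustrative ~/.perfconfig fragment (values are just
 * examples):
 *
 *	[record]
 *		build-id = cache
 *		call-graph = dwarf
 */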
2934 static int perf_record_config(const char *var, const char *value, void *cb)
2935 {
2936 	struct record *rec = cb;
2937 
2938 	if (!strcmp(var, "record.build-id")) {
2939 		if (!strcmp(value, "cache"))
2940 			rec->no_buildid_cache = false;
2941 		else if (!strcmp(value, "no-cache"))
2942 			rec->no_buildid_cache = true;
2943 		else if (!strcmp(value, "skip"))
2944 			rec->no_buildid = true;
2945 		else if (!strcmp(value, "mmap"))
2946 			rec->buildid_mmap = true;
2947 		else
2948 			return -1;
2949 		return 0;
2950 	}
2951 	if (!strcmp(var, "record.call-graph")) {
2952 		var = "call-graph.record-mode";
2953 		return perf_default_config(var, value, cb);
2954 	}
2955 #ifdef HAVE_AIO_SUPPORT
2956 	if (!strcmp(var, "record.aio")) {
2957 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2958 		if (!rec->opts.nr_cblocks)
2959 			rec->opts.nr_cblocks = nr_cblocks_default;
2960 	}
2961 #endif
2962 	if (!strcmp(var, "record.debuginfod")) {
2963 		rec->debuginfod.urls = strdup(value);
2964 		if (!rec->debuginfod.urls)
2965 			return -ENOMEM;
2966 		rec->debuginfod.set = true;
2967 	}
2968 
2969 	return 0;
2970 }
2971 
2972 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2973 {
2974 	struct record *rec = (struct record *)opt->value;
2975 
2976 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2977 }
2978 
2979 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2980 {
2981 	struct record_opts *opts = (struct record_opts *)opt->value;
2982 
2983 	if (unset || !str)
2984 		return 0;
2985 
2986 	if (!strcasecmp(str, "node"))
2987 		opts->affinity = PERF_AFFINITY_NODE;
2988 	else if (!strcasecmp(str, "cpu"))
2989 		opts->affinity = PERF_AFFINITY_CPU;
2990 
2991 	return 0;
2992 }
2993 
2994 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2995 {
2996 	mask->nbits = nr_bits;
2997 	mask->bits = bitmap_zalloc(mask->nbits);
2998 	if (!mask->bits)
2999 		return -ENOMEM;
3000 
3001 	return 0;
3002 }
3003 
3004 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3005 {
3006 	bitmap_free(mask->bits);
3007 	mask->nbits = 0;
3008 }
3009 
3010 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3011 {
3012 	int ret;
3013 
3014 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3015 	if (ret) {
3016 		mask->affinity.bits = NULL;
3017 		return ret;
3018 	}
3019 
3020 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3021 	if (ret) {
3022 		record__mmap_cpu_mask_free(&mask->maps);
3023 		mask->maps.bits = NULL;
3024 	}
3025 
3026 	return ret;
3027 }
3028 
3029 static void record__thread_mask_free(struct thread_mask *mask)
3030 {
3031 	record__mmap_cpu_mask_free(&mask->maps);
3032 	record__mmap_cpu_mask_free(&mask->affinity);
3033 }
3034 
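/*
 * Parse the --threads spec: with no value, default to one data streaming
 * thread per CPU (THREAD_SPEC__CPU); otherwise match the string against the
 * predefined spec names in thread_spec_tags[], and treat anything else as a
 * user-provided masks spec (THREAD_SPEC__USER).
 */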
3035 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3036 {
3037 	int s;
3038 	struct record_opts *opts = opt->value;
3039 
3040 	if (unset || !str || !strlen(str)) {
3041 		opts->threads_spec = THREAD_SPEC__CPU;
3042 	} else {
3043 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3044 			if (s == THREAD_SPEC__USER) {
3045 				opts->threads_user_spec = strdup(str);
3046 				if (!opts->threads_user_spec)
3047 					return -ENOMEM;
3048 				opts->threads_spec = THREAD_SPEC__USER;
3049 				break;
3050 			}
3051 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3052 				opts->threads_spec = s;
3053 				break;
3054 			}
3055 		}
3056 	}
3057 
3058 	if (opts->threads_spec == THREAD_SPEC__USER)
3059 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3060 	else
3061 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3062 
3063 	return 0;
3064 }
3065 
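/*
 * Parse --max-size as a size with a B/K/M/G suffix, e.g. --max-size=200M
 * (an illustrative value, not one taken from this code).
 */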
3066 static int parse_output_max_size(const struct option *opt,
3067 				 const char *str, int unset)
3068 {
3069 	unsigned long *s = (unsigned long *)opt->value;
3070 	static struct parse_tag tags_size[] = {
3071 		{ .tag  = 'B', .mult = 1       },
3072 		{ .tag  = 'K', .mult = 1 << 10 },
3073 		{ .tag  = 'M', .mult = 1 << 20 },
3074 		{ .tag  = 'G', .mult = 1 << 30 },
3075 		{ .tag  = 0 },
3076 	};
3077 	unsigned long val;
3078 
3079 	if (unset) {
3080 		*s = 0;
3081 		return 0;
3082 	}
3083 
3084 	val = parse_tag_value(str, tags_size);
3085 	if (val != (unsigned long) -1) {
3086 		*s = val;
3087 		return 0;
3088 	}
3089 
3090 	return -1;
3091 }
3092 
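/*
 * Parse -m/--mmap-pages as "pages[,pages]": the value before the comma sets
 * the data mmap size, the optional value after it the AUX area tracing mmap
 * size. For instance, "-m 512,128" (illustrative values) asks for 512 data
 * pages and 128 AUX pages.
 */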
3093 static int record__parse_mmap_pages(const struct option *opt,
3094 				    const char *str,
3095 				    int unset __maybe_unused)
3096 {
3097 	struct record_opts *opts = opt->value;
3098 	char *s, *p;
3099 	unsigned int mmap_pages;
3100 	int ret;
3101 
3102 	if (!str)
3103 		return -EINVAL;
3104 
3105 	s = strdup(str);
3106 	if (!s)
3107 		return -ENOMEM;
3108 
3109 	p = strchr(s, ',');
3110 	if (p)
3111 		*p = '\0';
3112 
3113 	if (*s) {
3114 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3115 		if (ret)
3116 			goto out_free;
3117 		opts->mmap_pages = mmap_pages;
3118 	}
3119 
3120 	if (!p) {
3121 		ret = 0;
3122 		goto out_free;
3123 	}
3124 
3125 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3126 	if (ret)
3127 		goto out_free;
3128 
3129 	opts->auxtrace_mmap_pages = mmap_pages;
3130 
3131 out_free:
3132 	free(s);
3133 	return ret;
3134 }
3135 
3136 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3137 {
3138 }
3139 
3140 static int parse_control_option(const struct option *opt,
3141 				const char *str,
3142 				int unset __maybe_unused)
3143 {
3144 	struct record_opts *opts = opt->value;
3145 
3146 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3147 }
3148 
3149 static void switch_output_size_warn(struct record *rec)
3150 {
3151 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3152 	struct switch_output *s = &rec->switch_output;
3153 
3154 	wakeup_size /= 2;
3155 
3156 	if (s->size < wakeup_size) {
3157 		char buf[100];
3158 
3159 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3160 		pr_warning("WARNING: switch-output data size is lower than the "
3161 			   "wakeup kernel buffer size (%s), "
3162 			   "expect bigger perf.data sizes\n", buf);
3163 	}
3164 }
3165 
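/*
 * Configure --switch-output. Accepted forms, per the tag tables below:
 * "signal" (rotate on SIGUSR2), a size such as "100M" (B/K/M/G suffixes) or a
 * time such as "30s" (s/m/h/d suffixes); the example values are illustrative.
 * Enabling switch-output also implies --timestamp-filename.
 */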
3166 static int switch_output_setup(struct record *rec)
3167 {
3168 	struct switch_output *s = &rec->switch_output;
3169 	static struct parse_tag tags_size[] = {
3170 		{ .tag  = 'B', .mult = 1       },
3171 		{ .tag  = 'K', .mult = 1 << 10 },
3172 		{ .tag  = 'M', .mult = 1 << 20 },
3173 		{ .tag  = 'G', .mult = 1 << 30 },
3174 		{ .tag  = 0 },
3175 	};
3176 	static struct parse_tag tags_time[] = {
3177 		{ .tag  = 's', .mult = 1        },
3178 		{ .tag  = 'm', .mult = 60       },
3179 		{ .tag  = 'h', .mult = 60*60    },
3180 		{ .tag  = 'd', .mult = 60*60*24 },
3181 		{ .tag  = 0 },
3182 	};
3183 	unsigned long val;
3184 
3185 	/*
3186 	 * If we're using --switch-output-events, then we imply
3187 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3188 	 * thread to its parent.
3189 	 */
3190 	if (rec->switch_output_event_set) {
3191 		if (record__threads_enabled(rec)) {
3192 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3193 			return 0;
3194 		}
3195 		goto do_signal;
3196 	}
3197 
3198 	if (!s->set)
3199 		return 0;
3200 
3201 	if (record__threads_enabled(rec)) {
3202 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3203 		return 0;
3204 	}
3205 
3206 	if (!strcmp(s->str, "signal")) {
3207 do_signal:
3208 		s->signal = true;
3209 		pr_debug("switch-output with SIGUSR2 signal\n");
3210 		goto enabled;
3211 	}
3212 
3213 	val = parse_tag_value(s->str, tags_size);
3214 	if (val != (unsigned long) -1) {
3215 		s->size = val;
3216 		pr_debug("switch-output with %s size threshold\n", s->str);
3217 		goto enabled;
3218 	}
3219 
3220 	val = parse_tag_value(s->str, tags_time);
3221 	if (val != (unsigned long) -1) {
3222 		s->time = val;
3223 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3224 			 s->str, s->time);
3225 		goto enabled;
3226 	}
3227 
3228 	return -1;
3229 
3230 enabled:
3231 	rec->timestamp_filename = true;
3232 	s->enabled              = true;
3233 
3234 	if (s->size && !rec->opts.no_buffering)
3235 		switch_output_size_warn(rec);
3236 
3237 	return 0;
3238 }
3239 
3240 static const char * const __record_usage[] = {
3241 	"perf record [<options>] [<command>]",
3242 	"perf record [<options>] -- <command> [<options>]",
3243 	NULL
3244 };
3245 const char * const *record_usage = __record_usage;
3246 
3247 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3248 				  struct perf_sample *sample, struct machine *machine)
3249 {
3250 	/*
3251 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3252 	 * so there is no need to add them twice.
3253 	 */
3254 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3255 		return 0;
3256 	return perf_event__process_mmap(tool, event, sample, machine);
3257 }
3258 
3259 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3260 				   struct perf_sample *sample, struct machine *machine)
3261 {
3262 	/*
3263 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3264 	 * so there is no need to add them twice.
3265 	 */
3266 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3267 		return 0;
3268 
3269 	return perf_event__process_mmap2(tool, event, sample, machine);
3270 }
3271 
3272 static int process_timestamp_boundary(struct perf_tool *tool,
3273 				      union perf_event *event __maybe_unused,
3274 				      struct perf_sample *sample,
3275 				      struct machine *machine __maybe_unused)
3276 {
3277 	struct record *rec = container_of(tool, struct record, tool);
3278 
3279 	set_timestamp_boundary(rec, sample->time);
3280 	return 0;
3281 }
3282 
3283 static int parse_record_synth_option(const struct option *opt,
3284 				     const char *str,
3285 				     int unset __maybe_unused)
3286 {
3287 	struct record_opts *opts = opt->value;
3288 	char *p = strdup(str);
3289 
3290 	if (p == NULL)
3291 		return -1;
3292 
3293 	opts->synth = parse_synth_opt(p);
3294 	free(p);
3295 
3296 	if (opts->synth < 0) {
3297 		pr_err("Invalid synth option: %s\n", str);
3298 		return -1;
3299 	}
3300 	return 0;
3301 }
3302 
3303 /*
3304  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3305  * because we need to have access to it in record__exit(), which is called
3306  * after cmd_record() exits, but since record_options needs to be accessible to
3307  * builtin-script, leave it here.
3308  *
3309  * At least we don't touch it directly in all the other functions here.
3310  *
3311  * Just say no to tons of global variables, sigh.
3312  */
3313 static struct record record = {
3314 	.opts = {
3315 		.sample_time	     = true,
3316 		.mmap_pages	     = UINT_MAX,
3317 		.user_freq	     = UINT_MAX,
3318 		.user_interval	     = ULLONG_MAX,
3319 		.freq		     = 4000,
3320 		.target		     = {
3321 			.uses_mmap   = true,
3322 			.default_per_cpu = true,
3323 		},
3324 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3325 		.nr_threads_synthesize = 1,
3326 		.ctl_fd              = -1,
3327 		.ctl_fd_ack          = -1,
3328 		.synth               = PERF_SYNTH_ALL,
3329 	},
3330 	.tool = {
3331 		.sample		= process_sample_event,
3332 		.fork		= perf_event__process_fork,
3333 		.exit		= perf_event__process_exit,
3334 		.comm		= perf_event__process_comm,
3335 		.namespaces	= perf_event__process_namespaces,
3336 		.mmap		= build_id__process_mmap,
3337 		.mmap2		= build_id__process_mmap2,
3338 		.itrace_start	= process_timestamp_boundary,
3339 		.aux		= process_timestamp_boundary,
3340 		.ordered_events	= true,
3341 	},
3342 };
3343 
3344 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3345 	"\n\t\t\t\tDefault: fp";
3346 
3347 static bool dry_run;
3348 
3349 static struct parse_events_option_args parse_events_option_args = {
3350 	.evlistp = &record.evlist,
3351 };
3352 
3353 static struct parse_events_option_args switch_output_parse_events_option_args = {
3354 	.evlistp = &record.sb_evlist,
3355 };
3356 
3357 /*
3358  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3359  * with it and switch to using the library functions in perf_evlist that came
3360  * from builtin-record.c, i.e. use record_opts,
3361  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3362  * using pipes, etc.
3363  */
3364 static struct option __record_options[] = {
3365 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3366 		     "event selector. use 'perf list' to list available events",
3367 		     parse_events_option),
3368 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3369 		     "event filter", parse_filter),
3370 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3371 			   NULL, "don't record events from perf itself",
3372 			   exclude_perf),
3373 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3374 		    "record events on existing process id"),
3375 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3376 		    "record events on existing thread id"),
3377 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3378 		    "collect data with this RT SCHED_FIFO priority"),
3379 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3380 		    "collect data without buffering"),
3381 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3382 		    "collect raw sample records from all opened counters"),
3383 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3384 			    "system-wide collection from all CPUs"),
3385 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3386 		    "list of cpus to monitor"),
3387 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3388 	OPT_STRING('o', "output", &record.data.path, "file",
3389 		    "output file name"),
3390 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3391 			&record.opts.no_inherit_set,
3392 			"child tasks do not inherit counters"),
3393 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3394 		    "synthesize non-sample events at the end of output"),
3395 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3396 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3397 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3398 		    "Fail if the specified frequency can't be used"),
3399 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3400 		     "profile at this frequency",
3401 		      record__parse_freq),
3402 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3403 		     "number of mmap data pages and AUX area tracing mmap pages",
3404 		     record__parse_mmap_pages),
3405 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3406 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3407 		     record__mmap_flush_parse),
3408 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3409 			   NULL, "enables call-graph recording" ,
3410 			   &record_callchain_opt),
3411 	OPT_CALLBACK(0, "call-graph", &record.opts,
3412 		     "record_mode[,record_size]", record_callchain_help,
3413 		     &record_parse_callchain_opt),
3414 	OPT_INCR('v', "verbose", &verbose,
3415 		    "be more verbose (show counter open errors, etc)"),
3416 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3417 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3418 		    "per thread counts"),
3419 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3420 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3421 		    "Record the sample physical addresses"),
3422 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3423 		    "Record the sampled data address data page size"),
3424 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3425 		    "Record the sampled code address (ip) page size"),
3426 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3427 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3428 		    "Record the sample identifier"),
3429 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3430 			&record.opts.sample_time_set,
3431 			"Record the sample timestamps"),
3432 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3433 			"Record the sample period"),
3434 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3435 		    "don't sample"),
3436 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3437 			&record.no_buildid_cache_set,
3438 			"do not update the buildid cache"),
3439 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3440 			&record.no_buildid_set,
3441 			"do not collect buildids in perf.data"),
3442 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3443 		     "monitor event in cgroup name only",
3444 		     parse_cgroups),
3445 	OPT_CALLBACK('D', "delay", &record, "ms",
3446 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3447 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3448 		     record__parse_event_enable_time),
3449 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3450 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3451 		   "user to profile"),
3452 
3453 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3454 		     "branch any", "sample any taken branches",
3455 		     parse_branch_stack),
3456 
3457 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3458 		     "branch filter mask", "branch stack filter modes",
3459 		     parse_branch_stack),
3460 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3461 		    "sample by weight (on special events only)"),
3462 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3463 		    "sample transaction flags (special events only)"),
3464 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3465 		    "use per-thread mmaps"),
3466 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3467 		    "sample selected machine registers on interrupt,"
3468 		    " use '-I?' to list register names", parse_intr_regs),
3469 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3470 		    "sample selected machine registers on interrupt,"
3471 		    " use '--user-regs=?' to list register names", parse_user_regs),
3472 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3473 		    "Record running/enabled time of read (:S) events"),
3474 	OPT_CALLBACK('k', "clockid", &record.opts,
3475 	"clockid", "clockid to use for events, see clock_gettime()",
3476 	parse_clockid),
3477 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3478 			  "opts", "AUX area tracing Snapshot Mode", ""),
3479 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3480 			  "opts", "sample AUX area", ""),
3481 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3482 			"per thread proc mmap processing timeout in ms"),
3483 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3484 		    "Record namespaces events"),
3485 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3486 		    "Record cgroup events"),
3487 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3488 			&record.opts.record_switch_events_set,
3489 			"Record context switch events"),
3490 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3491 			 "Configure all used events to run in kernel space.",
3492 			 PARSE_OPT_EXCLUSIVE),
3493 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3494 			 "Configure all used events to run in user space.",
3495 			 PARSE_OPT_EXCLUSIVE),
3496 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3497 		    "collect kernel callchains"),
3498 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3499 		    "collect user callchains"),
3500 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3501 		   "file", "vmlinux pathname"),
3502 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3503 		    "Record build-id of all DSOs regardless of hits"),
3504 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3505 		    "Record build-id in map events"),
3506 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3507 		    "append timestamp to output filename"),
3508 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3509 		    "Record timestamp boundary (time of first/last samples)"),
3510 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3511 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3512 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3513 			  "signal"),
3514 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3515 			 &record.switch_output_event_set, "switch output event",
3516 			 "switch output event selector. use 'perf list' to list available events",
3517 			 parse_events_option_new_evlist),
3518 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3519 		   "Limit number of switch output generated files"),
3520 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3521 		    "Parse options then exit"),
3522 #ifdef HAVE_AIO_SUPPORT
3523 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3524 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3525 		     record__aio_parse),
3526 #endif
3527 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3528 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3529 		     record__parse_affinity),
3530 #ifdef HAVE_ZSTD_SUPPORT
3531 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3532 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3533 			    record__parse_comp_level),
3534 #endif
3535 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3536 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3537 	OPT_UINTEGER(0, "num-thread-synthesize",
3538 		     &record.opts.nr_threads_synthesize,
3539 		     "number of threads to run for event synthesis"),
3540 #ifdef HAVE_LIBPFM
3541 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3542 		"libpfm4 event selector. use 'perf list' to list available events",
3543 		parse_libpfm_events_option),
3544 #endif
3545 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3546 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3547 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3548 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3549 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3550 		      parse_control_option),
3551 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3552 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3553 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3554 			  &record.debuginfod.set, "debuginfod urls",
3555 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3556 			  "system"),
3557 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3558 			    "write collected trace data into several data files using parallel threads",
3559 			    record__parse_threads),
3560 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3561 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3562 		   "BPF filter action"),
3563 	OPT_END()
3564 };
3565 
3566 struct option *record_options = __record_options;
3567 
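/*
 * Set one bit in the mask for each CPU in 'cpus', skipping the dummy "any CPU"
 * map, and reject CPU numbers that don't fit in the allocated bitmap.
 */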
3568 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3569 {
3570 	struct perf_cpu cpu;
3571 	int idx;
3572 
3573 	if (cpu_map__is_dummy(cpus))
3574 		return 0;
3575 
3576 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3577 		/* Return -ENODEV if the input cpu is greater than max cpu */
3578 		if ((unsigned long)cpu.cpu > mask->nbits)
3579 			return -ENODEV;
3580 		__set_bit(cpu.cpu, mask->bits);
3581 	}
3582 
3583 	return 0;
3584 }
3585 
3586 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3587 {
3588 	struct perf_cpu_map *cpus;
3589 	int ret;
3590 
3591 	cpus = perf_cpu_map__new(mask_spec);
3592 	if (!cpus)
3593 		return -ENOMEM;
3594 
3595 	bitmap_zero(mask->bits, mask->nbits);
3596 	ret = record__mmap_cpu_mask_init(mask, cpus);
3597 	/* Put the CPU map on the error path too, so it is not leaked. */
3598 	perf_cpu_map__put(cpus);
3599 
3600 	return ret ? -ENODEV : 0;
3601 }
3602 
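/*
 * Free the maps and affinity bitmaps of each thread mask, then the
 * thread_masks array itself.
 */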
3603 static void record__free_thread_masks(struct record *rec, int nr_threads)
3604 {
3605 	int t;
3606 
3607 	if (rec->thread_masks)
3608 		for (t = 0; t < nr_threads; t++)
3609 			record__thread_mask_free(&rec->thread_masks[t]);
3610 
3611 	zfree(&rec->thread_masks);
3612 }
3613 
3614 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3615 {
3616 	int t, ret;
3617 
3618 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3619 	if (!rec->thread_masks) {
3620 		pr_err("Failed to allocate thread masks\n");
3621 		return -ENOMEM;
3622 	}
3623 
3624 	for (t = 0; t < nr_threads; t++) {
3625 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3626 		if (ret) {
3627 			pr_err("Failed to allocate thread masks[%d]\n", t);
3628 			goto out_free;
3629 		}
3630 	}
3631 
3632 	return 0;
3633 
3634 out_free:
3635 	record__free_thread_masks(rec, nr_threads);
3636 
3637 	return ret;
3638 }
3639 
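/*
 * --threads=cpu: one writer thread per recorded CPU; both the maps and
 * the affinity mask of each thread contain just that CPU.
 */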
3640 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3641 {
3642 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3643 
3644 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3645 	if (ret)
3646 		return ret;
3647 
3648 	rec->nr_threads = nr_cpus;
3649 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3650 
3651 	for (t = 0; t < rec->nr_threads; t++) {
3652 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3653 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3654 		if (verbose > 0) {
3655 			pr_debug("thread_masks[%d]: ", t);
3656 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3657 			pr_debug("thread_masks[%d]: ", t);
3658 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3659 		}
3660 	}
3661 
3662 	return 0;
3663 }
3664 
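/*
 * Build one thread mask per maps/affinity spec pair. Each spec is a CPU
 * list parsed by perf_cpu_map__new(); the resulting masks are intersected
 * with the CPUs being recorded and must be neither empty nor overlapping
 * with the masks of previously processed specs.
 */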
3665 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3666 					  const char **maps_spec, const char **affinity_spec,
3667 					  u32 nr_spec)
3668 {
3669 	u32 s;
3670 	int ret = 0, t = 0;
3671 	struct mmap_cpu_mask cpus_mask;
3672 	struct thread_mask thread_mask, full_mask, *thread_masks;
3673 
3674 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3675 	if (ret) {
3676 		pr_err("Failed to allocate CPUs mask\n");
3677 		return ret;
3678 	}
3679 
3680 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3681 	if (ret) {
3682 		pr_err("Failed to init cpu mask\n");
3683 		goto out_free_cpu_mask;
3684 	}
3685 
3686 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3687 	if (ret) {
3688 		pr_err("Failed to allocate full mask\n");
3689 		goto out_free_cpu_mask;
3690 	}
3691 
3692 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3693 	if (ret) {
3694 		pr_err("Failed to allocate thread mask\n");
3695 		goto out_free_full_and_cpu_masks;
3696 	}
3697 
3698 	for (s = 0; s < nr_spec; s++) {
3699 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3700 		if (ret) {
3701 			pr_err("Failed to initialize maps thread mask\n");
3702 			goto out_free;
3703 		}
3704 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3705 		if (ret) {
3706 			pr_err("Failed to initialize affinity thread mask\n");
3707 			goto out_free;
3708 		}
3709 
3710 		/* ignore invalid CPUs but do not allow empty masks */
3711 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3712 				cpus_mask.bits, thread_mask.maps.nbits)) {
3713 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3714 			ret = -EINVAL;
3715 			goto out_free;
3716 		}
3717 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3718 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3719 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3720 			ret = -EINVAL;
3721 			goto out_free;
3722 		}
3723 
3724 		/* do not allow intersection with other masks (full_mask) */
3725 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3726 				      thread_mask.maps.nbits)) {
3727 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3728 			ret = -EINVAL;
3729 			goto out_free;
3730 		}
3731 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3732 				      thread_mask.affinity.nbits)) {
3733 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3734 			ret = -EINVAL;
3735 			goto out_free;
3736 		}
3737 
3738 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3739 			  thread_mask.maps.bits, full_mask.maps.nbits);
3740 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3741 			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3742 
3743 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3744 		if (!thread_masks) {
3745 			pr_err("Failed to reallocate thread masks\n");
3746 			ret = -ENOMEM;
3747 			goto out_free;
3748 		}
3749 		rec->thread_masks = thread_masks;
3750 		rec->thread_masks[t] = thread_mask;
3751 		if (verbose > 0) {
3752 			pr_debug("thread_masks[%d]: ", t);
3753 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3754 			pr_debug("thread_masks[%d]: ", t);
3755 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3756 		}
3757 		t++;
3758 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3759 		if (ret) {
3760 			pr_err("Failed to allocate thread mask\n");
3761 			goto out_free_full_and_cpu_masks;
3762 		}
3763 	}
3764 	rec->nr_threads = t;
3765 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3766 	if (!rec->nr_threads)
3767 		ret = -EINVAL;
3768 
3769 out_free:
3770 	record__thread_mask_free(&thread_mask);
3771 out_free_full_and_cpu_masks:
3772 	record__thread_mask_free(&full_mask);
3773 out_free_cpu_mask:
3774 	record__mmap_cpu_mask_free(&cpus_mask);
3775 
3776 	return ret;
3777 }
3778 
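/*
 * --threads=core|package|numa: reuse the spec path above with CPU lists
 * taken from the CPU or NUMA topology, using the same list for both the
 * maps and the affinity mask of each thread.
 */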
3779 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3780 {
3781 	int ret;
3782 	struct cpu_topology *topo;
3783 
3784 	topo = cpu_topology__new();
3785 	if (!topo) {
3786 		pr_err("Failed to allocate CPU topology\n");
3787 		return -ENOMEM;
3788 	}
3789 
3790 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3791 					     topo->core_cpus_list, topo->core_cpus_lists);
3792 	cpu_topology__delete(topo);
3793 
3794 	return ret;
3795 }
3796 
3797 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3798 {
3799 	int ret;
3800 	struct cpu_topology *topo;
3801 
3802 	topo = cpu_topology__new();
3803 	if (!topo) {
3804 		pr_err("Failed to allocate CPU topology\n");
3805 		return -ENOMEM;
3806 	}
3807 
3808 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3809 					     topo->package_cpus_list, topo->package_cpus_lists);
3810 	cpu_topology__delete(topo);
3811 
3812 	return ret;
3813 }
3814 
3815 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3816 {
3817 	u32 s;
3818 	int ret;
3819 	const char **spec;
3820 	struct numa_topology *topo;
3821 
3822 	topo = numa_topology__new();
3823 	if (!topo) {
3824 		pr_err("Failed to allocate NUMA topology\n");
3825 		return -ENOMEM;
3826 	}
3827 
3828 	spec = zalloc(topo->nr * sizeof(char *));
3829 	if (!spec) {
3830 		pr_err("Failed to allocate NUMA spec\n");
3831 		ret = -ENOMEM;
3832 		goto out_delete_topo;
3833 	}
3834 	for (s = 0; s < topo->nr; s++)
3835 		spec[s] = topo->nodes[s].cpus;
3836 
3837 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3838 
3839 	zfree(&spec);
3840 
3841 out_delete_topo:
3842 	numa_topology__delete(topo);
3843 
3844 	return ret;
3845 }
3846 
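/*
 * --threads=<maps cpus>/<affinity cpus>[:<maps cpus>/<affinity cpus>...]:
 * user-defined masks. For example (hypothetical values),
 * --threads=0-3/0-3:4-7/4-7 creates two writer threads, the first servicing
 * the mmap buffers of CPUs 0-3 with its affinity set to CPUs 0-3, the second
 * handling CPUs 4-7 likewise.
 */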
3847 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3848 {
3849 	int t, ret;
3850 	u32 s, nr_spec = 0;
3851 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3852 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3853 
3854 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3855 		spec = strtok_r(user_spec, ":", &spec_ptr);
3856 		if (spec == NULL)
3857 			break;
3858 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3859 		mask = strtok_r(spec, "/", &mask_ptr);
3860 		if (mask == NULL)
3861 			break;
3862 		pr_debug2("  maps mask: %s\n", mask);
3863 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3864 		if (!tmp_spec) {
3865 			pr_err("Failed to reallocate maps spec\n");
3866 			ret = -ENOMEM;
3867 			goto out_free;
3868 		}
3869 		maps_spec = tmp_spec;
3870 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3871 		if (!maps_spec[nr_spec]) {
3872 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3873 			ret = -ENOMEM;
3874 			goto out_free;
3875 		}
3876 		mask = strtok_r(NULL, "/", &mask_ptr);
3877 		if (mask == NULL) {
3878 			pr_err("Invalid thread maps or affinity specs\n");
3879 			ret = -EINVAL;
3880 			goto out_free;
3881 		}
3882 		pr_debug2("  affinity mask: %s\n", mask);
3883 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3884 		if (!tmp_spec) {
3885 			pr_err("Failed to reallocate affinity spec\n");
3886 			ret = -ENOMEM;
3887 			goto out_free;
3888 		}
3889 		affinity_spec = tmp_spec;
3890 		affinity_spec[nr_spec] = strdup(mask);
3891 		if (!affinity_spec[nr_spec]) {
3892 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3893 			ret = -ENOMEM;
3894 			goto out_free;
3895 		}
3896 		dup_mask = NULL;
3897 		nr_spec++;
3898 	}
3899 
3900 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3901 					     (const char **)affinity_spec, nr_spec);
3902 
3903 out_free:
3904 	free(dup_mask);
3905 	for (s = 0; s < nr_spec; s++) {
3906 		if (maps_spec)
3907 			free(maps_spec[s]);
3908 		if (affinity_spec)
3909 			free(affinity_spec[s]);
3910 	}
3911 	free(affinity_spec);
3912 	free(maps_spec);
3913 
3914 	return ret;
3915 }
3916 
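/*
 * Default (no --threads): a single writer thread whose maps mask covers all
 * recorded CPUs; the affinity mask is left zeroed.
 */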
3917 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3918 {
3919 	int ret;
3920 
3921 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3922 	if (ret)
3923 		return ret;
3924 
3925 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3926 		return -ENODEV;
3927 
3928 	rec->nr_threads = 1;
3929 
3930 	return 0;
3931 }
3932 
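/*
 * Dispatch on the --threads argument. Parallel streaming is incompatible
 * with --per-thread; without --threads the single default mask is used.
 */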
3933 static int record__init_thread_masks(struct record *rec)
3934 {
3935 	int ret = 0;
3936 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3937 
3938 	if (!record__threads_enabled(rec))
3939 		return record__init_thread_default_masks(rec, cpus);
3940 
3941 	if (evlist__per_thread(rec->evlist)) {
3942 		pr_err("--per-thread option is mutually exclusive with parallel streaming mode.\n");
3943 		return -EINVAL;
3944 	}
3945 
3946 	switch (rec->opts.threads_spec) {
3947 	case THREAD_SPEC__CPU:
3948 		ret = record__init_thread_cpu_masks(rec, cpus);
3949 		break;
3950 	case THREAD_SPEC__CORE:
3951 		ret = record__init_thread_core_masks(rec, cpus);
3952 		break;
3953 	case THREAD_SPEC__PACKAGE:
3954 		ret = record__init_thread_package_masks(rec, cpus);
3955 		break;
3956 	case THREAD_SPEC__NUMA:
3957 		ret = record__init_thread_numa_masks(rec, cpus);
3958 		break;
3959 	case THREAD_SPEC__USER:
3960 		ret = record__init_thread_user_masks(rec, cpus);
3961 		break;
3962 	default:
3963 		break;
3964 	}
3965 
3966 	return ret;
3967 }
3968 
3969 int cmd_record(int argc, const char **argv)
3970 {
3971 	int err;
3972 	struct record *rec = &record;
3973 	char errbuf[BUFSIZ];
3974 
3975 	setlocale(LC_ALL, "");
3976 
3977 #ifndef HAVE_BPF_SKEL
3978 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3979 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3980 # undef set_nobuild
3981 #endif
3982 
3983 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3984 	symbol_conf.lazy_load_kernel_maps = true;
3985 	rec->opts.affinity = PERF_AFFINITY_SYS;
3986 
3987 	rec->evlist = evlist__new();
3988 	if (rec->evlist == NULL)
3989 		return -ENOMEM;
3990 
3991 	err = perf_config(perf_record_config, rec);
3992 	if (err)
3993 		return err;
3994 
3995 	argc = parse_options(argc, argv, record_options, record_usage,
3996 			    PARSE_OPT_STOP_AT_NON_OPTION);
3997 	if (quiet)
3998 		perf_quiet_option();
3999 
4000 	err = symbol__validate_sym_arguments();
4001 	if (err)
4002 		return err;
4003 
4004 	perf_debuginfod_setup(&record.debuginfod);
4005 
4006 	/* Make system wide (-a) the default target. */
4007 	if (!argc && target__none(&rec->opts.target))
4008 		rec->opts.target.system_wide = true;
4009 
4010 	if (nr_cgroups && !rec->opts.target.system_wide) {
4011 		usage_with_options_msg(record_usage, record_options,
4012 			"cgroup monitoring only available in system-wide mode");
4013 
4014 	}
4015 
4016 	if (rec->buildid_mmap) {
4017 		if (!perf_can_record_build_id()) {
4018 			pr_err("Failed: no support for recording build ids in mmap events, update your kernel.\n");
4019 			err = -EINVAL;
4020 			goto out_opts;
4021 		}
4022 		pr_debug("Enabling build id in mmap2 events.\n");
4023 		/* Enable mmap build id synthesizing. */
4024 		symbol_conf.buildid_mmap2 = true;
4025 		/* Enable perf_event_attr::build_id bit. */
4026 		rec->opts.build_id = true;
4027 		/* Disable build id cache. */
4028 		rec->no_buildid = true;
4029 	}
4030 
4031 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4032 		pr_err("Kernel has no cgroup sampling support.\n");
4033 		err = -EINVAL;
4034 		goto out_opts;
4035 	}
4036 
4037 	if (rec->opts.kcore)
4038 		rec->opts.text_poke = true;
4039 
4040 	if (rec->opts.kcore || record__threads_enabled(rec))
4041 		rec->data.is_dir = true;
4042 
4043 	if (record__threads_enabled(rec)) {
4044 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4045 			pr_err("--affinity option is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
4046 			goto out_opts;
4047 		}
4048 		if (record__aio_enabled(rec)) {
4049 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
4050 			goto out_opts;
4051 		}
4052 	}
4053 
4054 	if (rec->opts.comp_level != 0) {
4055 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4056 		rec->no_buildid = true;
4057 	}
4058 
4059 	if (rec->opts.record_switch_events &&
4060 	    !perf_can_record_switch_events()) {
4061 		ui__error("kernel does not support recording context switch events\n");
4062 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4063 		err = -EINVAL;
4064 		goto out_opts;
4065 	}
4066 
4067 	if (switch_output_setup(rec)) {
4068 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4069 		err = -EINVAL;
4070 		goto out_opts;
4071 	}
4072 
4073 	if (rec->switch_output.time) {
4074 		signal(SIGALRM, alarm_sig_handler);
4075 		alarm(rec->switch_output.time);
4076 	}
4077 
4078 	if (rec->switch_output.num_files) {
4079 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4080 						      sizeof(char *));
4081 		if (!rec->switch_output.filenames) {
4082 			err = -ENOMEM;
4083 			goto out_opts;
4084 		}
4085 	}
4086 
4087 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4088 		rec->timestamp_filename = false;
4089 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4090 	}
4091 
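	/*
	 * --setup-filter pin|unpin only pins or unpins the BPF filter object
	 * and exits without recording.
	 */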
4092 	if (rec->filter_action) {
4093 		if (!strcmp(rec->filter_action, "pin"))
4094 			err = perf_bpf_filter__pin();
4095 		else if (!strcmp(rec->filter_action, "unpin"))
4096 			err = perf_bpf_filter__unpin();
4097 		else {
4098 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4099 			err = -EINVAL;
4100 		}
4101 		goto out_opts;
4102 	}
4103 
4104 	/*
4105 	 * Allow aliases to facilitate the lookup of symbols for address
4106 	 * filters. Refer to auxtrace_parse_filters().
4107 	 */
4108 	symbol_conf.allow_aliases = true;
4109 
4110 	symbol__init(NULL);
4111 
4112 	err = record__auxtrace_init(rec);
4113 	if (err)
4114 		goto out;
4115 
4116 	if (dry_run)
4117 		goto out;
4118 
4119 	err = -ENOMEM;
4120 
4121 	if (rec->no_buildid_cache || rec->no_buildid) {
4122 		disable_buildid_cache();
4123 	} else if (rec->switch_output.enabled) {
4124 		/*
4125 		 * In 'perf record --switch-output', disable buildid
4126 		 * generation by default to reduce data file switching
4127 		 * overhead. Still generate buildids if they are required
4128 		 * explicitly using
4129 		 *
4130 		 *  perf record --switch-output --no-no-buildid \
4131 		 *              --no-no-buildid-cache
4132 		 *
4133 		 * Following code equals to:
4134 		 *
4135 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4136 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4137 		 *         disable_buildid_cache();
4138 		 */
4139 		bool disable = true;
4140 
4141 		if (rec->no_buildid_set && !rec->no_buildid)
4142 			disable = false;
4143 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4144 			disable = false;
4145 		if (disable) {
4146 			rec->no_buildid = true;
4147 			rec->no_buildid_cache = true;
4148 			disable_buildid_cache();
4149 		}
4150 	}
4151 
4152 	if (record.opts.overwrite)
4153 		record.opts.tail_synthesize = true;
4154 
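	/*
	 * No events were specified on the command line: default to cycles,
	 * restricted to user space when the perf_event_paranoid setting does
	 * not allow kernel profiling.
	 */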
4155 	if (rec->evlist->core.nr_entries == 0) {
4156 		bool can_profile_kernel = perf_event_paranoid_check(1);
4157 
4158 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4159 		if (err)
4160 			goto out;
4161 	}
4162 
4163 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4164 		rec->opts.no_inherit = true;
4165 
4166 	err = target__validate(&rec->opts.target);
4167 	if (err) {
4168 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4169 		ui__warning("%s\n", errbuf);
4170 	}
4171 
4172 	err = target__parse_uid(&rec->opts.target);
4173 	if (err) {
4174 		int saved_errno = errno;
4175 
4176 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4177 		ui__error("%s", errbuf);
4178 
4179 		err = -saved_errno;
4180 		goto out;
4181 	}
4182 
4183 	/* Enable ignoring missing threads when -u/-p option is defined. */
4184 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4185 
4186 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4187 
4188 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4189 		arch__add_leaf_frame_record_opts(&rec->opts);
4190 
4191 	err = -ENOMEM;
4192 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4193 		if (rec->opts.target.pid != NULL) {
4194 			pr_err("Couldn't create thread/CPU maps: %s\n",
4195 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4196 			goto out;
4197 		}
4198 		else
4199 			usage_with_options(record_usage, record_options);
4200 	}
4201 
4202 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4203 	if (err)
4204 		goto out;
4205 
4206 	/*
4207 	 * We take all buildids when the file contains AUX area
4208 	 * tracing data because we do not decode the trace, as
4209 	 * decoding it would take too long.
4210 	 */
4211 	if (rec->opts.full_auxtrace)
4212 		rec->buildid_all = true;
4213 
4214 	if (rec->opts.text_poke) {
4215 		err = record__config_text_poke(rec->evlist);
4216 		if (err) {
4217 			pr_err("record__config_text_poke failed, error %d\n", err);
4218 			goto out;
4219 		}
4220 	}
4221 
4222 	if (rec->off_cpu) {
4223 		err = record__config_off_cpu(rec);
4224 		if (err) {
4225 			pr_err("record__config_off_cpu failed, error %d\n", err);
4226 			goto out;
4227 		}
4228 	}
4229 
4230 	if (record_opts__config(&rec->opts)) {
4231 		err = -EINVAL;
4232 		goto out;
4233 	}
4234 
4235 	err = record__config_tracking_events(rec);
4236 	if (err) {
4237 		pr_err("record__config_tracking_events failed, error %d\n", err);
4238 		goto out;
4239 	}
4240 
4241 	err = record__init_thread_masks(rec);
4242 	if (err) {
4243 		pr_err("Failed to initialize parallel data streaming masks\n");
4244 		goto out;
4245 	}
4246 
4247 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4248 		rec->opts.nr_cblocks = nr_cblocks_max;
4249 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4250 
4251 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4252 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4253 
4254 	if (rec->opts.comp_level > comp_level_max)
4255 		rec->opts.comp_level = comp_level_max;
4256 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4257 
4258 	err = __cmd_record(&record, argc, argv);
4259 out:
4260 	record__free_thread_masks(rec, rec->nr_threads);
4261 	rec->nr_threads = 0;
4262 	symbol__exit();
4263 	auxtrace_record__free(rec->itr);
4264 out_opts:
4265 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4266 	evlist__delete(rec->evlist);
4267 	return err;
4268 }
4269 
4270 static void snapshot_sig_handler(int sig __maybe_unused)
4271 {
4272 	struct record *rec = &record;
4273 
4274 	hit_auxtrace_snapshot_trigger(rec);
4275 
4276 	if (switch_output_signal(rec))
4277 		trigger_hit(&switch_output_trigger);
4278 }
4279 
4280 static void alarm_sig_handler(int sig __maybe_unused)
4281 {
4282 	struct record *rec = &record;
4283 
4284 	if (switch_output_time(rec))
4285 		trigger_hit(&switch_output_trigger);
4286 }
4287