xref: /linux/tools/perf/builtin-record.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/*
 * Output-switching settings consulted by switch_output_signal(),
 * switch_output_size() and switch_output_time().
 */
struct switch_output {
	bool enabled;
	bool signal;		/* tested by switch_output_signal() */
	unsigned long size;	/* compared with rec->bytes_written; 0 disables the size check */
	unsigned long time;	/* non-zero enables switch_output_time() */
	const char *str;
	bool set;
	char **filenames;	/* NOTE(review): presumably num_files entries - confirm */
	int num_files;
	int cur_file;
};
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
/* Thread-local pointer to the current thread's state; used by record__write() and record__pushfn(). */
static __thread struct record_thread *thread;
122 
/* Message identifiers; THREAD_MSG__MAX is the number of identifiers. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name for each enum thread_msg value, indexed by the value. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
132 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of kinds. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Lower-case name for each enum thread_spec value, indexed by the value. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
146 
/* Associates an index in the evlist pollfd array with one in a thread pollfd array. */
struct pollfd_index_map {
	int evlist_pollfd_index;	/* index into evlist->core.pollfd.entries */
	int thread_pollfd_index;	/* index into the thread's pollfd.entries */
};
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
/* Becomes 1 once recording should stop (set by sig_handler() and by the size limit in record__write()). */
static volatile int done;

/* Read by record__auxtrace_snapshot_exit(); NOTE(review): written outside this part of the file. */
static volatile int auxtrace_record__snapshot_started;

static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
#ifndef HAVE_GETTID
/* Fallback used when HAVE_GETTID is not defined: issue the gettid system call directly. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
/* Forward declarations: these are used before their definitions further down. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
			     void *dst, size_t dst_size,
			     void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at file offset @off.
 *
 * aio_write() is retried for as long as it fails with EAGAIN. On any other
 * failure the control block is marked unused (aio_fildes = -1) and an error
 * is logged.
 *
 * Returns the result of the last aio_write() call (0 on success).
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * remainder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
/* Context passed by record__aio_push() to record__aio_pushfn(). */
struct record_aio {
	struct record *rec;
	void *data;	/* destination buffer (one of map->aio.data[]) */
	size_t size;	/* bytes stored in data so far */
};
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the remainder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 						   mmap__mmap_len(map) - aio->size,
410 						   buf, size);
411 		if (compressed < 0)
412 			return (int)compressed;
413 
414 		size = compressed;
415 	} else {
416 		memcpy(aio->data + aio->size, buf, size);
417 	}
418 
419 	if (!aio->size) {
420 		/*
421 		 * Increment map->refcount to guard map->aio.data[] buffer
422 		 * from premature deallocation because map object can be
423 		 * released earlier than aio write request started on
424 		 * map->aio.data[] buffer is complete.
425 		 *
426 		 * perf_mmap__put() is done at record__aio_complete()
427 		 * after started aio request completion or at record__aio_push()
428 		 * if the request failed to start.
429 		 */
430 		perf_mmap__get(&map->core);
431 	}
432 
433 	aio->size += size;
434 
435 	return size;
436 }
437 
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440 	int ret, idx;
441 	int trace_fd = rec->session->data->file.fd;
442 	struct record_aio aio = { .rec = rec, .size = 0 };
443 
444 	/*
445 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
446 	 * becomes available after previous aio write operation.
447 	 */
448 
449 	idx = record__aio_sync(map, false);
450 	aio.data = map->aio.data[idx];
451 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 		return ret;
454 
455 	rec->samples++;
456 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457 	if (!ret) {
458 		*off += aio.size;
459 		rec->bytes_written += aio.size;
460 		if (switch_output_size(rec))
461 			trigger_hit(&switch_output_trigger);
462 	} else {
463 		/*
464 		 * Decrement map->refcount incremented in record__aio_pushfn()
465 		 * back if record__aio_write() operation failed to start, otherwise
466 		 * map->refcount is decremented in record__aio_complete() after
467 		 * aio write operation finishes successfully.
468 		 */
469 		perf_mmap__put(&map->core);
470 	}
471 
472 	return ret;
473 }
474 
/* Return the current file offset of @trace_fd as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
479 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487 	int i;
488 	struct evlist *evlist = rec->evlist;
489 	struct mmap *maps = evlist->mmap;
490 
491 	if (!record__aio_enabled(rec))
492 		return;
493 
494 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
495 		struct mmap *map = &maps[i];
496 
497 		if (map->core.base)
498 			record__aio_sync(map, true);
499 	}
500 }
501 
/* Control block count applied by record__aio_parse() when none (or 0) is given. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper bound for nr_cblocks; its use is outside this view. */
static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506 			     const char *str,
507 			     int unset)
508 {
509 	struct record_opts *opts = (struct record_opts *)opt->value;
510 
511 	if (unset) {
512 		opts->nr_cblocks = 0;
513 	} else {
514 		if (str)
515 			opts->nr_cblocks = strtol(str, NULL, 0);
516 		if (!opts->nr_cblocks)
517 			opts->nr_cblocks = nr_cblocks_default;
518 	}
519 
520 	return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
/* Built without HAVE_AIO_SUPPORT: the maximum is 0. */
static int nr_cblocks_max = 0;
524 
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526 			    off_t *off __maybe_unused)
527 {
528 	return -1;
529 }
530 
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533 	return -1;
534 }
535 
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539 
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547 	return rec->opts.nr_cblocks > 0;
548 }
549 
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552 				    const char *str,
553 				    int unset)
554 {
555 	int flush_max;
556 	struct record_opts *opts = (struct record_opts *)opt->value;
557 	static struct parse_tag tags[] = {
558 			{ .tag  = 'B', .mult = 1       },
559 			{ .tag  = 'K', .mult = 1 << 10 },
560 			{ .tag  = 'M', .mult = 1 << 20 },
561 			{ .tag  = 'G', .mult = 1 << 30 },
562 			{ .tag  = 0 },
563 	};
564 
565 	if (unset)
566 		return 0;
567 
568 	if (str) {
569 		opts->mmap_flush = parse_tag_value(str, tags);
570 		if (opts->mmap_flush == (int)-1)
571 			opts->mmap_flush = strtol(str, NULL, 0);
572 	}
573 
574 	if (!opts->mmap_flush)
575 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577 	flush_max = evlist__mmap_size(opts->mmap_pages);
578 	flush_max /= 4;
579 	if (opts->mmap_flush > flush_max)
580 		opts->mmap_flush = flush_max;
581 
582 	return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590 	struct record_opts *opts = opt->value;
591 
592 	if (unset) {
593 		opts->comp_level = 0;
594 	} else {
595 		if (str)
596 			opts->comp_level = strtol(str, NULL, 0);
597 		if (!opts->comp_level)
598 			opts->comp_level = comp_level_default;
599 	}
600 
601 	return 0;
602 }
603 #endif
/* NOTE(review): presumably the highest accepted compression level; its use is outside this view. */
static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608 	return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	struct record *rec = container_of(tool, struct record, tool);
617 	return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623 				     union perf_event *event,
624 				     struct perf_sample *sample __maybe_unused,
625 				     struct machine *machine __maybe_unused)
626 {
627 	int ret;
628 
629 	mutex_lock(&synth_lock);
630 	ret = process_synthesized_event(tool, event, sample, machine);
631 	mutex_unlock(&synth_lock);
632 	return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637 	struct record *rec = to;
638 
639 	if (record__comp_enabled(rec)) {
640 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
641 						   mmap__mmap_len(map), bf, size);
642 
643 		if (compressed < 0)
644 			return (int)compressed;
645 
646 		size = compressed;
647 		bf   = map->data;
648 	}
649 
650 	thread->samples++;
651 	return record__write(rec, map, bf, size);
652 }
653 
/* Last signal other than SIGCHLD seen by sig_handler(); -1 while none was seen. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* fd that sig_handler() writes to when it is >= 0; -1 means unset. */
static volatile sig_atomic_t done_fd = -1;
#endif
659 
660 static void sig_handler(int sig)
661 {
662 	if (sig == SIGCHLD)
663 		child_finished = 1;
664 	else
665 		signr = sig;
666 
667 	done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 	if (done_fd >= 0) {
670 		u64 tmp = 1;
671 		int orig_errno = errno;
672 
673 		/*
674 		 * It is possible for this signal handler to run after done is
675 		 * checked in the main loop, but before the perf counter fds are
676 		 * polled. If this happens, the poll() will continue to wait
677 		 * even though done is set, and will only break out if either
678 		 * another signal is received, or the counters are ready for
679 		 * read. To ensure the poll() doesn't sleep when done is set,
680 		 * use an eventfd (done_fd) to wake up the poll().
681 		 */
682 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683 			pr_err("failed to signal wakeup fd, error: %m\n");
684 
685 		errno = orig_errno;
686 	}
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
/* Handler that calls perf_hooks__recover() and then sighandler_dump_stack(@sig). */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
695 
696 static void record__sig_exit(void)
697 {
698 	if (signr == -1)
699 		return;
700 
701 	signal(signr, SIG_DFL);
702 	raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708 				    struct mmap *map,
709 				    union perf_event *event, void *data1,
710 				    size_t len1, void *data2, size_t len2)
711 {
712 	struct record *rec = container_of(tool, struct record, tool);
713 	struct perf_data *data = &rec->data;
714 	size_t padding;
715 	u8 pad[8] = {0};
716 
717 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718 		off_t file_offset;
719 		int fd = perf_data__fd(data);
720 		int err;
721 
722 		file_offset = lseek(fd, 0, SEEK_CUR);
723 		if (file_offset == -1)
724 			return -1;
725 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 						     event, file_offset);
727 		if (err)
728 			return err;
729 	}
730 
731 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732 	padding = (len1 + len2) & 7;
733 	if (padding)
734 		padding = 8 - padding;
735 
736 	record__write(rec, map, event, event->header.size);
737 	record__write(rec, map, data1, len1);
738 	if (len2)
739 		record__write(rec, map, data2, len2);
740 	record__write(rec, map, &pad, padding);
741 
742 	return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746 				      struct mmap *map)
747 {
748 	int ret;
749 
750 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751 				  record__process_auxtrace);
752 	if (ret < 0)
753 		return ret;
754 
755 	if (ret)
756 		rec->samples++;
757 
758 	return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762 					       struct mmap *map)
763 {
764 	int ret;
765 
766 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767 					   record__process_auxtrace,
768 					   rec->opts.auxtrace_snapshot_size);
769 	if (ret < 0)
770 		return ret;
771 
772 	if (ret)
773 		rec->samples++;
774 
775 	return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780 	int i;
781 	int rc = 0;
782 
783 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784 		struct mmap *map = &rec->evlist->mmap[i];
785 
786 		if (!map->auxtrace_mmap.base)
787 			continue;
788 
789 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790 			rc = -1;
791 			goto out;
792 		}
793 	}
794 out:
795 	return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800 	pr_debug("Recording AUX area tracing snapshot\n");
801 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
802 		trigger_error(&auxtrace_snapshot_trigger);
803 	} else {
804 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805 			trigger_error(&auxtrace_snapshot_trigger);
806 		else
807 			trigger_ready(&auxtrace_snapshot_trigger);
808 	}
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return 0;
815 
816 	if (!auxtrace_record__snapshot_started &&
817 	    auxtrace_record__snapshot_start(rec->itr))
818 		return -1;
819 
820 	record__read_auxtrace_snapshot(rec, true);
821 	if (trigger_is_error(&auxtrace_snapshot_trigger))
822 		return -1;
823 
824 	return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829 	int err;
830 
831 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832 	    && record__threads_enabled(rec)) {
833 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834 		return -EINVAL;
835 	}
836 
837 	if (!rec->itr) {
838 		rec->itr = auxtrace_record__init(rec->evlist, &err);
839 		if (err)
840 			return err;
841 	}
842 
843 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844 					      rec->opts.auxtrace_snapshot_opts);
845 	if (err)
846 		return err;
847 
848 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849 					    rec->opts.auxtrace_sample_opts);
850 	if (err)
851 		return err;
852 
853 	auxtrace_regroup_aux_output(rec->evlist);
854 
855 	return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862 			       struct mmap *map __maybe_unused)
863 {
864 	return 0;
865 }
866 
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869 				    bool on_exit __maybe_unused)
870 {
871 }
872 
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876 	return 0;
877 }
878 
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887 	return 0;
888 }
889 
890 #endif
891 
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If non-dummy evsel exists, system_wide sideband is need to
926 	 * help parse sample information.
927 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
928 	 * and PERF_EVENT_COMM event to help parse task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
948 	 * delay of waiting or event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 
953 		/*
954 		 * User space tasks can migrate between CPUs, so when tracing
955 		 * selected CPUs, sideband for all CPUs is still needed.
956 		 */
957 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958 			system_wide = true;
959 
960 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
961 		if (!evsel)
962 			return -ENOMEM;
963 
964 		/*
965 		 * Enable the tracking event when the process is forked for
966 		 * initial_delay, immediately for system wide.
967 		 */
968 		if (opts->target.initial_delay && !evsel->immediate &&
969 		    !target__has_cpu(&opts->target))
970 			evsel->core.attr.enable_on_exec = 1;
971 		else
972 			evsel->immediate = 1;
973 	}
974 
975 	return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980 	char kcore[PATH_MAX];
981 	int fd;
982 
983 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985 	fd = open(kcore, O_RDONLY);
986 	if (fd < 0)
987 		return false;
988 
989 	close(fd);
990 
991 	return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996 	char from_dir[PATH_MAX];
997 	char kcore_dir[PATH_MAX];
998 	int ret;
999 
1000 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003 	if (ret)
1004 		return ret;
1005 
1006 	return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011 	thread_data->pipes.msg[0] = -1;
1012 	thread_data->pipes.msg[1] = -1;
1013 	thread_data->pipes.ack[0] = -1;
1014 	thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019 	if (pipe(thread_data->pipes.msg))
1020 		return -EINVAL;
1021 
1022 	if (pipe(thread_data->pipes.ack)) {
1023 		close(thread_data->pipes.msg[0]);
1024 		thread_data->pipes.msg[0] = -1;
1025 		close(thread_data->pipes.msg[1]);
1026 		thread_data->pipes.msg[1] = -1;
1027 		return -EINVAL;
1028 	}
1029 
1030 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034 	return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039 	if (thread_data->pipes.msg[0] != -1) {
1040 		close(thread_data->pipes.msg[0]);
1041 		thread_data->pipes.msg[0] = -1;
1042 	}
1043 	if (thread_data->pipes.msg[1] != -1) {
1044 		close(thread_data->pipes.msg[1]);
1045 		thread_data->pipes.msg[1] = -1;
1046 	}
1047 	if (thread_data->pipes.ack[0] != -1) {
1048 		close(thread_data->pipes.ack[0]);
1049 		thread_data->pipes.ack[0] = -1;
1050 	}
1051 	if (thread_data->pipes.ack[1] != -1) {
1052 		close(thread_data->pipes.ack[1]);
1053 		thread_data->pipes.ack[1] = -1;
1054 	}
1055 }
1056 
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065 	struct mmap *mmap = evlist->mmap;
1066 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068 	bool per_thread = evlist__per_thread(evlist);
1069 
1070 	if (per_thread)
1071 		thread_data->nr_mmaps = nr_mmaps;
1072 	else
1073 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074 						      thread_data->mask->maps.nbits);
1075 	if (mmap) {
1076 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077 		if (!thread_data->maps)
1078 			return -ENOMEM;
1079 	}
1080 	if (overwrite_mmap) {
1081 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082 		if (!thread_data->overwrite_maps) {
1083 			zfree(&thread_data->maps);
1084 			return -ENOMEM;
1085 		}
1086 	}
1087 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091 		if (per_thread ||
1092 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093 			if (thread_data->maps) {
1094 				thread_data->maps[tm] = &mmap[m];
1095 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097 			}
1098 			if (thread_data->overwrite_maps) {
1099 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102 			}
1103 			tm++;
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112 	int f, tm, pos;
1113 	struct mmap *map, *overwrite_map;
1114 
1115 	fdarray__init(&thread_data->pollfd, 64);
1116 
1117 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119 		overwrite_map = thread_data->overwrite_maps ?
1120 				thread_data->overwrite_maps[tm] : NULL;
1121 
1122 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127 							      &evlist->core.pollfd);
1128 				if (pos < 0)
1129 					return pos;
1130 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132 			}
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141 	int t;
1142 	struct record_thread *thread_data = rec->thread_data;
1143 
1144 	if (thread_data == NULL)
1145 		return;
1146 
1147 	for (t = 0; t < rec->nr_threads; t++) {
1148 		record__thread_data_close_pipes(&thread_data[t]);
1149 		zfree(&thread_data[t].maps);
1150 		zfree(&thread_data[t].overwrite_maps);
1151 		fdarray__exit(&thread_data[t].pollfd);
1152 	}
1153 
1154 	zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158 						    int evlist_pollfd_index,
1159 						    int thread_pollfd_index)
1160 {
1161 	size_t x = rec->index_map_cnt;
1162 
1163 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164 		return -ENOMEM;
1165 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167 	rec->index_map_cnt += 1;
1168 	return 0;
1169 }
1170 
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172 						    struct evlist *evlist,
1173 						    struct record_thread *thread_data)
1174 {
1175 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1176 	struct pollfd *t_entries = thread_data->pollfd.entries;
1177 	int err = 0;
1178 	size_t i;
1179 
1180 	for (i = 0; i < rec->index_map_cnt; i++) {
1181 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1182 		int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1186 			pr_err("Thread and evlist pollfd index mismatch\n");
1187 			err = -EINVAL;
1188 			continue;
1189 		}
1190 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1191 	}
1192 	return err;
1193 }
1194 
1195 static int record__dup_non_perf_events(struct record *rec,
1196 				       struct evlist *evlist,
1197 				       struct record_thread *thread_data)
1198 {
1199 	struct fdarray *fda = &evlist->core.pollfd;
1200 	int i, ret;
1201 
1202 	for (i = 0; i < fda->nr; i++) {
1203 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204 			continue;
1205 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206 		if (ret < 0) {
1207 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208 			return ret;
1209 		}
1210 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211 			  thread_data, ret, fda->entries[i].fd);
1212 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213 		if (ret < 0) {
1214 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1215 			return ret;
1216 		}
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223 	int t, ret;
1224 	struct record_thread *thread_data;
1225 
1226 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227 	if (!rec->thread_data) {
1228 		pr_err("Failed to allocate thread data\n");
1229 		return -ENOMEM;
1230 	}
1231 	thread_data = rec->thread_data;
1232 
1233 	for (t = 0; t < rec->nr_threads; t++)
1234 		record__thread_data_init_pipes(&thread_data[t]);
1235 
1236 	for (t = 0; t < rec->nr_threads; t++) {
1237 		thread_data[t].rec = rec;
1238 		thread_data[t].mask = &rec->thread_masks[t];
1239 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240 		if (ret) {
1241 			pr_err("Failed to initialize thread[%d] maps\n", t);
1242 			goto out_free;
1243 		}
1244 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245 		if (ret) {
1246 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247 			goto out_free;
1248 		}
1249 		if (t) {
1250 			thread_data[t].tid = -1;
1251 			ret = record__thread_data_open_pipes(&thread_data[t]);
1252 			if (ret) {
1253 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1254 				goto out_free;
1255 			}
1256 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258 			if (ret < 0) {
1259 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260 				goto out_free;
1261 			}
1262 			thread_data[t].ctlfd_pos = ret;
1263 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264 				 thread_data, thread_data[t].ctlfd_pos,
1265 				 thread_data[t].pipes.msg[0]);
1266 		} else {
1267 			thread_data[t].tid = gettid();
1268 
1269 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270 			if (ret < 0)
1271 				goto out_free;
1272 
1273 			thread_data[t].ctlfd_pos = -1; /* Not used */
1274 		}
1275 	}
1276 
1277 	return 0;
1278 
1279 out_free:
1280 	record__free_thread_data(rec);
1281 
1282 	return ret;
1283 }
1284 
1285 static int record__mmap_evlist(struct record *rec,
1286 			       struct evlist *evlist)
1287 {
1288 	int i, ret;
1289 	struct record_opts *opts = &rec->opts;
1290 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291 				  opts->auxtrace_sample_mode;
1292 	char msg[512];
1293 
1294 	if (opts->affinity != PERF_AFFINITY_SYS)
1295 		cpu__setup_cpunode_map();
1296 
1297 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298 				 opts->auxtrace_mmap_pages,
1299 				 auxtrace_overwrite,
1300 				 opts->nr_cblocks, opts->affinity,
1301 				 opts->mmap_flush, opts->comp_level) < 0) {
1302 		if (errno == EPERM) {
1303 			pr_err("Permission error mapping pages.\n"
1304 			       "Consider increasing "
1305 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1307 			       "(current value: %u,%u)\n",
1308 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1309 			return -errno;
1310 		} else {
1311 			pr_err("failed to mmap with %d (%s)\n", errno,
1312 				str_error_r(errno, msg, sizeof(msg)));
1313 			if (errno)
1314 				return -errno;
1315 			else
1316 				return -EINVAL;
1317 		}
1318 	}
1319 
1320 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321 		return -1;
1322 
1323 	ret = record__alloc_thread_data(rec, evlist);
1324 	if (ret)
1325 		return ret;
1326 
1327 	if (record__threads_enabled(rec)) {
1328 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329 		if (ret) {
1330 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331 			return ret;
1332 		}
1333 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334 			if (evlist->mmap)
1335 				evlist->mmap[i].file = &rec->data.dir.files[i];
1336 			if (evlist->overwrite_mmap)
1337 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338 		}
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346 	return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
1349 static int record__open(struct record *rec)
1350 {
1351 	char msg[BUFSIZ];
1352 	struct evsel *pos;
1353 	struct evlist *evlist = rec->evlist;
1354 	struct perf_session *session = rec->session;
1355 	struct record_opts *opts = &rec->opts;
1356 	int rc = 0;
1357 
1358 	evlist__for_each_entry(evlist, pos) {
1359 try_again:
1360 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1361 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1362 				if (verbose > 0)
1363 					ui__warning("%s\n", msg);
1364 				goto try_again;
1365 			}
1366 			if ((errno == EINVAL || errno == EBADF) &&
1367 			    pos->core.leader != &pos->core &&
1368 			    pos->weak_group) {
1369 			        pos = evlist__reset_weak_group(evlist, pos, true);
1370 				goto try_again;
1371 			}
1372 			rc = -errno;
1373 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1374 			ui__error("%s\n", msg);
1375 			goto out;
1376 		}
1377 
1378 		pos->supported = true;
1379 	}
1380 
1381 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1382 		pr_warning(
1383 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1384 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1385 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1386 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1387 "Samples in kernel modules won't be resolved at all.\n\n"
1388 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1389 "even with a suitable vmlinux or kallsyms file.\n\n");
1390 	}
1391 
1392 	if (evlist__apply_filters(evlist, &pos)) {
1393 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1394 			pos->filter ?: "BPF", evsel__name(pos), errno,
1395 			str_error_r(errno, msg, sizeof(msg)));
1396 		rc = -1;
1397 		goto out;
1398 	}
1399 
1400 	rc = record__mmap(rec);
1401 	if (rc)
1402 		goto out;
1403 
1404 	session->evlist = evlist;
1405 	perf_session__set_id_hdr_size(session);
1406 out:
1407 	return rc;
1408 }
1409 
1410 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1411 {
1412 	if (rec->evlist->first_sample_time == 0)
1413 		rec->evlist->first_sample_time = sample_time;
1414 
1415 	if (sample_time)
1416 		rec->evlist->last_sample_time = sample_time;
1417 }
1418 
1419 static int process_sample_event(struct perf_tool *tool,
1420 				union perf_event *event,
1421 				struct perf_sample *sample,
1422 				struct evsel *evsel,
1423 				struct machine *machine)
1424 {
1425 	struct record *rec = container_of(tool, struct record, tool);
1426 
1427 	set_timestamp_boundary(rec, sample->time);
1428 
1429 	if (rec->buildid_all)
1430 		return 0;
1431 
1432 	rec->samples++;
1433 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1434 }
1435 
1436 static int process_buildids(struct record *rec)
1437 {
1438 	struct perf_session *session = rec->session;
1439 
1440 	if (perf_data__size(&rec->data) == 0)
1441 		return 0;
1442 
1443 	/*
1444 	 * During this process, it'll load kernel map and replace the
1445 	 * dso->long_name to a real pathname it found.  In this case
1446 	 * we prefer the vmlinux path like
1447 	 *   /lib/modules/3.16.4/build/vmlinux
1448 	 *
1449 	 * rather than build-id path (in debug directory).
1450 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1451 	 */
1452 	symbol_conf.ignore_vmlinux_buildid = true;
1453 
1454 	/*
1455 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1456 	 * so no need to process samples. But if timestamp_boundary is enabled,
1457 	 * it still needs to walk on all samples to get the timestamps of
1458 	 * first/last samples.
1459 	 */
1460 	if (rec->buildid_all && !rec->timestamp_boundary)
1461 		rec->tool.sample = NULL;
1462 
1463 	return perf_session__process_events(session);
1464 }
1465 
1466 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1467 {
1468 	int err;
1469 	struct perf_tool *tool = data;
1470 	/*
1471 	 *As for guest kernel when processing subcommand record&report,
1472 	 *we arrange module mmap prior to guest kernel mmap and trigger
1473 	 *a preload dso because default guest module symbols are loaded
1474 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1475 	 *method is used to avoid symbol missing when the first addr is
1476 	 *in module instead of in guest kernel.
1477 	 */
1478 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1479 					     machine);
1480 	if (err < 0)
1481 		pr_err("Couldn't record guest kernel [%d]'s reference"
1482 		       " relocation symbol.\n", machine->pid);
1483 
1484 	/*
1485 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1486 	 * have no _text sometimes.
1487 	 */
1488 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1489 						 machine);
1490 	if (err < 0)
1491 		pr_err("Couldn't record guest kernel [%d]'s reference"
1492 		       " relocation symbol.\n", machine->pid);
1493 }
1494 
1495 static struct perf_event_header finished_round_event = {
1496 	.size = sizeof(struct perf_event_header),
1497 	.type = PERF_RECORD_FINISHED_ROUND,
1498 };
1499 
1500 static struct perf_event_header finished_init_event = {
1501 	.size = sizeof(struct perf_event_header),
1502 	.type = PERF_RECORD_FINISHED_INIT,
1503 };
1504 
1505 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1506 {
1507 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1508 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1509 			  thread->mask->affinity.nbits)) {
1510 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1511 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1512 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1513 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1514 					(cpu_set_t *)thread->mask->affinity.bits);
1515 		if (verbose == 2) {
1516 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1517 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1518 		}
1519 	}
1520 }
1521 
1522 static size_t process_comp_header(void *record, size_t increment)
1523 {
1524 	struct perf_record_compressed *event = record;
1525 	size_t size = sizeof(*event);
1526 
1527 	if (increment) {
1528 		event->header.size += increment;
1529 		return increment;
1530 	}
1531 
1532 	event->header.type = PERF_RECORD_COMPRESSED;
1533 	event->header.size = size;
1534 
1535 	return size;
1536 }
1537 
1538 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1539 			    void *dst, size_t dst_size, void *src, size_t src_size)
1540 {
1541 	ssize_t compressed;
1542 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1543 	struct zstd_data *zstd_data = &session->zstd_data;
1544 
1545 	if (map && map->file)
1546 		zstd_data = &map->zstd_data;
1547 
1548 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1549 						     max_record_size, process_comp_header);
1550 	if (compressed < 0)
1551 		return compressed;
1552 
1553 	if (map && map->file) {
1554 		thread->bytes_transferred += src_size;
1555 		thread->bytes_compressed  += compressed;
1556 	} else {
1557 		session->bytes_transferred += src_size;
1558 		session->bytes_compressed  += compressed;
1559 	}
1560 
1561 	return compressed;
1562 }
1563 
1564 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1565 				    bool overwrite, bool synch)
1566 {
1567 	u64 bytes_written = rec->bytes_written;
1568 	int i;
1569 	int rc = 0;
1570 	int nr_mmaps;
1571 	struct mmap **maps;
1572 	int trace_fd = rec->data.file.fd;
1573 	off_t off = 0;
1574 
1575 	if (!evlist)
1576 		return 0;
1577 
1578 	nr_mmaps = thread->nr_mmaps;
1579 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1580 
1581 	if (!maps)
1582 		return 0;
1583 
1584 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1585 		return 0;
1586 
1587 	if (record__aio_enabled(rec))
1588 		off = record__aio_get_pos(trace_fd);
1589 
1590 	for (i = 0; i < nr_mmaps; i++) {
1591 		u64 flush = 0;
1592 		struct mmap *map = maps[i];
1593 
1594 		if (map->core.base) {
1595 			record__adjust_affinity(rec, map);
1596 			if (synch) {
1597 				flush = map->core.flush;
1598 				map->core.flush = 1;
1599 			}
1600 			if (!record__aio_enabled(rec)) {
1601 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1602 					if (synch)
1603 						map->core.flush = flush;
1604 					rc = -1;
1605 					goto out;
1606 				}
1607 			} else {
1608 				if (record__aio_push(rec, map, &off) < 0) {
1609 					record__aio_set_pos(trace_fd, off);
1610 					if (synch)
1611 						map->core.flush = flush;
1612 					rc = -1;
1613 					goto out;
1614 				}
1615 			}
1616 			if (synch)
1617 				map->core.flush = flush;
1618 		}
1619 
1620 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1621 		    !rec->opts.auxtrace_sample_mode &&
1622 		    record__auxtrace_mmap_read(rec, map) != 0) {
1623 			rc = -1;
1624 			goto out;
1625 		}
1626 	}
1627 
1628 	if (record__aio_enabled(rec))
1629 		record__aio_set_pos(trace_fd, off);
1630 
1631 	/*
1632 	 * Mark the round finished in case we wrote
1633 	 * at least one event.
1634 	 *
1635 	 * No need for round events in directory mode,
1636 	 * because per-cpu maps and files have data
1637 	 * sorted by kernel.
1638 	 */
1639 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1640 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1641 
1642 	if (overwrite)
1643 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1644 out:
1645 	return rc;
1646 }
1647 
1648 static int record__mmap_read_all(struct record *rec, bool synch)
1649 {
1650 	int err;
1651 
1652 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1653 	if (err)
1654 		return err;
1655 
1656 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1657 }
1658 
1659 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1660 					   void *arg __maybe_unused)
1661 {
1662 	struct perf_mmap *map = fda->priv[fd].ptr;
1663 
1664 	if (map)
1665 		perf_mmap__put(map);
1666 }
1667 
/*
 * Entry point of a reader thread.
 *
 * @arg is stored in the 'thread' variable and serves as this thread's
 * state. The thread writes a THREAD_MSG__READY message to its ack pipe,
 * then alternates between draining its mmaps and polling its descriptors
 * until reading fails, no descriptor is left after filtering out
 * POLLERR/POLLHUP ones, or POLLHUP was seen on the control descriptor at
 * ctlfd_pos. A final read with synch=true is done before a second message
 * is written to the ack pipe.
 *
 * Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	/* Announce start-up on the ack pipe. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %s\n",
			   thread->tid, strerror(errno));

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		unsigned long long hits = thread->samples;

		/*
		 * 'terminate' is checked after the read, so one more read is
		 * done after POLLHUP was seen on the control descriptor.
		 */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Only poll when the read above did not change thread->samples. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/* Stop when the filter reports 0 for the array. */
			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Hang-up on the control descriptor: close it and stop polling it. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final read with synch=true. */
	record__mmap_read_all(thread->rec, true);

	/* 'msg' was never modified, so the same value is sent on termination. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %s\n",
			   thread->tid, strerror(errno));

	return NULL;
}
1729 
1730 static void record__init_features(struct record *rec)
1731 {
1732 	struct perf_session *session = rec->session;
1733 	int feat;
1734 
1735 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1736 		perf_header__set_feat(&session->header, feat);
1737 
1738 	if (rec->no_buildid)
1739 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1740 
1741 #ifdef HAVE_LIBTRACEEVENT
1742 	if (!have_tracepoints(&rec->evlist->core.entries))
1743 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1744 #endif
1745 
1746 	if (!rec->opts.branch_stack)
1747 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1748 
1749 	if (!rec->opts.full_auxtrace)
1750 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1751 
1752 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1753 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1754 
1755 	if (!rec->opts.use_clockid)
1756 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1757 
1758 	if (!record__threads_enabled(rec))
1759 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1760 
1761 	if (!record__comp_enabled(rec))
1762 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1763 
1764 	perf_header__clear_feat(&session->header, HEADER_STAT);
1765 }
1766 
1767 static void
1768 record__finish_output(struct record *rec)
1769 {
1770 	int i;
1771 	struct perf_data *data = &rec->data;
1772 	int fd = perf_data__fd(data);
1773 
1774 	if (data->is_pipe) {
1775 		/* Just to display approx. size */
1776 		data->file.size = rec->bytes_written;
1777 		return;
1778 	}
1779 
1780 	rec->session->header.data_size += rec->bytes_written;
1781 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1782 	if (record__threads_enabled(rec)) {
1783 		for (i = 0; i < data->dir.nr; i++)
1784 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1785 	}
1786 
1787 	if (!rec->no_buildid) {
1788 		process_buildids(rec);
1789 
1790 		if (rec->buildid_all)
1791 			perf_session__dsos_hit_all(rec->session);
1792 	}
1793 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1794 
1795 	return;
1796 }
1797 
1798 static int record__synthesize_workload(struct record *rec, bool tail)
1799 {
1800 	int err;
1801 	struct perf_thread_map *thread_map;
1802 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1803 
1804 	if (rec->opts.tail_synthesize != tail)
1805 		return 0;
1806 
1807 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1808 	if (thread_map == NULL)
1809 		return -1;
1810 
1811 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1812 						 process_synthesized_event,
1813 						 &rec->session->machines.host,
1814 						 needs_mmap,
1815 						 rec->opts.sample_address);
1816 	perf_thread_map__put(thread_map);
1817 	return err;
1818 }
1819 
1820 static int write_finished_init(struct record *rec, bool tail)
1821 {
1822 	if (rec->opts.tail_synthesize != tail)
1823 		return 0;
1824 
1825 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1826 }
1827 
1828 static int record__synthesize(struct record *rec, bool tail);
1829 
/*
 * Finish the current output file and switch to a new, timestamped one.
 *
 * @at_exit: passed on to perf_data__switch(); when set, the byte counters
 *           are not reset and no tracking events are synthesized for a new
 *           file.
 *
 * Returns the value returned by perf_data__switch(), or -EINVAL when the
 * current timestamp cannot be obtained.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	char *new_filename = NULL;
	int fd, err;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	/* The tail=true calls below are no-ops unless opts.tail_synthesize is set. */
	write_finished_init(rec, true);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	/* A new file was started: restart the byte accounting. */
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet) {
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);
	}

	if (rec->switch_output.num_files) {
		/* Advance to the next slot, wrapping around at num_files. */
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		/* Remove the file previously remembered in this slot. */
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		/* The slot now holds new_filename. */
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
1903 
1904 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1905 					struct perf_record_lost_samples *lost,
1906 					int cpu_idx, int thread_idx, u64 lost_count,
1907 					u16 misc_flag)
1908 {
1909 	struct perf_sample_id *sid;
1910 	struct perf_sample sample = {};
1911 	int id_hdr_size;
1912 
1913 	lost->lost = lost_count;
1914 	if (evsel->core.ids) {
1915 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1916 		sample.id = sid->id;
1917 	}
1918 
1919 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1920 						       evsel->core.attr.sample_type, &sample);
1921 	lost->header.size = sizeof(*lost) + id_hdr_size;
1922 	lost->header.misc = misc_flag;
1923 	record__write(rec, NULL, lost, lost->header.size);
1924 }
1925 
1926 static void record__read_lost_samples(struct record *rec)
1927 {
1928 	struct perf_session *session = rec->session;
1929 	struct perf_record_lost_samples *lost = NULL;
1930 	struct evsel *evsel;
1931 
1932 	/* there was an error during record__open */
1933 	if (session->evlist == NULL)
1934 		return;
1935 
1936 	evlist__for_each_entry(session->evlist, evsel) {
1937 		struct xyarray *xy = evsel->core.sample_id;
1938 		u64 lost_count;
1939 
1940 		if (xy == NULL || evsel->core.fd == NULL)
1941 			continue;
1942 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1943 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1944 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1945 			continue;
1946 		}
1947 
1948 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1949 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1950 				struct perf_counts_values count;
1951 
1952 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1953 					pr_debug("read LOST count failed\n");
1954 					goto out;
1955 				}
1956 
1957 				if (count.lost) {
1958 					if (!lost) {
1959 						lost = zalloc(sizeof(*lost) +
1960 							      session->machines.host.id_hdr_size);
1961 						if (!lost) {
1962 							pr_debug("Memory allocation failed\n");
1963 							return;
1964 						}
1965 						lost->header.type = PERF_RECORD_LOST_SAMPLES;
1966 					}
1967 					__record__save_lost_samples(rec, evsel, lost,
1968 								    x, y, count.lost, 0);
1969 				}
1970 			}
1971 		}
1972 
1973 		lost_count = perf_bpf_filter__lost_count(evsel);
1974 		if (lost_count) {
1975 			if (!lost) {
1976 				lost = zalloc(sizeof(*lost) +
1977 					      session->machines.host.id_hdr_size);
1978 				if (!lost) {
1979 					pr_debug("Memory allocation failed\n");
1980 					return;
1981 				}
1982 				lost->header.type = PERF_RECORD_LOST_SAMPLES;
1983 			}
1984 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1985 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1986 		}
1987 	}
1988 out:
1989 	free(lost);
1990 }
1991 
1992 static volatile sig_atomic_t workload_exec_errno;
1993 
1994 /*
1995  * evlist__prepare_workload will send a SIGUSR1
1996  * if the fork fails, since we asked by setting its
1997  * want_signal to true.
1998  */
1999 static void workload_exec_failed_signal(int signo __maybe_unused,
2000 					siginfo_t *info,
2001 					void *ucontext __maybe_unused)
2002 {
2003 	workload_exec_errno = info->si_value.sival_int;
2004 	done = 1;
2005 	child_finished = 1;
2006 }
2007 
2008 static void snapshot_sig_handler(int sig);
2009 static void alarm_sig_handler(int sig);
2010 
2011 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2012 {
2013 	if (evlist) {
2014 		if (evlist->mmap && evlist->mmap[0].core.base)
2015 			return evlist->mmap[0].core.base;
2016 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2017 			return evlist->overwrite_mmap[0].core.base;
2018 	}
2019 	return NULL;
2020 }
2021 
2022 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2023 {
2024 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2025 	if (pc)
2026 		return pc;
2027 	return NULL;
2028 }
2029 
/*
 * Emit the synthesized (non-sample) events for the session: pipe header
 * events, time conversion, id index, auxtrace info, kernel and module
 * mmaps, guest OS events, extra attributes, thread and cpu maps, BPF and
 * cgroup events, and finally the existing tasks.
 *
 * Does nothing (returns 0) unless @tail matches opts.tail_synthesize.
 *
 * Failures of the kernel/module, BPF and cgroup steps are only warned
 * about; failures of the other steps stop the sequence and are returned.
 * The return value is 0 or the error of the failing step; when
 * PERF_SYNTH_TASK is set it is the result of
 * __machine__synthesize_threads().
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int err = 0;
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;

		/* A non-negative result is added to the written byte count. */
		rec->bytes_written += err;
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	err = perf_event__synthesize_id_index(tool,
					      process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	/* Kernel and module maps: failures only produce a one-time warning. */
	if (!evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	/* BPF and cgroup synthesis failures are not fatal. */
	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0) {
		pr_warning("Couldn't synthesize bpf events.\n");
		err = 0;
	}

	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
						     machine);
		if (err < 0) {
			pr_warning("Couldn't synthesize cgroup events.\n");
			err = 0;
		}
	}

	/*
	 * With more than one synthesizing thread, switch to the locked
	 * callback and multithreaded mode for the task synthesis below.
	 */
	if (rec->opts.nr_threads_synthesize > 1) {
		mutex_init(&synth_lock);
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	if (rec->opts.synth & PERF_SYNTH_TASK) {
		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

		err = __machine__synthesize_threads(machine, tool, &opts->target,
						    rec->evlist->core.threads,
						    f, needs_mmap, opts->sample_address,
						    rec->opts.nr_threads_synthesize);
	}

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_singlethreaded();
		mutex_destroy(&synth_lock);
	}

out:
	return err;
}
2151 
2152 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2153 {
2154 	struct record *rec = data;
2155 	pthread_kill(rec->thread_id, SIGUSR2);
2156 	return 0;
2157 }
2158 
2159 static int record__setup_sb_evlist(struct record *rec)
2160 {
2161 	struct record_opts *opts = &rec->opts;
2162 
2163 	if (rec->sb_evlist != NULL) {
2164 		/*
2165 		 * We get here if --switch-output-event populated the
2166 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2167 		 * to the main thread.
2168 		 */
2169 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2170 		rec->thread_id = pthread_self();
2171 	}
2172 #ifdef HAVE_LIBBPF_SUPPORT
2173 	if (!opts->no_bpf_event) {
2174 		if (rec->sb_evlist == NULL) {
2175 			rec->sb_evlist = evlist__new();
2176 
2177 			if (rec->sb_evlist == NULL) {
2178 				pr_err("Couldn't create side band evlist.\n.");
2179 				return -1;
2180 			}
2181 		}
2182 
2183 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2184 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2185 			return -1;
2186 		}
2187 	}
2188 #endif
2189 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2190 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2191 		opts->no_bpf_event = true;
2192 	}
2193 
2194 	return 0;
2195 }
2196 
2197 static int record__init_clock(struct record *rec)
2198 {
2199 	struct perf_session *session = rec->session;
2200 	struct timespec ref_clockid;
2201 	struct timeval ref_tod;
2202 	u64 ref;
2203 
2204 	if (!rec->opts.use_clockid)
2205 		return 0;
2206 
2207 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2208 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2209 
2210 	session->header.env.clock.clockid = rec->opts.clockid;
2211 
2212 	if (gettimeofday(&ref_tod, NULL) != 0) {
2213 		pr_err("gettimeofday failed, cannot set reference time.\n");
2214 		return -1;
2215 	}
2216 
2217 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2218 		pr_err("clock_gettime failed, cannot set reference time.\n");
2219 		return -1;
2220 	}
2221 
2222 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2223 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2224 
2225 	session->header.env.clock.tod_ns = ref;
2226 
2227 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2228 	      (u64) ref_clockid.tv_nsec;
2229 
2230 	session->header.env.clock.clockid_ns = ref;
2231 	return 0;
2232 }
2233 
2234 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2235 {
2236 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2237 		trigger_hit(&auxtrace_snapshot_trigger);
2238 		auxtrace_record__snapshot_started = 1;
2239 		if (auxtrace_record__snapshot_start(rec->itr))
2240 			trigger_error(&auxtrace_snapshot_trigger);
2241 	}
2242 }
2243 
2244 static int record__terminate_thread(struct record_thread *thread_data)
2245 {
2246 	int err;
2247 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2248 	pid_t tid = thread_data->tid;
2249 
2250 	close(thread_data->pipes.msg[1]);
2251 	thread_data->pipes.msg[1] = -1;
2252 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2253 	if (err > 0)
2254 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2255 	else
2256 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2257 			   thread->tid, tid);
2258 
2259 	return 0;
2260 }
2261 
2262 static int record__start_threads(struct record *rec)
2263 {
2264 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2265 	struct record_thread *thread_data = rec->thread_data;
2266 	sigset_t full, mask;
2267 	pthread_t handle;
2268 	pthread_attr_t attrs;
2269 
2270 	thread = &thread_data[0];
2271 
2272 	if (!record__threads_enabled(rec))
2273 		return 0;
2274 
2275 	sigfillset(&full);
2276 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2277 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2278 		return -1;
2279 	}
2280 
2281 	pthread_attr_init(&attrs);
2282 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2283 
2284 	for (t = 1; t < nr_threads; t++) {
2285 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2286 
2287 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2288 		pthread_attr_setaffinity_np(&attrs,
2289 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2290 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2291 #endif
2292 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2293 			for (tt = 1; tt < t; tt++)
2294 				record__terminate_thread(&thread_data[t]);
2295 			pr_err("Failed to start threads: %s\n", strerror(errno));
2296 			ret = -1;
2297 			goto out_err;
2298 		}
2299 
2300 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2301 		if (err > 0)
2302 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2303 				  thread_msg_tags[msg]);
2304 		else
2305 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2306 				   thread->tid, rec->thread_data[t].tid);
2307 	}
2308 
2309 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2310 			(cpu_set_t *)thread->mask->affinity.bits);
2311 
2312 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2313 
2314 out_err:
2315 	pthread_attr_destroy(&attrs);
2316 
2317 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2318 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2319 		ret = -1;
2320 	}
2321 
2322 	return ret;
2323 }
2324 
2325 static int record__stop_threads(struct record *rec)
2326 {
2327 	int t;
2328 	struct record_thread *thread_data = rec->thread_data;
2329 
2330 	for (t = 1; t < rec->nr_threads; t++)
2331 		record__terminate_thread(&thread_data[t]);
2332 
2333 	for (t = 0; t < rec->nr_threads; t++) {
2334 		rec->samples += thread_data[t].samples;
2335 		if (!record__threads_enabled(rec))
2336 			continue;
2337 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2338 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2339 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2340 			 thread_data[t].samples, thread_data[t].waking);
2341 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2342 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2343 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2344 		else
2345 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2346 	}
2347 
2348 	return 0;
2349 }
2350 
2351 static unsigned long record__waking(struct record *rec)
2352 {
2353 	int t;
2354 	unsigned long waking = 0;
2355 	struct record_thread *thread_data = rec->thread_data;
2356 
2357 	for (t = 0; t < rec->nr_threads; t++)
2358 		waking += thread_data[t].waking;
2359 
2360 	return waking;
2361 }
2362 
2363 static int __cmd_record(struct record *rec, int argc, const char **argv)
2364 {
2365 	int err;
2366 	int status = 0;
2367 	const bool forks = argc > 0;
2368 	struct perf_tool *tool = &rec->tool;
2369 	struct record_opts *opts = &rec->opts;
2370 	struct perf_data *data = &rec->data;
2371 	struct perf_session *session;
2372 	bool disabled = false, draining = false;
2373 	int fd;
2374 	float ratio = 0;
2375 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2376 
2377 	atexit(record__sig_exit);
2378 	signal(SIGCHLD, sig_handler);
2379 	signal(SIGINT, sig_handler);
2380 	signal(SIGTERM, sig_handler);
2381 	signal(SIGSEGV, sigsegv_handler);
2382 
2383 	if (rec->opts.record_namespaces)
2384 		tool->namespace_events = true;
2385 
2386 	if (rec->opts.record_cgroup) {
2387 #ifdef HAVE_FILE_HANDLE
2388 		tool->cgroup_events = true;
2389 #else
2390 		pr_err("cgroup tracking is not supported\n");
2391 		return -1;
2392 #endif
2393 	}
2394 
2395 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2396 		signal(SIGUSR2, snapshot_sig_handler);
2397 		if (rec->opts.auxtrace_snapshot_mode)
2398 			trigger_on(&auxtrace_snapshot_trigger);
2399 		if (rec->switch_output.enabled)
2400 			trigger_on(&switch_output_trigger);
2401 	} else {
2402 		signal(SIGUSR2, SIG_IGN);
2403 	}
2404 
2405 	session = perf_session__new(data, tool);
2406 	if (IS_ERR(session)) {
2407 		pr_err("Perf session creation failed.\n");
2408 		return PTR_ERR(session);
2409 	}
2410 
2411 	if (record__threads_enabled(rec)) {
2412 		if (perf_data__is_pipe(&rec->data)) {
2413 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2414 			return -1;
2415 		}
2416 		if (rec->opts.full_auxtrace) {
2417 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2418 			return -1;
2419 		}
2420 	}
2421 
2422 	fd = perf_data__fd(data);
2423 	rec->session = session;
2424 
2425 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2426 		pr_err("Compression initialization failed.\n");
2427 		return -1;
2428 	}
2429 #ifdef HAVE_EVENTFD_SUPPORT
2430 	done_fd = eventfd(0, EFD_NONBLOCK);
2431 	if (done_fd < 0) {
2432 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2433 		status = -1;
2434 		goto out_delete_session;
2435 	}
2436 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2437 	if (err < 0) {
2438 		pr_err("Failed to add wakeup eventfd to poll list\n");
2439 		status = err;
2440 		goto out_delete_session;
2441 	}
2442 #endif // HAVE_EVENTFD_SUPPORT
2443 
2444 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2445 	session->header.env.comp_level = rec->opts.comp_level;
2446 
2447 	if (rec->opts.kcore &&
2448 	    !record__kcore_readable(&session->machines.host)) {
2449 		pr_err("ERROR: kcore is not readable.\n");
2450 		return -1;
2451 	}
2452 
2453 	if (record__init_clock(rec))
2454 		return -1;
2455 
2456 	record__init_features(rec);
2457 
2458 	if (forks) {
2459 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2460 					       workload_exec_failed_signal);
2461 		if (err < 0) {
2462 			pr_err("Couldn't run the workload!\n");
2463 			status = err;
2464 			goto out_delete_session;
2465 		}
2466 	}
2467 
2468 	/*
2469 	 * If we have just single event and are sending data
2470 	 * through pipe, we need to force the ids allocation,
2471 	 * because we synthesize event name through the pipe
2472 	 * and need the id for that.
2473 	 */
2474 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2475 		rec->opts.sample_id = true;
2476 
2477 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2478 		rec->timestamp_filename = false;
2479 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2480 	}
2481 
2482 	evlist__uniquify_name(rec->evlist);
2483 
2484 	evlist__config(rec->evlist, opts, &callchain_param);
2485 
2486 	/* Debug message used by test scripts */
2487 	pr_debug3("perf record opening and mmapping events\n");
2488 	if (record__open(rec) != 0) {
2489 		err = -1;
2490 		goto out_free_threads;
2491 	}
2492 	/* Debug message used by test scripts */
2493 	pr_debug3("perf record done opening and mmapping events\n");
2494 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2495 
2496 	if (rec->opts.kcore) {
2497 		err = record__kcore_copy(&session->machines.host, data);
2498 		if (err) {
2499 			pr_err("ERROR: Failed to copy kcore\n");
2500 			goto out_free_threads;
2501 		}
2502 	}
2503 
2504 	/*
2505 	 * Normally perf_session__new would do this, but it doesn't have the
2506 	 * evlist.
2507 	 */
2508 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2509 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2510 		rec->tool.ordered_events = false;
2511 	}
2512 
2513 	if (evlist__nr_groups(rec->evlist) == 0)
2514 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2515 
2516 	if (data->is_pipe) {
2517 		err = perf_header__write_pipe(fd);
2518 		if (err < 0)
2519 			goto out_free_threads;
2520 	} else {
2521 		err = perf_session__write_header(session, rec->evlist, fd, false);
2522 		if (err < 0)
2523 			goto out_free_threads;
2524 	}
2525 
2526 	err = -1;
2527 	if (!rec->no_buildid
2528 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2529 		pr_err("Couldn't generate buildids. "
2530 		       "Use --no-buildid to profile anyway.\n");
2531 		goto out_free_threads;
2532 	}
2533 
2534 	err = record__setup_sb_evlist(rec);
2535 	if (err)
2536 		goto out_free_threads;
2537 
2538 	err = record__synthesize(rec, false);
2539 	if (err < 0)
2540 		goto out_free_threads;
2541 
2542 	if (rec->realtime_prio) {
2543 		struct sched_param param;
2544 
2545 		param.sched_priority = rec->realtime_prio;
2546 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2547 			pr_err("Could not set realtime priority.\n");
2548 			err = -1;
2549 			goto out_free_threads;
2550 		}
2551 	}
2552 
2553 	if (record__start_threads(rec))
2554 		goto out_free_threads;
2555 
2556 	/*
2557 	 * When perf is starting the traced process, all the events
2558 	 * (apart from group members) have enable_on_exec=1 set,
2559 	 * so don't spoil it by prematurely enabling them.
2560 	 */
2561 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2562 		evlist__enable(rec->evlist);
2563 
2564 	/*
2565 	 * Let the child rip
2566 	 */
2567 	if (forks) {
2568 		struct machine *machine = &session->machines.host;
2569 		union perf_event *event;
2570 		pid_t tgid;
2571 
2572 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2573 		if (event == NULL) {
2574 			err = -ENOMEM;
2575 			goto out_child;
2576 		}
2577 
2578 		/*
2579 		 * Some H/W events are generated before COMM event
2580 		 * which is emitted during exec(), so perf script
2581 		 * cannot see a correct process name for those events.
2582 		 * Synthesize COMM event to prevent it.
2583 		 */
2584 		tgid = perf_event__synthesize_comm(tool, event,
2585 						   rec->evlist->workload.pid,
2586 						   process_synthesized_event,
2587 						   machine);
2588 		free(event);
2589 
2590 		if (tgid == -1)
2591 			goto out_child;
2592 
2593 		event = malloc(sizeof(event->namespaces) +
2594 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2595 			       machine->id_hdr_size);
2596 		if (event == NULL) {
2597 			err = -ENOMEM;
2598 			goto out_child;
2599 		}
2600 
2601 		/*
2602 		 * Synthesize NAMESPACES event for the command specified.
2603 		 */
2604 		perf_event__synthesize_namespaces(tool, event,
2605 						  rec->evlist->workload.pid,
2606 						  tgid, process_synthesized_event,
2607 						  machine);
2608 		free(event);
2609 
2610 		evlist__start_workload(rec->evlist);
2611 	}
2612 
2613 	if (opts->target.initial_delay) {
2614 		pr_info(EVLIST_DISABLED_MSG);
2615 		if (opts->target.initial_delay > 0) {
2616 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2617 			evlist__enable(rec->evlist);
2618 			pr_info(EVLIST_ENABLED_MSG);
2619 		}
2620 	}
2621 
2622 	err = event_enable_timer__start(rec->evlist->eet);
2623 	if (err)
2624 		goto out_child;
2625 
2626 	/* Debug message used by test scripts */
2627 	pr_debug3("perf record has started\n");
2628 	fflush(stderr);
2629 
2630 	trigger_ready(&auxtrace_snapshot_trigger);
2631 	trigger_ready(&switch_output_trigger);
2632 	perf_hooks__invoke_record_start();
2633 
2634 	/*
2635 	 * Must write FINISHED_INIT so it will be seen after all other
2636 	 * synthesized user events, but before any regular events.
2637 	 */
2638 	err = write_finished_init(rec, false);
2639 	if (err < 0)
2640 		goto out_child;
2641 
2642 	for (;;) {
2643 		unsigned long long hits = thread->samples;
2644 
2645 		/*
2646 		 * rec->evlist->bkw_mmap_state is possible to be
2647 		 * BKW_MMAP_EMPTY here: when done == true and
2648 		 * hits != rec->samples in previous round.
2649 		 *
2650 		 * evlist__toggle_bkw_mmap ensure we never
2651 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2652 		 */
2653 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2654 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2655 
2656 		if (record__mmap_read_all(rec, false) < 0) {
2657 			trigger_error(&auxtrace_snapshot_trigger);
2658 			trigger_error(&switch_output_trigger);
2659 			err = -1;
2660 			goto out_child;
2661 		}
2662 
2663 		if (auxtrace_record__snapshot_started) {
2664 			auxtrace_record__snapshot_started = 0;
2665 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2666 				record__read_auxtrace_snapshot(rec, false);
2667 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2668 				pr_err("AUX area tracing snapshot failed\n");
2669 				err = -1;
2670 				goto out_child;
2671 			}
2672 		}
2673 
2674 		if (trigger_is_hit(&switch_output_trigger)) {
2675 			/*
2676 			 * If switch_output_trigger is hit, the data in
2677 			 * overwritable ring buffer should have been collected,
2678 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2679 			 *
2680 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2681 			 * record__mmap_read_all() didn't collect data from
2682 			 * overwritable ring buffer. Read again.
2683 			 */
2684 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2685 				continue;
2686 			trigger_ready(&switch_output_trigger);
2687 
2688 			/*
2689 			 * Reenable events in overwrite ring buffer after
2690 			 * record__mmap_read_all(): we should have collected
2691 			 * data from it.
2692 			 */
2693 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2694 
2695 			if (!quiet)
2696 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2697 					record__waking(rec));
2698 			thread->waking = 0;
2699 			fd = record__switch_output(rec, false);
2700 			if (fd < 0) {
2701 				pr_err("Failed to switch to new file\n");
2702 				trigger_error(&switch_output_trigger);
2703 				err = fd;
2704 				goto out_child;
2705 			}
2706 
2707 			/* re-arm the alarm */
2708 			if (rec->switch_output.time)
2709 				alarm(rec->switch_output.time);
2710 		}
2711 
2712 		if (hits == thread->samples) {
2713 			if (done || draining)
2714 				break;
2715 			err = fdarray__poll(&thread->pollfd, -1);
2716 			/*
2717 			 * Propagate error, only if there's any. Ignore positive
2718 			 * number of returned events and interrupt error.
2719 			 */
2720 			if (err > 0 || (err < 0 && errno == EINTR))
2721 				err = 0;
2722 			thread->waking++;
2723 
2724 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2725 					    record__thread_munmap_filtered, NULL) == 0)
2726 				draining = true;
2727 
2728 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2729 			if (err)
2730 				goto out_child;
2731 		}
2732 
2733 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2734 			switch (cmd) {
2735 			case EVLIST_CTL_CMD_SNAPSHOT:
2736 				hit_auxtrace_snapshot_trigger(rec);
2737 				evlist__ctlfd_ack(rec->evlist);
2738 				break;
2739 			case EVLIST_CTL_CMD_STOP:
2740 				done = 1;
2741 				break;
2742 			case EVLIST_CTL_CMD_ACK:
2743 			case EVLIST_CTL_CMD_UNSUPPORTED:
2744 			case EVLIST_CTL_CMD_ENABLE:
2745 			case EVLIST_CTL_CMD_DISABLE:
2746 			case EVLIST_CTL_CMD_EVLIST:
2747 			case EVLIST_CTL_CMD_PING:
2748 			default:
2749 				break;
2750 			}
2751 		}
2752 
2753 		err = event_enable_timer__process(rec->evlist->eet);
2754 		if (err < 0)
2755 			goto out_child;
2756 		if (err) {
2757 			err = 0;
2758 			done = 1;
2759 		}
2760 
2761 		/*
2762 		 * When perf is starting the traced process, at the end events
2763 		 * die with the process and we wait for that. Thus no need to
2764 		 * disable events in this case.
2765 		 */
2766 		if (done && !disabled && !target__none(&opts->target)) {
2767 			trigger_off(&auxtrace_snapshot_trigger);
2768 			evlist__disable(rec->evlist);
2769 			disabled = true;
2770 		}
2771 	}
2772 
2773 	trigger_off(&auxtrace_snapshot_trigger);
2774 	trigger_off(&switch_output_trigger);
2775 
2776 	if (opts->auxtrace_snapshot_on_exit)
2777 		record__auxtrace_snapshot_exit(rec);
2778 
2779 	if (forks && workload_exec_errno) {
2780 		char msg[STRERR_BUFSIZE], strevsels[2048];
2781 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2782 
2783 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2784 
2785 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2786 			strevsels, argv[0], emsg);
2787 		err = -1;
2788 		goto out_child;
2789 	}
2790 
2791 	if (!quiet)
2792 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2793 			record__waking(rec));
2794 
2795 	write_finished_init(rec, true);
2796 
2797 	if (target__none(&rec->opts.target))
2798 		record__synthesize_workload(rec, true);
2799 
2800 out_child:
2801 	record__stop_threads(rec);
2802 	record__mmap_read_all(rec, true);
2803 out_free_threads:
2804 	record__free_thread_data(rec);
2805 	evlist__finalize_ctlfd(rec->evlist);
2806 	record__aio_mmap_read_sync(rec);
2807 
2808 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2809 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2810 		session->header.env.comp_ratio = ratio + 0.5;
2811 	}
2812 
2813 	if (forks) {
2814 		int exit_status;
2815 
2816 		if (!child_finished)
2817 			kill(rec->evlist->workload.pid, SIGTERM);
2818 
2819 		wait(&exit_status);
2820 
2821 		if (err < 0)
2822 			status = err;
2823 		else if (WIFEXITED(exit_status))
2824 			status = WEXITSTATUS(exit_status);
2825 		else if (WIFSIGNALED(exit_status))
2826 			signr = WTERMSIG(exit_status);
2827 	} else
2828 		status = err;
2829 
2830 	if (rec->off_cpu)
2831 		rec->bytes_written += off_cpu_write(rec->session);
2832 
2833 	record__read_lost_samples(rec);
2834 	record__synthesize(rec, true);
2835 	/* this will be recalculated during process_buildids() */
2836 	rec->samples = 0;
2837 
2838 	if (!err) {
2839 		if (!rec->timestamp_filename) {
2840 			record__finish_output(rec);
2841 		} else {
2842 			fd = record__switch_output(rec, true);
2843 			if (fd < 0) {
2844 				status = fd;
2845 				goto out_delete_session;
2846 			}
2847 		}
2848 	}
2849 
2850 	perf_hooks__invoke_record_end();
2851 
2852 	if (!err && !quiet) {
2853 		char samples[128];
2854 		const char *postfix = rec->timestamp_filename ?
2855 					".<timestamp>" : "";
2856 
2857 		if (rec->samples && !rec->opts.full_auxtrace)
2858 			scnprintf(samples, sizeof(samples),
2859 				  " (%" PRIu64 " samples)", rec->samples);
2860 		else
2861 			samples[0] = '\0';
2862 
2863 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2864 			perf_data__size(data) / 1024.0 / 1024.0,
2865 			data->path, postfix, samples);
2866 		if (ratio) {
2867 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2868 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2869 					ratio);
2870 		}
2871 		fprintf(stderr, " ]\n");
2872 	}
2873 
2874 out_delete_session:
2875 #ifdef HAVE_EVENTFD_SUPPORT
2876 	if (done_fd >= 0) {
2877 		fd = done_fd;
2878 		done_fd = -1;
2879 
2880 		close(fd);
2881 	}
2882 #endif
2883 	zstd_fini(&session->zstd_data);
2884 	if (!opts->no_bpf_event)
2885 		evlist__stop_sb_thread(rec->sb_evlist);
2886 
2887 	perf_session__delete(session);
2888 	return status;
2889 }
2890 
2891 static void callchain_debug(struct callchain_param *callchain)
2892 {
2893 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2894 
2895 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2896 
2897 	if (callchain->record_mode == CALLCHAIN_DWARF)
2898 		pr_debug("callchain: stack dump size %d\n",
2899 			 callchain->dump_size);
2900 }
2901 
2902 int record_opts__parse_callchain(struct record_opts *record,
2903 				 struct callchain_param *callchain,
2904 				 const char *arg, bool unset)
2905 {
2906 	int ret;
2907 	callchain->enabled = !unset;
2908 
2909 	/* --no-call-graph */
2910 	if (unset) {
2911 		callchain->record_mode = CALLCHAIN_NONE;
2912 		pr_debug("callchain: disabled\n");
2913 		return 0;
2914 	}
2915 
2916 	ret = parse_callchain_record_opt(arg, callchain);
2917 	if (!ret) {
2918 		/* Enable data address sampling for DWARF unwind. */
2919 		if (callchain->record_mode == CALLCHAIN_DWARF)
2920 			record->sample_address = true;
2921 		callchain_debug(callchain);
2922 	}
2923 
2924 	return ret;
2925 }
2926 
2927 int record_parse_callchain_opt(const struct option *opt,
2928 			       const char *arg,
2929 			       int unset)
2930 {
2931 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2932 }
2933 
2934 int record_callchain_opt(const struct option *opt,
2935 			 const char *arg __maybe_unused,
2936 			 int unset __maybe_unused)
2937 {
2938 	struct callchain_param *callchain = opt->value;
2939 
2940 	callchain->enabled = true;
2941 
2942 	if (callchain->record_mode == CALLCHAIN_NONE)
2943 		callchain->record_mode = CALLCHAIN_FP;
2944 
2945 	callchain_debug(callchain);
2946 	return 0;
2947 }
2948 
2949 static int perf_record_config(const char *var, const char *value, void *cb)
2950 {
2951 	struct record *rec = cb;
2952 
2953 	if (!strcmp(var, "record.build-id")) {
2954 		if (!strcmp(value, "cache"))
2955 			rec->no_buildid_cache = false;
2956 		else if (!strcmp(value, "no-cache"))
2957 			rec->no_buildid_cache = true;
2958 		else if (!strcmp(value, "skip"))
2959 			rec->no_buildid = true;
2960 		else if (!strcmp(value, "mmap"))
2961 			rec->buildid_mmap = true;
2962 		else
2963 			return -1;
2964 		return 0;
2965 	}
2966 	if (!strcmp(var, "record.call-graph")) {
2967 		var = "call-graph.record-mode";
2968 		return perf_default_config(var, value, cb);
2969 	}
2970 #ifdef HAVE_AIO_SUPPORT
2971 	if (!strcmp(var, "record.aio")) {
2972 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2973 		if (!rec->opts.nr_cblocks)
2974 			rec->opts.nr_cblocks = nr_cblocks_default;
2975 	}
2976 #endif
2977 	if (!strcmp(var, "record.debuginfod")) {
2978 		rec->debuginfod.urls = strdup(value);
2979 		if (!rec->debuginfod.urls)
2980 			return -ENOMEM;
2981 		rec->debuginfod.set = true;
2982 	}
2983 
2984 	return 0;
2985 }
2986 
2987 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2988 {
2989 	struct record *rec = (struct record *)opt->value;
2990 
2991 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2992 }
2993 
2994 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2995 {
2996 	struct record_opts *opts = (struct record_opts *)opt->value;
2997 
2998 	if (unset || !str)
2999 		return 0;
3000 
3001 	if (!strcasecmp(str, "node"))
3002 		opts->affinity = PERF_AFFINITY_NODE;
3003 	else if (!strcasecmp(str, "cpu"))
3004 		opts->affinity = PERF_AFFINITY_CPU;
3005 
3006 	return 0;
3007 }
3008 
3009 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3010 {
3011 	mask->nbits = nr_bits;
3012 	mask->bits = bitmap_zalloc(mask->nbits);
3013 	if (!mask->bits)
3014 		return -ENOMEM;
3015 
3016 	return 0;
3017 }
3018 
3019 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3020 {
3021 	bitmap_free(mask->bits);
3022 	mask->nbits = 0;
3023 }
3024 
3025 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3026 {
3027 	int ret;
3028 
3029 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3030 	if (ret) {
3031 		mask->affinity.bits = NULL;
3032 		return ret;
3033 	}
3034 
3035 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3036 	if (ret) {
3037 		record__mmap_cpu_mask_free(&mask->maps);
3038 		mask->maps.bits = NULL;
3039 	}
3040 
3041 	return ret;
3042 }
3043 
3044 static void record__thread_mask_free(struct thread_mask *mask)
3045 {
3046 	record__mmap_cpu_mask_free(&mask->maps);
3047 	record__mmap_cpu_mask_free(&mask->affinity);
3048 }
3049 
3050 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3051 {
3052 	int s;
3053 	struct record_opts *opts = opt->value;
3054 
3055 	if (unset || !str || !strlen(str)) {
3056 		opts->threads_spec = THREAD_SPEC__CPU;
3057 	} else {
3058 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3059 			if (s == THREAD_SPEC__USER) {
3060 				opts->threads_user_spec = strdup(str);
3061 				if (!opts->threads_user_spec)
3062 					return -ENOMEM;
3063 				opts->threads_spec = THREAD_SPEC__USER;
3064 				break;
3065 			}
3066 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3067 				opts->threads_spec = s;
3068 				break;
3069 			}
3070 		}
3071 	}
3072 
3073 	if (opts->threads_spec == THREAD_SPEC__USER)
3074 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3075 	else
3076 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3077 
3078 	return 0;
3079 }
3080 
3081 static int parse_output_max_size(const struct option *opt,
3082 				 const char *str, int unset)
3083 {
3084 	unsigned long *s = (unsigned long *)opt->value;
3085 	static struct parse_tag tags_size[] = {
3086 		{ .tag  = 'B', .mult = 1       },
3087 		{ .tag  = 'K', .mult = 1 << 10 },
3088 		{ .tag  = 'M', .mult = 1 << 20 },
3089 		{ .tag  = 'G', .mult = 1 << 30 },
3090 		{ .tag  = 0 },
3091 	};
3092 	unsigned long val;
3093 
3094 	if (unset) {
3095 		*s = 0;
3096 		return 0;
3097 	}
3098 
3099 	val = parse_tag_value(str, tags_size);
3100 	if (val != (unsigned long) -1) {
3101 		*s = val;
3102 		return 0;
3103 	}
3104 
3105 	return -1;
3106 }
3107 
3108 static int record__parse_mmap_pages(const struct option *opt,
3109 				    const char *str,
3110 				    int unset __maybe_unused)
3111 {
3112 	struct record_opts *opts = opt->value;
3113 	char *s, *p;
3114 	unsigned int mmap_pages;
3115 	int ret;
3116 
3117 	if (!str)
3118 		return -EINVAL;
3119 
3120 	s = strdup(str);
3121 	if (!s)
3122 		return -ENOMEM;
3123 
3124 	p = strchr(s, ',');
3125 	if (p)
3126 		*p = '\0';
3127 
3128 	if (*s) {
3129 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3130 		if (ret)
3131 			goto out_free;
3132 		opts->mmap_pages = mmap_pages;
3133 	}
3134 
3135 	if (!p) {
3136 		ret = 0;
3137 		goto out_free;
3138 	}
3139 
3140 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3141 	if (ret)
3142 		goto out_free;
3143 
3144 	opts->auxtrace_mmap_pages = mmap_pages;
3145 
3146 out_free:
3147 	free(s);
3148 	return ret;
3149 }
3150 
3151 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3152 {
3153 }
3154 
3155 static int parse_control_option(const struct option *opt,
3156 				const char *str,
3157 				int unset __maybe_unused)
3158 {
3159 	struct record_opts *opts = opt->value;
3160 
3161 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3162 }
3163 
3164 static void switch_output_size_warn(struct record *rec)
3165 {
3166 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3167 	struct switch_output *s = &rec->switch_output;
3168 
3169 	wakeup_size /= 2;
3170 
3171 	if (s->size < wakeup_size) {
3172 		char buf[100];
3173 
3174 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3175 		pr_warning("WARNING: switch-output data size lower than "
3176 			   "wakeup kernel buffer size (%s) "
3177 			   "expect bigger perf.data sizes\n", buf);
3178 	}
3179 }
3180 
3181 static int switch_output_setup(struct record *rec)
3182 {
3183 	struct switch_output *s = &rec->switch_output;
3184 	static struct parse_tag tags_size[] = {
3185 		{ .tag  = 'B', .mult = 1       },
3186 		{ .tag  = 'K', .mult = 1 << 10 },
3187 		{ .tag  = 'M', .mult = 1 << 20 },
3188 		{ .tag  = 'G', .mult = 1 << 30 },
3189 		{ .tag  = 0 },
3190 	};
3191 	static struct parse_tag tags_time[] = {
3192 		{ .tag  = 's', .mult = 1        },
3193 		{ .tag  = 'm', .mult = 60       },
3194 		{ .tag  = 'h', .mult = 60*60    },
3195 		{ .tag  = 'd', .mult = 60*60*24 },
3196 		{ .tag  = 0 },
3197 	};
3198 	unsigned long val;
3199 
3200 	/*
3201 	 * If we're using --switch-output-events, then we imply its
3202 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3203 	 *  thread to its parent.
3204 	 */
3205 	if (rec->switch_output_event_set) {
3206 		if (record__threads_enabled(rec)) {
3207 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3208 			return 0;
3209 		}
3210 		goto do_signal;
3211 	}
3212 
3213 	if (!s->set)
3214 		return 0;
3215 
3216 	if (record__threads_enabled(rec)) {
3217 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3218 		return 0;
3219 	}
3220 
3221 	if (!strcmp(s->str, "signal")) {
3222 do_signal:
3223 		s->signal = true;
3224 		pr_debug("switch-output with SIGUSR2 signal\n");
3225 		goto enabled;
3226 	}
3227 
3228 	val = parse_tag_value(s->str, tags_size);
3229 	if (val != (unsigned long) -1) {
3230 		s->size = val;
3231 		pr_debug("switch-output with %s size threshold\n", s->str);
3232 		goto enabled;
3233 	}
3234 
3235 	val = parse_tag_value(s->str, tags_time);
3236 	if (val != (unsigned long) -1) {
3237 		s->time = val;
3238 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3239 			 s->str, s->time);
3240 		goto enabled;
3241 	}
3242 
3243 	return -1;
3244 
3245 enabled:
3246 	rec->timestamp_filename = true;
3247 	s->enabled              = true;
3248 
3249 	if (s->size && !rec->opts.no_buffering)
3250 		switch_output_size_warn(rec);
3251 
3252 	return 0;
3253 }
3254 
/* Usage lines of 'perf record'; the array is terminated by a NULL entry. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/* Externally visible pointer to the usage lines above. */
const char * const *record_usage = &__record_usage[0];
3261 
3262 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3263 				  struct perf_sample *sample, struct machine *machine)
3264 {
3265 	/*
3266 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3267 	 * no need to add them twice.
3268 	 */
3269 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3270 		return 0;
3271 	return perf_event__process_mmap(tool, event, sample, machine);
3272 }
3273 
3274 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3275 				   struct perf_sample *sample, struct machine *machine)
3276 {
3277 	/*
3278 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3279 	 * no need to add them twice.
3280 	 */
3281 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3282 		return 0;
3283 
3284 	return perf_event__process_mmap2(tool, event, sample, machine);
3285 }
3286 
3287 static int process_timestamp_boundary(struct perf_tool *tool,
3288 				      union perf_event *event __maybe_unused,
3289 				      struct perf_sample *sample,
3290 				      struct machine *machine __maybe_unused)
3291 {
3292 	struct record *rec = container_of(tool, struct record, tool);
3293 
3294 	set_timestamp_boundary(rec, sample->time);
3295 	return 0;
3296 }
3297 
3298 static int parse_record_synth_option(const struct option *opt,
3299 				     const char *str,
3300 				     int unset __maybe_unused)
3301 {
3302 	struct record_opts *opts = opt->value;
3303 	char *p = strdup(str);
3304 
3305 	if (p == NULL)
3306 		return -1;
3307 
3308 	opts->synth = parse_synth_opt(p);
3309 	free(p);
3310 
3311 	if (opts->synth < 0) {
3312 		pr_err("Invalid synth option: %s\n", str);
3313 		return -1;
3314 	}
3315 	return 0;
3316 }
3317 
3318 /*
3319  * XXX Ideally would be local to cmd_record() and passed to a record__new
3320  * because we need to have access to it in record__exit, that is called
3321  * after cmd_record() exits, but since record_options need to be accessible to
3322  * builtin-script, leave it here.
3323  *
 * At least we don't touch it in all the other functions here directly.
3325  *
3326  * Just say no to tons of global variables, sigh.
3327  */
3328 static struct record record = {
3329 	.opts = {
3330 		.sample_time	     = true,
3331 		.mmap_pages	     = UINT_MAX,
3332 		.user_freq	     = UINT_MAX,
3333 		.user_interval	     = ULLONG_MAX,
3334 		.freq		     = 4000,
3335 		.target		     = {
3336 			.uses_mmap   = true,
3337 			.default_per_cpu = true,
3338 		},
3339 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3340 		.nr_threads_synthesize = 1,
3341 		.ctl_fd              = -1,
3342 		.ctl_fd_ack          = -1,
3343 		.synth               = PERF_SYNTH_ALL,
3344 	},
3345 	.tool = {
3346 		.sample		= process_sample_event,
3347 		.fork		= perf_event__process_fork,
3348 		.exit		= perf_event__process_exit,
3349 		.comm		= perf_event__process_comm,
3350 		.namespaces	= perf_event__process_namespaces,
3351 		.mmap		= build_id__process_mmap,
3352 		.mmap2		= build_id__process_mmap2,
3353 		.itrace_start	= process_timestamp_boundary,
3354 		.aux		= process_timestamp_boundary,
3355 		.ordered_events	= true,
3356 	},
3357 };
3358 
3359 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3360 	"\n\t\t\t\tDefault: fp";
3361 
3362 static bool dry_run;
3363 
3364 static struct parse_events_option_args parse_events_option_args = {
3365 	.evlistp = &record.evlist,
3366 };
3367 
3368 static struct parse_events_option_args switch_output_parse_events_option_args = {
3369 	.evlistp = &record.sb_evlist,
3370 };
3371 
3372 /*
3373  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3374  * with it and switch to use the library functions in perf_evlist that came
3375  * from builtin-record.c, i.e. use record_opts,
3376  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3377  * using pipes, etc.
3378  */
3379 static struct option __record_options[] = {
3380 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3381 		     "event selector. use 'perf list' to list available events",
3382 		     parse_events_option),
3383 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3384 		     "event filter", parse_filter),
3385 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3386 			   NULL, "don't record events from perf itself",
3387 			   exclude_perf),
3388 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3389 		    "record events on existing process id"),
3390 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3391 		    "record events on existing thread id"),
3392 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3393 		    "collect data with this RT SCHED_FIFO priority"),
3394 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3395 		    "collect data without buffering"),
3396 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3397 		    "collect raw sample records from all opened counters"),
3398 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3399 			    "system-wide collection from all CPUs"),
3400 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3401 		    "list of cpus to monitor"),
3402 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3403 	OPT_STRING('o', "output", &record.data.path, "file",
3404 		    "output file name"),
3405 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3406 			&record.opts.no_inherit_set,
3407 			"child tasks do not inherit counters"),
3408 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3409 		    "synthesize non-sample events at the end of output"),
3410 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3411 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3412 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3413 		    "Fail if the specified frequency can't be used"),
3414 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3415 		     "profile at this frequency",
3416 		      record__parse_freq),
3417 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3418 		     "number of mmap data pages and AUX area tracing mmap pages",
3419 		     record__parse_mmap_pages),
3420 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3421 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3422 		     record__mmap_flush_parse),
3423 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3424 			   NULL, "enables call-graph recording" ,
3425 			   &record_callchain_opt),
3426 	OPT_CALLBACK(0, "call-graph", &record.opts,
3427 		     "record_mode[,record_size]", record_callchain_help,
3428 		     &record_parse_callchain_opt),
3429 	OPT_INCR('v', "verbose", &verbose,
3430 		    "be more verbose (show counter open errors, etc)"),
3431 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3432 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3433 		    "per thread counts"),
3434 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3435 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3436 		    "Record the sample physical addresses"),
3437 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3438 		    "Record the sampled data address data page size"),
3439 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3440 		    "Record the sampled code address (ip) page size"),
3441 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3442 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3443 		    "Record the sample identifier"),
3444 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3445 			&record.opts.sample_time_set,
3446 			"Record the sample timestamps"),
3447 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3448 			"Record the sample period"),
3449 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3450 		    "don't sample"),
3451 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3452 			&record.no_buildid_cache_set,
3453 			"do not update the buildid cache"),
3454 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3455 			&record.no_buildid_set,
3456 			"do not collect buildids in perf.data"),
3457 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3458 		     "monitor event in cgroup name only",
3459 		     parse_cgroups),
3460 	OPT_CALLBACK('D', "delay", &record, "ms",
3461 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3462 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3463 		     record__parse_event_enable_time),
3464 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3465 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3466 		   "user to profile"),
3467 
3468 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3469 		     "branch any", "sample any taken branches",
3470 		     parse_branch_stack),
3471 
3472 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3473 		     "branch filter mask", "branch stack filter modes",
3474 		     parse_branch_stack),
3475 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3476 		    "sample by weight (on special events only)"),
3477 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3478 		    "sample transaction flags (special events only)"),
3479 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3480 		    "use per-thread mmaps"),
3481 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3482 		    "sample selected machine registers on interrupt,"
3483 		    " use '-I?' to list register names", parse_intr_regs),
3484 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3485 		    "sample selected machine registers on interrupt,"
3486 		    " use '--user-regs=?' to list register names", parse_user_regs),
3487 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3488 		    "Record running/enabled time of read (:S) events"),
3489 	OPT_CALLBACK('k', "clockid", &record.opts,
3490 	"clockid", "clockid to use for events, see clock_gettime()",
3491 	parse_clockid),
3492 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3493 			  "opts", "AUX area tracing Snapshot Mode", ""),
3494 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3495 			  "opts", "sample AUX area", ""),
3496 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3497 			"per thread proc mmap processing timeout in ms"),
3498 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3499 		    "Record namespaces events"),
3500 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3501 		    "Record cgroup events"),
3502 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3503 			&record.opts.record_switch_events_set,
3504 			"Record context switch events"),
3505 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3506 			 "Configure all used events to run in kernel space.",
3507 			 PARSE_OPT_EXCLUSIVE),
3508 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3509 			 "Configure all used events to run in user space.",
3510 			 PARSE_OPT_EXCLUSIVE),
3511 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3512 		    "collect kernel callchains"),
3513 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3514 		    "collect user callchains"),
3515 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3516 		   "file", "vmlinux pathname"),
3517 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3518 		    "Record build-id of all DSOs regardless of hits"),
3519 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3520 		    "Record build-id in map events"),
3521 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3522 		    "append timestamp to output filename"),
3523 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3524 		    "Record timestamp boundary (time of first/last samples)"),
3525 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3526 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3527 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3528 			  "signal"),
3529 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3530 			 &record.switch_output_event_set, "switch output event",
3531 			 "switch output event selector. use 'perf list' to list available events",
3532 			 parse_events_option_new_evlist),
3533 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3534 		   "Limit number of switch output generated files"),
3535 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3536 		    "Parse options then exit"),
3537 #ifdef HAVE_AIO_SUPPORT
3538 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3539 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3540 		     record__aio_parse),
3541 #endif
3542 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3543 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3544 		     record__parse_affinity),
3545 #ifdef HAVE_ZSTD_SUPPORT
3546 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3547 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3548 			    record__parse_comp_level),
3549 #endif
3550 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3551 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3552 	OPT_UINTEGER(0, "num-thread-synthesize",
3553 		     &record.opts.nr_threads_synthesize,
3554 		     "number of threads to run for event synthesis"),
3555 #ifdef HAVE_LIBPFM
3556 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3557 		"libpfm4 event selector. use 'perf list' to list available events",
3558 		parse_libpfm_events_option),
3559 #endif
3560 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3561 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3562 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3563 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3564 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3565 		      parse_control_option),
3566 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3567 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3568 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3569 			  &record.debuginfod.set, "debuginfod urls",
3570 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3571 			  "system"),
3572 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3573 			    "write collected trace data into several data files using parallel threads",
3574 			    record__parse_threads),
3575 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3576 	OPT_END()
3577 };
3578 
3579 struct option *record_options = __record_options;
3580 
3581 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3582 {
3583 	struct perf_cpu cpu;
3584 	int idx;
3585 
3586 	if (cpu_map__is_dummy(cpus))
3587 		return 0;
3588 
3589 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3590 		/* Return ENODEV is input cpu is greater than max cpu */
3591 		if ((unsigned long)cpu.cpu > mask->nbits)
3592 			return -ENODEV;
3593 		__set_bit(cpu.cpu, mask->bits);
3594 	}
3595 
3596 	return 0;
3597 }
3598 
3599 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3600 {
3601 	struct perf_cpu_map *cpus;
3602 
3603 	cpus = perf_cpu_map__new(mask_spec);
3604 	if (!cpus)
3605 		return -ENOMEM;
3606 
3607 	bitmap_zero(mask->bits, mask->nbits);
3608 	if (record__mmap_cpu_mask_init(mask, cpus))
3609 		return -ENODEV;
3610 
3611 	perf_cpu_map__put(cpus);
3612 
3613 	return 0;
3614 }
3615 
3616 static void record__free_thread_masks(struct record *rec, int nr_threads)
3617 {
3618 	int t;
3619 
3620 	if (rec->thread_masks)
3621 		for (t = 0; t < nr_threads; t++)
3622 			record__thread_mask_free(&rec->thread_masks[t]);
3623 
3624 	zfree(&rec->thread_masks);
3625 }
3626 
3627 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3628 {
3629 	int t, ret;
3630 
3631 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3632 	if (!rec->thread_masks) {
3633 		pr_err("Failed to allocate thread masks\n");
3634 		return -ENOMEM;
3635 	}
3636 
3637 	for (t = 0; t < nr_threads; t++) {
3638 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3639 		if (ret) {
3640 			pr_err("Failed to allocate thread masks[%d]\n", t);
3641 			goto out_free;
3642 		}
3643 	}
3644 
3645 	return 0;
3646 
3647 out_free:
3648 	record__free_thread_masks(rec, nr_threads);
3649 
3650 	return ret;
3651 }
3652 
3653 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3654 {
3655 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3656 
3657 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3658 	if (ret)
3659 		return ret;
3660 
3661 	rec->nr_threads = nr_cpus;
3662 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3663 
3664 	for (t = 0; t < rec->nr_threads; t++) {
3665 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3666 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3667 		if (verbose > 0) {
3668 			pr_debug("thread_masks[%d]: ", t);
3669 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3670 			pr_debug("thread_masks[%d]: ", t);
3671 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3672 		}
3673 	}
3674 
3675 	return 0;
3676 }
3677 
3678 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3679 					  const char **maps_spec, const char **affinity_spec,
3680 					  u32 nr_spec)
3681 {
3682 	u32 s;
3683 	int ret = 0, t = 0;
3684 	struct mmap_cpu_mask cpus_mask;
3685 	struct thread_mask thread_mask, full_mask, *thread_masks;
3686 
3687 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3688 	if (ret) {
3689 		pr_err("Failed to allocate CPUs mask\n");
3690 		return ret;
3691 	}
3692 
3693 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3694 	if (ret) {
3695 		pr_err("Failed to init cpu mask\n");
3696 		goto out_free_cpu_mask;
3697 	}
3698 
3699 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3700 	if (ret) {
3701 		pr_err("Failed to allocate full mask\n");
3702 		goto out_free_cpu_mask;
3703 	}
3704 
3705 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3706 	if (ret) {
3707 		pr_err("Failed to allocate thread mask\n");
3708 		goto out_free_full_and_cpu_masks;
3709 	}
3710 
3711 	for (s = 0; s < nr_spec; s++) {
3712 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3713 		if (ret) {
3714 			pr_err("Failed to initialize maps thread mask\n");
3715 			goto out_free;
3716 		}
3717 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3718 		if (ret) {
3719 			pr_err("Failed to initialize affinity thread mask\n");
3720 			goto out_free;
3721 		}
3722 
3723 		/* ignore invalid CPUs but do not allow empty masks */
3724 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3725 				cpus_mask.bits, thread_mask.maps.nbits)) {
3726 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3727 			ret = -EINVAL;
3728 			goto out_free;
3729 		}
3730 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3731 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3732 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3733 			ret = -EINVAL;
3734 			goto out_free;
3735 		}
3736 
3737 		/* do not allow intersection with other masks (full_mask) */
3738 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3739 				      thread_mask.maps.nbits)) {
3740 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3741 			ret = -EINVAL;
3742 			goto out_free;
3743 		}
3744 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3745 				      thread_mask.affinity.nbits)) {
3746 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3747 			ret = -EINVAL;
3748 			goto out_free;
3749 		}
3750 
3751 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3752 			  thread_mask.maps.bits, full_mask.maps.nbits);
3753 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3754 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3755 
3756 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3757 		if (!thread_masks) {
3758 			pr_err("Failed to reallocate thread masks\n");
3759 			ret = -ENOMEM;
3760 			goto out_free;
3761 		}
3762 		rec->thread_masks = thread_masks;
3763 		rec->thread_masks[t] = thread_mask;
3764 		if (verbose > 0) {
3765 			pr_debug("thread_masks[%d]: ", t);
3766 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3767 			pr_debug("thread_masks[%d]: ", t);
3768 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3769 		}
3770 		t++;
3771 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3772 		if (ret) {
3773 			pr_err("Failed to allocate thread mask\n");
3774 			goto out_free_full_and_cpu_masks;
3775 		}
3776 	}
3777 	rec->nr_threads = t;
3778 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3779 	if (!rec->nr_threads)
3780 		ret = -EINVAL;
3781 
3782 out_free:
3783 	record__thread_mask_free(&thread_mask);
3784 out_free_full_and_cpu_masks:
3785 	record__thread_mask_free(&full_mask);
3786 out_free_cpu_mask:
3787 	record__mmap_cpu_mask_free(&cpus_mask);
3788 
3789 	return ret;
3790 }
3791 
3792 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3793 {
3794 	int ret;
3795 	struct cpu_topology *topo;
3796 
3797 	topo = cpu_topology__new();
3798 	if (!topo) {
3799 		pr_err("Failed to allocate CPU topology\n");
3800 		return -ENOMEM;
3801 	}
3802 
3803 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3804 					     topo->core_cpus_list, topo->core_cpus_lists);
3805 	cpu_topology__delete(topo);
3806 
3807 	return ret;
3808 }
3809 
3810 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3811 {
3812 	int ret;
3813 	struct cpu_topology *topo;
3814 
3815 	topo = cpu_topology__new();
3816 	if (!topo) {
3817 		pr_err("Failed to allocate CPU topology\n");
3818 		return -ENOMEM;
3819 	}
3820 
3821 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3822 					     topo->package_cpus_list, topo->package_cpus_lists);
3823 	cpu_topology__delete(topo);
3824 
3825 	return ret;
3826 }
3827 
3828 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3829 {
3830 	u32 s;
3831 	int ret;
3832 	const char **spec;
3833 	struct numa_topology *topo;
3834 
3835 	topo = numa_topology__new();
3836 	if (!topo) {
3837 		pr_err("Failed to allocate NUMA topology\n");
3838 		return -ENOMEM;
3839 	}
3840 
3841 	spec = zalloc(topo->nr * sizeof(char *));
3842 	if (!spec) {
3843 		pr_err("Failed to allocate NUMA spec\n");
3844 		ret = -ENOMEM;
3845 		goto out_delete_topo;
3846 	}
3847 	for (s = 0; s < topo->nr; s++)
3848 		spec[s] = topo->nodes[s].cpus;
3849 
3850 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3851 
3852 	zfree(&spec);
3853 
3854 out_delete_topo:
3855 	numa_topology__delete(topo);
3856 
3857 	return ret;
3858 }
3859 
3860 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3861 {
3862 	int t, ret;
3863 	u32 s, nr_spec = 0;
3864 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3865 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3866 
3867 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3868 		spec = strtok_r(user_spec, ":", &spec_ptr);
3869 		if (spec == NULL)
3870 			break;
3871 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3872 		mask = strtok_r(spec, "/", &mask_ptr);
3873 		if (mask == NULL)
3874 			break;
3875 		pr_debug2("  maps mask: %s\n", mask);
3876 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3877 		if (!tmp_spec) {
3878 			pr_err("Failed to reallocate maps spec\n");
3879 			ret = -ENOMEM;
3880 			goto out_free;
3881 		}
3882 		maps_spec = tmp_spec;
3883 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3884 		if (!maps_spec[nr_spec]) {
3885 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3886 			ret = -ENOMEM;
3887 			goto out_free;
3888 		}
3889 		mask = strtok_r(NULL, "/", &mask_ptr);
3890 		if (mask == NULL) {
3891 			pr_err("Invalid thread maps or affinity specs\n");
3892 			ret = -EINVAL;
3893 			goto out_free;
3894 		}
3895 		pr_debug2("  affinity mask: %s\n", mask);
3896 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3897 		if (!tmp_spec) {
3898 			pr_err("Failed to reallocate affinity spec\n");
3899 			ret = -ENOMEM;
3900 			goto out_free;
3901 		}
3902 		affinity_spec = tmp_spec;
3903 		affinity_spec[nr_spec] = strdup(mask);
3904 		if (!affinity_spec[nr_spec]) {
3905 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3906 			ret = -ENOMEM;
3907 			goto out_free;
3908 		}
3909 		dup_mask = NULL;
3910 		nr_spec++;
3911 	}
3912 
3913 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3914 					     (const char **)affinity_spec, nr_spec);
3915 
3916 out_free:
3917 	free(dup_mask);
3918 	for (s = 0; s < nr_spec; s++) {
3919 		if (maps_spec)
3920 			free(maps_spec[s]);
3921 		if (affinity_spec)
3922 			free(affinity_spec[s]);
3923 	}
3924 	free(affinity_spec);
3925 	free(maps_spec);
3926 
3927 	return ret;
3928 }
3929 
3930 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3931 {
3932 	int ret;
3933 
3934 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3935 	if (ret)
3936 		return ret;
3937 
3938 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3939 		return -ENODEV;
3940 
3941 	rec->nr_threads = 1;
3942 
3943 	return 0;
3944 }
3945 
3946 static int record__init_thread_masks(struct record *rec)
3947 {
3948 	int ret = 0;
3949 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3950 
3951 	if (!record__threads_enabled(rec))
3952 		return record__init_thread_default_masks(rec, cpus);
3953 
3954 	if (evlist__per_thread(rec->evlist)) {
3955 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3956 		return -EINVAL;
3957 	}
3958 
3959 	switch (rec->opts.threads_spec) {
3960 	case THREAD_SPEC__CPU:
3961 		ret = record__init_thread_cpu_masks(rec, cpus);
3962 		break;
3963 	case THREAD_SPEC__CORE:
3964 		ret = record__init_thread_core_masks(rec, cpus);
3965 		break;
3966 	case THREAD_SPEC__PACKAGE:
3967 		ret = record__init_thread_package_masks(rec, cpus);
3968 		break;
3969 	case THREAD_SPEC__NUMA:
3970 		ret = record__init_thread_numa_masks(rec, cpus);
3971 		break;
3972 	case THREAD_SPEC__USER:
3973 		ret = record__init_thread_user_masks(rec, cpus);
3974 		break;
3975 	default:
3976 		break;
3977 	}
3978 
3979 	return ret;
3980 }
3981 
3982 int cmd_record(int argc, const char **argv)
3983 {
3984 	int err;
3985 	struct record *rec = &record;
3986 	char errbuf[BUFSIZ];
3987 
3988 	setlocale(LC_ALL, "");
3989 
3990 #ifndef HAVE_BPF_SKEL
3991 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3992 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3993 # undef set_nobuild
3994 #endif
3995 
3996 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3997 	symbol_conf.lazy_load_kernel_maps = true;
3998 	rec->opts.affinity = PERF_AFFINITY_SYS;
3999 
4000 	rec->evlist = evlist__new();
4001 	if (rec->evlist == NULL)
4002 		return -ENOMEM;
4003 
4004 	err = perf_config(perf_record_config, rec);
4005 	if (err)
4006 		return err;
4007 
4008 	argc = parse_options(argc, argv, record_options, record_usage,
4009 			    PARSE_OPT_STOP_AT_NON_OPTION);
4010 	if (quiet)
4011 		perf_quiet_option();
4012 
4013 	err = symbol__validate_sym_arguments();
4014 	if (err)
4015 		return err;
4016 
4017 	perf_debuginfod_setup(&record.debuginfod);
4018 
4019 	/* Make system wide (-a) the default target. */
4020 	if (!argc && target__none(&rec->opts.target))
4021 		rec->opts.target.system_wide = true;
4022 
4023 	if (nr_cgroups && !rec->opts.target.system_wide) {
4024 		usage_with_options_msg(record_usage, record_options,
4025 			"cgroup monitoring only available in system-wide mode");
4026 
4027 	}
4028 
4029 	if (rec->buildid_mmap) {
4030 		if (!perf_can_record_build_id()) {
4031 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4032 			err = -EINVAL;
4033 			goto out_opts;
4034 		}
4035 		pr_debug("Enabling build id in mmap2 events.\n");
4036 		/* Enable mmap build id synthesizing. */
4037 		symbol_conf.buildid_mmap2 = true;
4038 		/* Enable perf_event_attr::build_id bit. */
4039 		rec->opts.build_id = true;
4040 		/* Disable build id cache. */
4041 		rec->no_buildid = true;
4042 	}
4043 
4044 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4045 		pr_err("Kernel has no cgroup sampling support.\n");
4046 		err = -EINVAL;
4047 		goto out_opts;
4048 	}
4049 
4050 	if (rec->opts.kcore)
4051 		rec->opts.text_poke = true;
4052 
4053 	if (rec->opts.kcore || record__threads_enabled(rec))
4054 		rec->data.is_dir = true;
4055 
4056 	if (record__threads_enabled(rec)) {
4057 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4058 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4059 			goto out_opts;
4060 		}
4061 		if (record__aio_enabled(rec)) {
4062 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4063 			goto out_opts;
4064 		}
4065 	}
4066 
4067 	if (rec->opts.comp_level != 0) {
4068 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4069 		rec->no_buildid = true;
4070 	}
4071 
4072 	if (rec->opts.record_switch_events &&
4073 	    !perf_can_record_switch_events()) {
4074 		ui__error("kernel does not support recording context switch events\n");
4075 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4076 		err = -EINVAL;
4077 		goto out_opts;
4078 	}
4079 
4080 	if (switch_output_setup(rec)) {
4081 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4082 		err = -EINVAL;
4083 		goto out_opts;
4084 	}
4085 
4086 	if (rec->switch_output.time) {
4087 		signal(SIGALRM, alarm_sig_handler);
4088 		alarm(rec->switch_output.time);
4089 	}
4090 
4091 	if (rec->switch_output.num_files) {
4092 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4093 						      sizeof(char *));
4094 		if (!rec->switch_output.filenames) {
4095 			err = -EINVAL;
4096 			goto out_opts;
4097 		}
4098 	}
4099 
4100 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4101 		rec->timestamp_filename = false;
4102 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4103 	}
4104 
4105 	/*
4106 	 * Allow aliases to facilitate the lookup of symbols for address
4107 	 * filters. Refer to auxtrace_parse_filters().
4108 	 */
4109 	symbol_conf.allow_aliases = true;
4110 
4111 	symbol__init(NULL);
4112 
4113 	err = record__auxtrace_init(rec);
4114 	if (err)
4115 		goto out;
4116 
4117 	if (dry_run)
4118 		goto out;
4119 
4120 	err = -ENOMEM;
4121 
4122 	if (rec->no_buildid_cache || rec->no_buildid) {
4123 		disable_buildid_cache();
4124 	} else if (rec->switch_output.enabled) {
4125 		/*
4126 		 * In 'perf record --switch-output', disable buildid
4127 		 * generation by default to reduce data file switching
4128 		 * overhead. Still generate buildid if they are required
4129 		 * explicitly using
4130 		 *
4131 		 *  perf record --switch-output --no-no-buildid \
4132 		 *              --no-no-buildid-cache
4133 		 *
4134 		 * Following code equals to:
4135 		 *
4136 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4137 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4138 		 *         disable_buildid_cache();
4139 		 */
4140 		bool disable = true;
4141 
4142 		if (rec->no_buildid_set && !rec->no_buildid)
4143 			disable = false;
4144 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4145 			disable = false;
4146 		if (disable) {
4147 			rec->no_buildid = true;
4148 			rec->no_buildid_cache = true;
4149 			disable_buildid_cache();
4150 		}
4151 	}
4152 
4153 	if (record.opts.overwrite)
4154 		record.opts.tail_synthesize = true;
4155 
4156 	if (rec->evlist->core.nr_entries == 0) {
4157 		bool can_profile_kernel = perf_event_paranoid_check(1);
4158 
4159 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4160 		if (err)
4161 			goto out;
4162 	}
4163 
4164 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4165 		rec->opts.no_inherit = true;
4166 
4167 	err = target__validate(&rec->opts.target);
4168 	if (err) {
4169 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4170 		ui__warning("%s\n", errbuf);
4171 	}
4172 
4173 	err = target__parse_uid(&rec->opts.target);
4174 	if (err) {
4175 		int saved_errno = errno;
4176 
4177 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4178 		ui__error("%s", errbuf);
4179 
4180 		err = -saved_errno;
4181 		goto out;
4182 	}
4183 
4184 	/* Enable ignoring missing threads when -u/-p option is defined. */
4185 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4186 
4187 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4188 
4189 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4190 		arch__add_leaf_frame_record_opts(&rec->opts);
4191 
4192 	err = -ENOMEM;
4193 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4194 		if (rec->opts.target.pid != NULL) {
4195 			pr_err("Couldn't create thread/CPU maps: %s\n",
4196 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4197 			goto out;
4198 		}
4199 		else
4200 			usage_with_options(record_usage, record_options);
4201 	}
4202 
4203 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4204 	if (err)
4205 		goto out;
4206 
4207 	/*
4208 	 * We take all buildids when the file contains
4209 	 * AUX area tracing data because we do not decode the
4210 	 * trace because it would take too long.
4211 	 */
4212 	if (rec->opts.full_auxtrace)
4213 		rec->buildid_all = true;
4214 
4215 	if (rec->opts.text_poke) {
4216 		err = record__config_text_poke(rec->evlist);
4217 		if (err) {
4218 			pr_err("record__config_text_poke failed, error %d\n", err);
4219 			goto out;
4220 		}
4221 	}
4222 
4223 	if (rec->off_cpu) {
4224 		err = record__config_off_cpu(rec);
4225 		if (err) {
4226 			pr_err("record__config_off_cpu failed, error %d\n", err);
4227 			goto out;
4228 		}
4229 	}
4230 
4231 	if (record_opts__config(&rec->opts)) {
4232 		err = -EINVAL;
4233 		goto out;
4234 	}
4235 
4236 	err = record__config_tracking_events(rec);
4237 	if (err) {
4238 		pr_err("record__config_tracking_events failed, error %d\n", err);
4239 		goto out;
4240 	}
4241 
4242 	err = record__init_thread_masks(rec);
4243 	if (err) {
4244 		pr_err("Failed to initialize parallel data streaming masks\n");
4245 		goto out;
4246 	}
4247 
4248 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4249 		rec->opts.nr_cblocks = nr_cblocks_max;
4250 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4251 
4252 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4253 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4254 
4255 	if (rec->opts.comp_level > comp_level_max)
4256 		rec->opts.comp_level = comp_level_max;
4257 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4258 
4259 	err = __cmd_record(&record, argc, argv);
4260 out:
4261 	evlist__delete(rec->evlist);
4262 	symbol__exit();
4263 	auxtrace_record__free(rec->itr);
4264 out_opts:
4265 	record__free_thread_masks(rec, rec->nr_threads);
4266 	rec->nr_threads = 0;
4267 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4268 	return err;
4269 }
4270 
4271 static void snapshot_sig_handler(int sig __maybe_unused)
4272 {
4273 	struct record *rec = &record;
4274 
4275 	hit_auxtrace_snapshot_trigger(rec);
4276 
4277 	if (switch_output_signal(rec))
4278 		trigger_hit(&switch_output_trigger);
4279 }
4280 
4281 static void alarm_sig_handler(int sig __maybe_unused)
4282 {
4283 	struct record *rec = &record;
4284 
4285 	if (switch_output_time(rec))
4286 		trigger_hit(&switch_output_trigger);
4287 }
4288